Spaces:
Runtime error
Runtime error
File size: 5,043 Bytes
1d37909 9bedcbe 96f46f4 9bedcbe 96f46f4 9bedcbe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
#!pip install gradio
import gradio as gr
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF in reading order.

    Each page yielded by ``extract_pages`` is walked from top to bottom
    (largest ``y1`` first) and every text container is passed to
    ``text_extraction``.

    NOTE(review): ``extract_pages``, ``LTTextContainer`` and
    ``text_extraction`` must be available at module level; they are not
    imported or defined in the part of the file visible here.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict mapping ``'Page_<n>'`` (``n`` starting at 0) to a five-item
        list ``[page_text, line_format, text_from_images,
        text_from_tables, page_content]``.  The image and table slots are
        always empty lists: this function performs no image or table
        extraction, the slots are kept so the result layout stays stable
        for existing callers.
    """
    text_per_page = {}

    for pagenum, page in enumerate(extract_pages(pdf_path)):
        page_text = []
        line_format = []
        page_content = []

        # Order the layout elements as they appear on the page: the element
        # whose top edge (y1) is highest comes first.  The sort is stable,
        # so elements sharing the same y1 keep their layout order.
        ordered_elements = sorted(page, key=lambda el: el.y1, reverse=True)

        for element in ordered_elements:
            # Only text containers contribute; other layout objects
            # (figures, lines, ...) are skipped.
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        text_per_page[f'Page_{pagenum}'] = [
            page_text,
            line_format,
            [],  # text_from_images: never populated by this function
            [],  # text_from_tables: never populated by this function
            page_content,
        ]

    return text_per_page
# Path of the PDF to process (hard-coded absolute path).
pdf_path = '/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf'
# Run the extraction; read_pdf returns a dict keyed by 'Page_<n>'.
text_per_page = read_pdf(pdf_path)
# Keep the entry of the first page (key 'Page_0') for the steps below.
Page_0 = text_per_page['Page_0']
def nested_list_to_string(nested_list):
    """Concatenate every string found in an arbitrarily nested list.

    Nested ``list`` elements are descended into depth-first, in order.
    Elements that are neither ``list`` nor ``str`` (numbers, ``None``,
    tuples, ...) are ignored.

    Args:
        nested_list: Iterable whose items are strings, lists, or other
            values to be skipped.

    Returns:
        A single ``str`` made of all strings in encounter order, or ``''``
        when there are none.
    """
    def _iter_strings(items):
        # Yield strings lazily so the result is assembled by one join
        # instead of repeated ``+=`` concatenation.
        for item in items:
            if isinstance(item, list):
                yield from _iter_strings(item)
            elif isinstance(item, str):
                yield item

    return ''.join(_iter_strings(nested_list))
# Look up the first page's entry again (same lookup as done right after read_pdf).
Page_0 = text_per_page['Page_0']
# Flatten every string contained in the page's nested lists into one string.
string_result = nested_list_to_string(Page_0)
def extract_abstract(page_0):
    """Return the text between the 'Abstract' and 'Introduction' headings.

    The nested page structure is flattened into one string, then the text
    following the first occurrence of ``'Abstract'`` up to the next
    occurrence of ``'Introduction'`` is returned with surrounding
    whitespace stripped.

    The search for ``'Introduction'`` starts *after* the abstract heading,
    so an earlier occurrence of the word (for instance in the document
    title) does not truncate the result to an empty string.

    Args:
        page_0: Nested list of strings (non-list, non-string items are
            ignored).

    Returns:
        The abstract text, or the message
        ``"Abstract or Introduction section not found."`` when either
        marker is missing.
    """
    not_found = "Abstract or Introduction section not found."
    marker = 'Abstract'

    def _iter_strings(items):
        # Depth-first walk yielding only the string leaves.
        for item in items:
            if isinstance(item, list):
                yield from _iter_strings(item)
            elif isinstance(item, str):
                yield item

    # Convert the nested list into a single string
    full_text = ''.join(_iter_strings(page_0))

    start_index = full_text.find(marker)
    if start_index == -1:
        return not_found

    # Skip the heading word itself, then look for the end marker only in
    # the text that follows it.
    body_start = start_index + len(marker)
    end_index = full_text.find('Introduction', body_start)
    if end_index == -1:
        return not_found

    return full_text[body_start:end_index].strip()
# Example usage
# NOTE(review): `summarizer` and `user_greeting` are not defined or imported
# in the part of the file visible here; both must exist at module level
# before this section runs.
Page_0 = text_per_page['Page_0']
abstract_text = extract_abstract(Page_0)
wall_of_text = abstract_text
result = summarizer(
    wall_of_text,
    min_length=1,
    max_length=30,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    repetition_penalty=3.5,
    num_beams=4,
    early_stopping=True,
)
# Access the first element of the list (which is the dictionary) and then the value of 'summary_text'
summary_string = result[0]['summary_text']
print(summary_string)
# The module is imported as `gr` (the original `gra` raised NameError).
# `inputs`/`outputs` take component objects or component shortcut names,
# not the text to display, so the generic "text" shortcut is used.
app = gr.Interface(fn=user_greeting, inputs="text", outputs="text")
app.launch()
|