Mishmosh committed on
Commit
945165c
·
1 Parent(s): 19e94a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -218
app.py CHANGED
@@ -29,221 +29,3 @@ from PIL import Image
29
  from pdf2image import convert_from_path
30
  import pytesseract
31
  import os
32
- def text_extraction(element):
33
- # Extracting the text from the in-line text element
34
- line_text = element.get_text()
35
-
36
- # Find the formats of the text
37
- # Initialize the list with all the formats that appeared in the line of text
38
- line_formats = []
39
- for text_line in element:
40
- if isinstance(text_line, LTTextContainer):
41
- # Iterating through each character in the line of text
42
- for character in text_line:
43
- if isinstance(character, LTChar):
44
- # Append the font name of the character
45
- line_formats.append(character.fontname)
46
- # Append the font size of the character
47
- line_formats.append(character.size)
48
- # Find the unique font sizes and names in the line
49
- format_per_line = list(set(line_formats))
50
-
51
- # Return a tuple with the text in each line along with its format
52
- return (line_text, format_per_line)
53
- # @title
54
- # Create a function to crop the image elements from PDFs
55
- def crop_image(element, pageObj):
56
- # Get the coordinates to crop the image from the PDF
57
- [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1]
58
- # Crop the page using coordinates (left, bottom, right, top)
59
- pageObj.mediabox.lower_left = (image_left, image_bottom)
60
- pageObj.mediabox.upper_right = (image_right, image_top)
61
- # Save the cropped page to a new PDF
62
- cropped_pdf_writer = PyPDF2.PdfWriter()
63
- cropped_pdf_writer.add_page(pageObj)
64
- # Save the cropped PDF to a new file
65
- with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
66
- cropped_pdf_writer.write(cropped_pdf_file)
67
-
68
- # Create a function to convert the PDF to images
69
- def convert_to_images(input_file,):
70
- images = convert_from_path(input_file)
71
- image = images[0]
72
- output_file = "PDF_image.png"
73
- image.save(output_file, "PNG")
74
-
75
- # Create a function to read text from images
76
- def image_to_text(image_path):
77
- # Read the image
78
- img = Image.open(image_path)
79
- # Extract the text from the image
80
- text = pytesseract.image_to_string(img)
81
- return text
82
- # @title
83
- # Extracting tables from the page
84
-
85
- def extract_table(pdf_path, page_num, table_num):
86
- # Open the pdf file
87
- pdf = pdfplumber.open(pdf_path)
88
- # Find the examined page
89
- table_page = pdf.pages[page_num]
90
- # Extract the appropriate table
91
- table = table_page.extract_tables()[table_num]
92
- return table
93
-
94
- # Convert table into the appropriate format
95
- def table_converter(table):
96
- table_string = ''
97
- # Iterate through each row of the table
98
- for row_num in range(len(table)):
99
- row = table[row_num]
100
- # Remove the line breaker from the wrapped texts
101
- cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
102
- # Convert the table into a string
103
- table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
104
- # Removing the last line break
105
- table_string = table_string[:-1]
106
- return table_string
107
- # @title
108
- def read_pdf(pdf_path):
109
- # create a PDF file object
110
- pdfFileObj = open(pdf_path, 'rb')
111
- # create a PDF reader object
112
- #pdfReaded = PyPDF2.PdfReader(pdfFileObj) #coded out as suggested by chatgpt
113
- pdfReaded = PyPDF2.PdfFileReader(pdfFileObj)
114
-
115
- # Create the dictionary to extract text from each image
116
- text_per_page = {}
117
- # We extract the pages from the PDF
118
- for pagenum, page in enumerate(extract_pages(pdf_path)):
119
- print("Elaborating Page_" +str(pagenum))
120
- # Initialize the variables needed for the text extraction from the page
121
- pageObj = pdfReaded.pages[pagenum]
122
- page_text = []
123
- line_format = []
124
- text_from_images = []
125
- text_from_tables = []
126
- page_content = []
127
- # Initialize the number of the examined tables
128
- table_num = 0
129
- first_element= True
130
- table_extraction_flag= False
131
- # Open the pdf file
132
- pdf = pdfplumber.open(pdf_path)
133
- # Find the examined page
134
- page_tables = pdf.pages[pagenum]
135
- # Find the number of tables on the page
136
- tables = page_tables.find_tables()
137
-
138
-
139
- # Find all the elements
140
- page_elements = [(element.y1, element) for element in page._objs]
141
- # Sort all the elements as they appear in the page
142
- page_elements.sort(key=lambda a: a[0], reverse=True)
143
-
144
- # Find the elements that composed a page
145
- for i,component in enumerate(page_elements):
146
- # Extract the position of the top side of the element in the PDF
147
- pos= component[0]
148
- # Extract the element of the page layout
149
- element = component[1]
150
-
151
- # Check if the element is a text element
152
- if isinstance(element, LTTextContainer):
153
- # Check if the text appeared in a table
154
- if table_extraction_flag == False:
155
- # Use the function to extract the text and format for each text element
156
- (line_text, format_per_line) = text_extraction(element)
157
- # Append the text of each line to the page text
158
- page_text.append(line_text)
159
- # Append the format for each line containing text
160
- line_format.append(format_per_line)
161
- page_content.append(line_text)
162
- else:
163
- # Omit the text that appeared in a table
164
- pass
165
-
166
- # Check the elements for images
167
- if isinstance(element, LTFigure):
168
- # Crop the image from the PDF
169
- crop_image(element, pageObj)
170
- # Convert the cropped pdf to an image
171
- convert_to_images('cropped_image.pdf')
172
- # Extract the text from the image
173
- image_text = image_to_text('PDF_image.png')
174
- text_from_images.append(image_text)
175
- page_content.append(image_text)
176
- # Add a placeholder in the text and format lists
177
- page_text.append('image')
178
- line_format.append('image')
179
-
180
- # Check the elements for tables
181
- if isinstance(element, LTRect):
182
- # If the first rectangular element
183
- if first_element == True and (table_num+1) <= len(tables):
184
- # Find the bounding box of the table
185
- lower_side = page.bbox[3] - tables[table_num].bbox[3]
186
- upper_side = element.y1
187
- # Extract the information from the table
188
- table = extract_table(pdf_path, pagenum, table_num)
189
- # Convert the table information in structured string format
190
- table_string = table_converter(table)
191
- # Append the table string into a list
192
- text_from_tables.append(table_string)
193
- page_content.append(table_string)
194
- # Set the flag as True to avoid the content again
195
- table_extraction_flag = True
196
- # Make it another element
197
- first_element = False
198
- # Add a placeholder in the text and format lists
199
- page_text.append('table')
200
- line_format.append('table')
201
-
202
- # Check if we already extracted the tables from the page
203
- if element.y0 >= lower_side and element.y1 <= upper_side:
204
- pass
205
- elif not isinstance(page_elements[i+1][1], LTRect):
206
- table_extraction_flag = False
207
- first_element = True
208
- table_num+=1
209
-
210
-
211
- # Create the key of the dictionary
212
- dctkey = 'Page_'+str(pagenum)
213
- # Add the list of list as the value of the page key
214
- text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
215
-
216
- # Closing the pdf file object
217
- pdfFileObj.close()
218
-
219
- # Deleting the additional files created
220
- #os.remove('cropped_image.pdf')
221
- #os.remove('PDF_image.png')
222
- return text_per_page
223
-
224
- #google drive
225
- #from google.colab import drive
226
- #drive.mount('/content/drive')
227
- #read PDF
228
-
229
- pdf_path = 'test.pdf' #article 11
230
- #pdf_path = 'https://huggingface.co/spaces/Mishmosh/MichelleAssessment3/blob/main/test.pdf' #article 11
231
-
232
- text_per_page = read_pdf(pdf_path)
233
-
234
- # This section finds the abstract. My plan was to find the end of the abstract by identifying the same font size as the text 'abstract', but it was too late
235
- #to try this here since the formatting of the text has already been removed.
236
- # Instead I extracted just one paragraph. If an abstract is more than 1 paragraph this will not extract the entire abstract
237
- abstract_from_pdf='' # define empty variable that will hold the text from the abstract
238
- found_abstract=False # has the abstract been found
239
- for key in text_per_page.keys(): # go through keys in dictionary
240
- current_item=text_per_page[key] #current key
241
- for paragraphs in current_item: #go through each item
242
- for index,paragraph in enumerate(paragraphs): #go through each line
243
- if 'Abstract\n' == paragraph: #does line match paragraph
244
- found_abstract=True #word abstract has been found
245
- abstract_from_pdf=paragraphs[index+1] #get next paragraph
246
- if found_abstract: #if abstract found
247
- break
248
- print(abstract_from_pdf)
249
-
 
29
  from pdf2image import convert_from_path
30
  import pytesseract
31
  import os