Mishmosh commited on
Commit
257f459
·
1 Parent(s): 28d7f8c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +330 -0
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# https://huggingface.co/spaces/Mishmosh/MichelleAssessment3
# NOTE(review): the '!pip' / '!apt' lines below are IPython/Colab shell magics.
# They are NOT valid Python in a plain app.py and will raise a SyntaxError when
# this file is executed outside a notebook — for a HF Space they belong in
# requirements.txt / packages.txt instead.
!pip install PyPDF2
!pip install sentencepiece

!pip install pdfminer.six
!pip install pdfplumber
!pip install pdf2image
!pip install Pillow
!pip install pytesseract
!apt-get install poppler-utils
!apt install tesseract-ocr
!apt install libtesseract-dev
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR to extract text from images
import pytesseract
# To remove the additional created files
import os

# @title
# Create a function to extract text

def text_extraction(element):
    """Return the text of a line-level pdfminer element plus its formats.

    The second item of the returned tuple is the de-duplicated list of font
    names and font sizes seen among the element's characters (order is not
    guaranteed, as in the original set()-based implementation).
    """
    # Full text content of this in-line element
    line_text = element.get_text()

    # Collect the font name and size of every character in the element
    seen_formats = []
    for sub_line in element:
        if not isinstance(sub_line, LTTextContainer):
            continue
        for ch in sub_line:
            if isinstance(ch, LTChar):
                seen_formats.append(ch.fontname)
                seen_formats.append(ch.size)

    # De-duplicate the collected formats
    format_per_line = list(set(seen_formats))

    return (line_text, format_per_line)
51
+
52
# @title
# Crop a single image element out of a PDF page
def crop_image(element, pageObj):
    """Shrink *pageObj*'s mediabox to *element*'s bounding box and write the
    cropped page to 'cropped_image.pdf'.

    NOTE(review): this mutates pageObj in place, and it assigns element.y1 to
    the lower-left corner and element.y0 to the upper-right corner (in pdfminer,
    y0 is the bottom). Kept byte-for-byte from the original — PyPDF2 appears to
    tolerate the inverted rectangle, but confirm before changing.
    """
    # Rectangle to crop to, taken from the layout element's coordinates
    pageObj.mediabox.lower_left = (element.x0, element.y1)
    pageObj.mediabox.upper_right = (element.x1, element.y0)

    # Write the cropped page out as a one-page PDF
    writer = PyPDF2.PdfWriter()
    writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        writer.write(cropped_pdf_file)
66
+
67
# Rasterize a PDF; only the first page is kept, saved as PDF_image.png
def convert_to_images(input_file):
    """Convert *input_file* (a PDF path) to a PNG image of its first page."""
    pages = convert_from_path(input_file)
    first_page = pages[0]
    first_page.save("PDF_image.png", "PNG")
73
+
74
# Create a function to read text from images
def image_to_text(image_path):
    """Run OCR over the image at *image_path* and return the recognized text.

    Fix: the PIL image handle is now closed via a context manager instead of
    being leaked (the original never closed the underlying file).
    """
    with Image.open(image_path) as img:
        # Extract the text from the image
        return pytesseract.image_to_string(img)
81
+
82
# @title
# Extracting tables from the page
def extract_table(pdf_path, page_num, table_num):
    """Return table *table_num* (0-based) extracted from page *page_num* of
    the PDF at *pdf_path*, as pdfplumber's list-of-rows structure.

    Fix: the pdfplumber document is now closed via a context manager — the
    original opened the PDF and never closed it (file-handle leak when called
    once per table, as read_pdf does).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Find the examined page
        table_page = pdf.pages[page_num]
        # Extract the appropriate table
        return table_page.extract_tables()[table_num]
93
+
94
# Convert an extracted table into a pipe-delimited string, one row per line
def table_converter(table):
    """Render *table* (a list of rows of cells) as text.

    Each row becomes '|cell|cell|...|'; None cells render as the literal
    string 'None' and embedded newlines inside a cell become spaces.
    """
    rendered_rows = []
    for row in table:
        cells = []
        for cell in row:
            if cell is None:
                cells.append('None')
            else:
                # Un-wrap multi-line cell text
                cells.append(cell.replace('\n', ' '))
        rendered_rows.append('|' + '|'.join(cells) + '|')
    # Rows joined without a trailing newline, as in the original
    return '\n'.join(rendered_rows)
107
+
108
# @title
def read_pdf(pdf_path):
    """Extract the full content of a PDF, page by page.

    Returns a dict mapping 'Page_<n>' to five parallel lists:
    [page_text, line_format, text_from_images, text_from_tables, page_content]
    — raw line text, per-line font formats (or the placeholders
    'image'/'table'), OCR'd text from figures, stringified tables, and the
    combined in-reading-order content of the page.
    """
    # create a PDF file object (PyPDF2 is only needed for page cropping)
    pdfFileObj = open(pdf_path, 'rb')
    # create a PDF reader object
    pdfReaded = PyPDF2.PdfReader(pdfFileObj)

    # Create the dictionary to collect the extracted content of each page
    text_per_page = {}
    # Walk the pdfminer layout of every page
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        print("Elaborating Page_" +str(pagenum))
        # PyPDF2 page object for this page (passed to crop_image)
        pageObj = pdfReaded.pages[pagenum]
        # Per-page accumulators
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Index of the table currently being examined on this page
        table_num = 0
        first_element = True
        table_extraction_flag = False
        # NOTE(review): a fresh pdfplumber handle is opened on EVERY page and
        # never closed — file-handle leak; open once before the loop (or use a
        # context manager) when refactoring.
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        # Tables detected by pdfplumber on this page
        tables = page_tables.find_tables()

        # Pair every layout element with its top y-coordinate...
        page_elements = [(element.y1, element) for element in page._objs]
        # ...and sort top-to-bottom so elements are visited in reading order
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Classify and process every element composing the page
        for i, component in enumerate(page_elements):
            # Top-side position of the element (currently unused below)
            pos = component[0]
            # The layout element itself
            element = component[1]

            # --- Text elements ---
            if isinstance(element, LTTextContainer):
                # Skip text that lies inside the table being extracted
                if table_extraction_flag == False:
                    # Extract the text and per-character formats of the line
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)
                else:
                    # Omit the text that appeared in a table — it was already
                    # captured via extract_table/table_converter
                    pass

            # --- Figures: crop out of the page, rasterize, then OCR ---
            if isinstance(element, LTFigure):
                crop_image(element, pageObj)
                convert_to_images('cropped_image.pdf')
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Placeholders keep page_text/line_format index-aligned
                page_text.append('image')
                line_format.append('image')

            # --- Rectangles: interpreted as table borders ---
            if isinstance(element, LTRect):
                # First rectangle of a not-yet-extracted table
                if first_element == True and (table_num+1) <= len(tables):
                    # Vertical extent of the current table; the subtraction
                    # presumably converts pdfplumber's top-down bbox into
                    # pdfminer's bottom-up coordinates — TODO confirm
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1
                    # Extract and stringify the table's contents
                    table = extract_table(pdf_path, pagenum, table_num)
                    table_string = table_converter(table)
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    # Suppress duplicate text extraction while inside the table
                    table_extraction_flag = True
                    first_element = False
                    # Placeholders keep page_text/line_format index-aligned
                    page_text.append('table')
                    line_format.append('table')

                # NOTE(review): lower_side/upper_side are assigned only in the
                # branch above — if the first LTRect arrives when
                # (table_num+1) > len(tables) this raises UnboundLocalError.
                if element.y0 >= lower_side and element.y1 <= upper_side:
                    # Still inside the current table's bounding box
                    pass
                elif not isinstance(page_elements[i+1][1], LTRect):
                    # Past the table: reset the flags and move to the next one.
                    # NOTE(review): page_elements[i+1] raises IndexError when
                    # the page's last element is an LTRect.
                    table_extraction_flag = False
                    first_element = True
                    table_num += 1

        # Store this page's five parallel lists under its key
        dctkey = 'Page_' + str(pagenum)
        text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]

    # Closing the pdf file object
    pdfFileObj.close()

    # Deleting the additional files created (left disabled in the original)
    #os.remove('cropped_image.pdf')
    #os.remove('PDF_image.png')
    return text_per_page
223
+
224
## Access the files from Google Drive
# NOTE(review): google.colab exists only inside Colab; this import/mount fails
# when the file runs as a plain script or in a HF Space.
from google.colab import drive
drive.mount('/content/drive')

### Let's read the PDF

pdf_path = '/content/drive/MyDrive/ArticleHidden.pdf' #article 11

text_per_page = read_pdf(pdf_path)

text_per_page.keys() #check the keys of dictionary

type(text_per_page) #check type to see its a dictionary

# This section finds the abstract. The original plan was to find the end of the
# abstract by matching the font size of the word 'Abstract', but formatting has
# already been discarded at this point, so only the single paragraph following
# the 'Abstract' heading is taken; multi-paragraph abstracts are truncated.
abstract_from_pdf = ''   # will hold the abstract's text
found_abstract = False   # has the 'Abstract' heading been seen yet
for key in text_per_page.keys():          # each page of the dictionary
    current_item = text_per_page[key]     # the page's five parallel lists
    for paragraphs in current_item:       # each of the five lists
        for index, paragraph in enumerate(paragraphs):   # each line/paragraph
            if 'Abstract\n' == paragraph:
                found_abstract = True
                # Take the paragraph right after the heading.
                # NOTE(review): raises IndexError if the heading is the last
                # entry of its list.
                abstract_from_pdf = paragraphs[index+1]
            if found_abstract:
                # NOTE(review): flattened source is ambiguous here — this break
                # leaves only the innermost loop; the outer loops keep
                # scanning. Presumably harmless (found_abstract stays True and
                # short-circuits each inner pass) — confirm intent.
                break
print(abstract_from_pdf)
255
+
256
# --- Summarize the extracted abstract ---
from transformers import pipeline

# Several models were tried (linydub/bart-large-samsum, slauw87/bart_summarisation,
# facebook/bart-large-cnn, google/pegasus-cnn_dailymail); ainize/bart-base-cnn
# gave the best first-pass summary.
summarizer = pipeline("summarization", model="ainize/bart-base-cnn")
summarized_text = summarizer(abstract_from_pdf)
print(summarized_text)

# The pipeline returns a one-element list of dicts; pull out the summary string.
summarized_text_list = summarized_text[0]
print(summarized_text_list)
summarized_text_list_list = summarized_text_list['summary_text']
print(summarized_text_list_list)

# --- Iteratively re-summarize until at most one sentence ('.') remains ---
# No tried model reliably compresses to a single sentence in one pass, so the
# summary is fed back through a summarizer while it is still multi-sentence.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
number_of_sentences = summarized_text_list_list.count('.')
print(number_of_sentences)
while number_of_sentences > 1:
    print(number_of_sentences)
    summarized_text_list_list = summarizer(summarized_text_list_list)[0]['summary_text']
    # FIX: recount the sentences of the NEW summary instead of blindly
    # decrementing a stale counter — the original `number_of_sentences -= 1`
    # ignored how much each pass actually shortened the text, so the loop
    # could run too many or too few times.
    number_of_sentences = summarized_text_list_list.count('.')
print(summarized_text_list_list)
print(number_of_sentences)
301
+
302
#text to speech

# NOTE(review): '!pip' lines are notebook-only shell magics — a plain .py file
# cannot execute them (SyntaxError outside IPython/Colab).
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
import torch
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
# Load the SpeechT5 text-to-speech model and its matching processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
#text = "The future belongs to those who believe in the beauty of their dreams."
#text = (summarized_text_list_list)

# Tokenize the one-sentence summary produced by the summarization section
inputs = processor(text=summarized_text_list_list, return_tensors="pt")
from datasets import load_dataset
# Speaker x-vector embeddings used to select a voice
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

import torch
# Entry 7306 picks one specific speaker's voice
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
from transformers import SpeechT5HifiGan
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
with torch.no_grad():
    # NOTE(review): this result is discarded — `speech` is immediately
    # reassigned below, so this explicit vocoder pass is redundant work.
    speech = vocoder(spectrogram)
# generate_speech with vocoder= returns the final waveform directly
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
Audio(speech, rate=16000)
330
+