vividsd committed on
Commit
b16f8b5
1 Parent(s): 6e412f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -178
app.py CHANGED
@@ -2,206 +2,76 @@
2
 
3
  # I tried to reuse my previous code, with some adaptations so that it works for any PDF that contains an abstract
4
 
5
- import gradio as gr
6
- from transformers import pipeline
7
- from tempfile import NamedTemporaryFile
8
  import PyPDF2
9
- from PyPDF2 import PdfReader
10
- from pdfminer.high_level import extract_pages, extract_text
11
- import pdfplumber
12
- from PIL import Image
13
- from pdf2image import convert_from_path
14
- from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
15
- import pytesseract
16
- import os
17
- import numpy as np
18
- import torch
19
- import sentencepiece
20
- import soundfile as sf
21
- from IPython.display import Audio
22
  from datasets import load_dataset
 
23
  from transformers import SpeechT5HifiGan
 
 
24
 
25
- def read_pdf(pdf_path):
26
- # create a PDF file object
27
- pdfFileObj = open(pdf_path, 'rb')
28
- # create a PDF reader object
29
- pdfReaded = PyPDF2.PdfReader(pdfFileObj)
30
-
31
- # Create the dictionary to extract text from each image
32
- text_per_page = {}
33
- # We extract the pages from the PDF
34
- for pagenum, page in enumerate(extract_pages(pdf_path)):
35
- print("Elaborating Page_" +str(pagenum))
36
- # Initialize the variables needed for the text extraction from the page
37
- pageObj = pdfReaded.pages[pagenum]
38
- page_text = []
39
- line_format = []
40
- text_from_images = []
41
- text_from_tables = []
42
- page_content = []
43
- # Initialize the number of the examined tables
44
- table_num = 0
45
- first_element= True
46
- table_extraction_flag= False
47
- # Open the pdf file
48
- pdf = pdfplumber.open(pdf_path)
49
- # Find the examined page
50
- page_tables = pdf.pages[pagenum]
51
- # Find the number of tables on the page
52
- tables = page_tables.find_tables()
53
-
54
-
55
- # Find all the elements
56
- page_elements = [(element.y1, element) for element in page._objs]
57
- # Sort all the elements as they appear in the page
58
- page_elements.sort(key=lambda a: a[0], reverse=True)
59
-
60
- # Find the elements that composed a page
61
- for i,component in enumerate(page_elements):
62
- # Extract the position of the top side of the element in the PDF
63
- pos= component[0]
64
- # Extract the element of the page layout
65
- element = component[1]
66
-
67
- # Check if the element is a text element
68
- if isinstance(element, LTTextContainer):
69
- # Check if the text appeared in a table
70
- if table_extraction_flag == False:
71
- # Use the function to extract the text and format for each text element
72
- (line_text, format_per_line) = text_extraction(element)
73
- # Append the text of each line to the page text
74
- page_text.append(line_text)
75
- # Append the format for each line containing text
76
- line_format.append(format_per_line)
77
- page_content.append(line_text)
78
- else:
79
- # Omit the text that appeared in a table
80
- pass
81
-
82
- # Check the elements for images
83
- if isinstance(element, LTFigure):
84
- # Crop the image from the PDF
85
- crop_image(element, pageObj)
86
- # Convert the cropped pdf to an image
87
- convert_to_images('cropped_image.pdf')
88
- # Extract the text from the image
89
- image_text = image_to_text('PDF_image.png')
90
- text_from_images.append(image_text)
91
- page_content.append(image_text)
92
- # Add a placeholder in the text and format lists
93
- page_text.append('image')
94
- line_format.append('image')
95
-
96
- # Check the elements for tables
97
- if isinstance(element, LTRect):
98
- # If the first rectangular element
99
- if first_element == True and (table_num+1) <= len(tables):
100
- # Find the bounding box of the table
101
- lower_side = page.bbox[3] - tables[table_num].bbox[3]
102
- upper_side = element.y1
103
- # Extract the information from the table
104
- table = extract_table(pdf_path, pagenum, table_num)
105
- # Convert the table information in structured string format
106
- table_string = table_converter(table)
107
- # Append the table string into a list
108
- text_from_tables.append(table_string)
109
- page_content.append(table_string)
110
- # Set the flag as True to avoid the content again
111
- table_extraction_flag = True
112
- # Make it another element
113
- first_element = False
114
- # Add a placeholder in the text and format lists
115
- page_text.append('table')
116
- line_format.append('table')
117
-
118
- # Check if we already extracted the tables from the page
119
- if element.y0 >= lower_side and element.y1 <= upper_side:
120
- pass
121
- elif not isinstance(page_elements[i+1][1], LTRect):
122
- table_extraction_flag = False
123
- first_element = True
124
- table_num+=1
125
-
126
-
127
- # Create the key of the dictionary
128
- dctkey = 'Page_'+str(pagenum)
129
- # Add the list of list as the value of the page key
130
- text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
131
-
132
- # Closing the pdf file object
133
- pdfFileObj.close()
134
-
135
- return text_per_page
136
-
137
- pdf_path = pdf_file.name
138
-
139
- text_per_page = read_pdf(pdf_path)
140
-
141
- page_0 = text_per_page['Page_0']
142
-
143
- page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)]
144
- for i in range(len(page_0_clean)):
145
- page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip()
146
-
147
- #intead of cleaning the exact position as I did in my previous code, since I don't know it, then I try to identify the section of the abstract
148
 
149
- abstract = 'abstract'
150
- found_abstract = False
151
- intro_string ='introduction'
152
- extracted_abstract =""
153
- extracted_abstract = extracted_text_string.replace("Abstract", "")
154
 
155
- file = text.splitlines()
156
- for lines in file:
157
- lower_lines = lines.lower()
158
- if lower_lines.strip()== abstract:
159
- found_abstract = True
160
- elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
161
- found_abstract = False
162
 
163
- #summarizing the abstract
164
-
165
  from transformers import pipeline
166
  summarizer = pipeline("summarization", model="Falconsai/text_summarization")
167
- text1 = extracted_abstract
168
- print(summarizer(text1, max_length=20, min_length=10, do_sample=False))
169
-
170
- #in here, I try to save it differently, since on my previous code I had copied and pasted the summary and in here I don't know
171
-
172
- sentence = summarized_text[0]['summary_text']
173
-
174
- # generating the audio of the output by using my previous code
175
-
176
-
177
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
178
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
179
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
180
-
181
-
182
- text = sentence
183
- inputs = processor(text=sentence, return_tensors="pt")
184
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
185
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
186
- spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
187
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
188
- with torch.no_grad():
189
  speech = vocoder(spectrogram)
190
 
191
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
192
  Audio(speech, rate=16000)
193
 
194
-
195
  # Creating the Gradio app
196
  input_component = gr.File(file_types=["pdf"])
197
  output_component = gr.Audio()
198
 
199
  demo = gr.Interface(
200
- fn=read_pdf,
201
  inputs=input_component,
202
  outputs=output_component,
203
  title="Reading your abstract summary outloud",
204
- description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract"
205
  )
206
 
207
  demo.launch()
 
2
 
3
  # I tried to reuse my previous code, with some adaptations so that it works for any PDF that contains an abstract
4
 
5
+ #imports
 
 
6
  import PyPDF2
7
+ from transformers import pipeline
8
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
 
 
 
 
 
 
 
 
 
 
 
9
  from datasets import load_dataset
10
+ import torch
11
  from transformers import SpeechT5HifiGan
12
+ from gradio import gr
13
+ import gradio as gr
14
 
15
+ # Now copying my code and adapting it for any PDF
16
def extract_abstract(paper_filename):
    """Return the abstract found on the first page of a PDF.

    The abstract is taken to be the text between the first ``Abstract``
    heading on the first page and the first ``Introduction`` heading that
    comes after it.

    Args:
        paper_filename: Path of the PDF file to read.

    Returns:
        The abstract with surrounding whitespace stripped, or ``""`` when
        the PDF has no pages, the first page yields no text, or either
        heading cannot be found.

    Raises:
        OSError: If the file cannot be opened (for example
            ``FileNotFoundError`` when the path does not exist).
    """
    with open(paper_filename, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        if len(reader.pages) == 0:
            return ""
        # Text must be extracted while the file handle is still open.
        text = reader.pages[0].extract_text() or ""

    abstract_start_index = text.find('Abstract')
    if abstract_start_index == -1:
        return ""  # Abstract section not found

    abstract_body_start = abstract_start_index + len('Abstract')

    # Only look for the Introduction heading *after* the Abstract heading;
    # an earlier occurrence of the word (e.g. in the title) would otherwise
    # produce an empty or meaningless slice.
    introduction_start_index = text.find('Introduction', abstract_body_start)
    if introduction_start_index == -1:
        return ""  # Introduction section not found

    return text[abstract_body_start:introduction_start_index].strip()
 
 
 
 
35
 
36
# Demo run on a sample paper. The path looks like a notebook-local file
# (presumably Colab's /content directory -- TODO confirm), so skip the
# extraction when it is missing instead of failing at import time.
import os

paper_filename = '/content/Article_11'
if os.path.exists(paper_filename):
    abstract_text = extract_abstract(paper_filename)
else:
    abstract_text = ""
print(abstract_text)

from transformers import pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# Run the model a single time and keep the one-sentence result in
# ``summary``; an empty abstract is not sent to the model at all.
if abstract_text:
    output = summarizer(abstract_text, max_length=26, min_length=10, do_sample=False)
    summary = output[0]['summary_text']
else:
    summary = ""
print(summary)
47
+
48
+ # proceeding to the audio function
49
+
50
def audio(text):
    """Synthesize speech for ``text`` with the SpeechT5 text-to-speech model.

    Args:
        text: The sentence to read aloud.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``sample_rate`` is 16000
        and ``waveform`` is a NumPy array. This is a value ``gr.Audio`` can
        render as an output.
    """
    # NOTE: the models and the speaker-embedding dataset are loaded on every
    # call, which is slow; consider loading them once if latency matters.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Tokenize the text that was actually passed in.
    inputs = processor(text=text, return_tensors="pt")

    # Fixed speaker voice taken from the x-vector dataset.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Passing the vocoder makes generate_speech return the waveform directly,
    # so no separate spectrogram -> vocoder step is needed.
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    return 16000, speech.cpu().numpy()
64
 
 
# Creating the Gradio app
def _read_abstract_aloud(pdf_file):
    """Extract, summarize and voice the abstract of an uploaded PDF.

    Args:
        pdf_file: The value Gradio passes for the ``gr.File`` input; either
            a path string or an object exposing the path as ``.name``.

    Returns:
        Whatever ``audio`` returns for the one-sentence summary.

    Raises:
        gr.Error: If no abstract could be located in the PDF.
    """
    pdf_path = getattr(pdf_file, "name", pdf_file)
    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        raise gr.Error("No Abstract section followed by an Introduction was found on the first page.")
    output = summarizer(abstract_text, max_length=26, min_length=10, do_sample=False)
    return audio(output[0]['summary_text'])


# File extensions need a leading dot to be recognized by gr.File.
input_component = gr.File(file_types=[".pdf"])
output_component = gr.Audio()

demo = gr.Interface(
    fn=_read_abstract_aloud,
    inputs=input_component,
    outputs=output_component,
    title="Reading your abstract summary outloud",
    description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract followed by one called Introduction"
)

demo.launch()