vividsd committed on
Commit
a92e01d
1 Parent(s): 589fdb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -59
app.py CHANGED
@@ -8,79 +8,182 @@ from bark import SAMPLE_RATE, generate_audio, preload_models
8
  from scipy.io.wavfile import write as write_wav
9
  import torch
10
 
11
def summarize_abstract_from_pdf(pdf_file_path):
    """Summarize the abstract found on the first page of a PDF.

    The first page is scanned line by line. Text is collected from the line
    after a heading that reads exactly "abstract" (case-insensitive) up to a
    line that contains both "1" and "introduction". The collected text is
    summarized twice with the same model: once to at most 150 tokens, then
    again to at most 25 tokens.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The second, shorter summary as a string.

    Raises:
        ValueError: If no abstract text could be located on the first page.
    """
    abstract_string = 'abstract'
    intro_string = 'introduction'

    # Only the first page is read; the abstract is expected to be there.
    with open(pdf_file_path, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        text = reader.pages[0].extract_text() or ""

    found_abstract = False
    abstract_lines = []
    for line in text.splitlines():
        lowered = line.lower().strip()
        if lowered == abstract_string:
            found_abstract = True
            # Skip the heading itself instead of stripping the word
            # "Abstract" from the whole text afterwards, which also removed
            # legitimate occurrences inside the abstract body.
            continue
        if "1" in lowered and intro_string in lowered:
            found_abstract = False
        if found_abstract:
            abstract_lines.append(line.strip())

    # Join with spaces: splitlines() drops the line breaks, so plain
    # concatenation would fuse the last word of a line with the first word
    # of the next one.
    extracted_text_string = " ".join(part for part in abstract_lines if part)
    if not extracted_text_string:
        raise ValueError(
            "No abstract section was found on the first page of the PDF."
        )

    # NOTE(review): the pipeline is rebuilt on every call; consider caching
    # it at module level if this function is called repeatedly.
    summarizer = pipeline(
        "summarization",
        "pszemraj/led-base-book-summary",
        device=0 if torch.cuda.is_available() else -1,
    )

    # First pass: condense the abstract.
    summarized_abstract = summarizer(
        extracted_text_string,
        min_length=16,
        max_length=150,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        early_stopping=True,
    )

    # Second pass: summarize the summary to get a short text.
    summarized_abstract2 = summarizer(
        summarized_abstract[0]['summary_text'],
        min_length=16,
        max_length=25,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        early_stopping=True,
    )

    # Return the summarized abstract as a string
    return summarized_abstract2[0]['summary_text']
63
-
64
def generate_audio_func(pdf_file):
    """Turn the summarized abstract of an uploaded PDF into a WAV file.

    Args:
        pdf_file: Uploaded file object; only its ``name`` attribute (the
            path on disk) is used.

    Returns:
        Path of a temporary ``.wav`` file containing the spoken summary.
        The file is created with ``delete=False``, so the caller owns it.
    """
    pdf_file_path = pdf_file.name

    # Summarize the abstract, then synthesize speech from the summary.
    text_prompt = summarize_abstract_from_pdf(pdf_file_path)
    audio_array = generate_audio(text_prompt)

    # Reserve a temporary path and close the handle before writing, so the
    # write does not depend on reopening a file that is still open.
    with NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
        wav_file_path = temp_wav_file.name

    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around when cast to int16.
    pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

    # Use the sample rate exported by bark rather than a hard-coded 22050,
    # so the WAV header matches the rate the audio was generated at.
    write_wav(wav_file_path, SAMPLE_RATE, pcm)
    return wav_file_path
77
 
78
- # Create the Gradio app
79
  input_component = gr.File(file_types=["pdf"])
80
  output_component = gr.Audio()
81
 
82
  demo = gr.Interface(
83
- fn=generate_audio_func,
84
  inputs=input_component,
85
  outputs=output_component,
86
  title="Reading your abstract summary outloud",
 
8
  from scipy.io.wavfile import write as write_wav
9
  import torch
10
 
11
def read_pdf(pdf_path):
    """Extract text, image text and table text from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict keyed ``'Page_<n>'`` (``n`` starts at 0). Each value is the
        list ``[page_text, line_format, text_from_images, text_from_tables,
        page_content]`` built for that page.

    Note:
        Relies on ``text_extraction``, ``crop_image``, ``convert_to_images``,
        ``image_to_text``, ``extract_table`` and ``table_converter``, which
        are not defined in this block.
    """
    # Dictionary that collects the extracted content of each page
    text_per_page = {}

    # Open the requested file (not a hard-coded path) and open pdfplumber
    # once for the whole document; both are closed even if extraction fails.
    with open(pdf_path, 'rb') as pdf_file_obj, pdfplumber.open(pdf_path) as pdf:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)

        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Elaborating Page_" +str(pagenum))

            # Per-page state
            page_obj = pdf_reader.pages[pagenum]
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            table_num = 0
            first_element = True
            table_extraction_flag = False
            # Vertical bounds of the table currently being skipped. Reset on
            # every page so bounds from a previous page are never reused.
            lower_side = None
            upper_side = None

            # Tables detected by pdfplumber on this page
            tables = pdf.pages[pagenum].find_tables()

            # Sort the layout elements by their top edge, highest first
            page_elements = [(element.y1, element) for element in page._objs]
            page_elements.sort(key=lambda a: a[0], reverse=True)

            for i, (_, element) in enumerate(page_elements):
                # Text elements: skipped while a table is being consumed,
                # because that text is already part of the table string.
                if isinstance(element, LTTextContainer) and not table_extraction_flag:
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)

                # Figures: crop, rasterize and read the text from the image.
                # NOTE(review): presumably the helpers communicate through
                # the fixed file names below — confirm against their code.
                if isinstance(element, LTFigure):
                    crop_image(element, page_obj)
                    convert_to_images('cropped_image.pdf')
                    image_text = image_to_text('PDF_image.png')
                    text_from_images.append(image_text)
                    page_content.append(image_text)
                    # Placeholders keep the text and format lists aligned
                    page_text.append('image')
                    line_format.append('image')

                # Rectangles: treated as the outline of a table
                if isinstance(element, LTRect):
                    # First rectangle of a table that has not been read yet
                    if first_element and (table_num + 1) <= len(tables):
                        # Bounding box of the table
                        lower_side = page.bbox[3] - tables[table_num].bbox[3]
                        upper_side = element.y1
                        table = extract_table(pdf_path, pagenum, table_num)
                        table_string = table_converter(table)
                        text_from_tables.append(table_string)
                        page_content.append(table_string)
                        # Skip the table's own text elements from now on
                        table_extraction_flag = True
                        first_element = False
                        # Placeholders keep the text and format lists aligned
                        page_text.append('table')
                        line_format.append('table')

                    # Without known bounds (no table found on this page)
                    # there is nothing to close, so the rectangle is ignored.
                    if lower_side is not None:
                        inside_table = (
                            element.y0 >= lower_side and element.y1 <= upper_side
                        )
                        # The last element has no successor; treat that as
                        # "next element is not a rectangle" instead of
                        # indexing past the end of the list.
                        next_is_rect = (
                            i + 1 < len(page_elements)
                            and isinstance(page_elements[i + 1][1], LTRect)
                        )
                        if not inside_table and not next_is_rect:
                            table_extraction_flag = False
                            first_element = True
                            table_num += 1

            # Store the lists collected for this page under its key
            dctkey = 'Page_' + str(pagenum)
            text_per_page[dctkey] = [
                page_text,
                line_format,
                text_from_images,
                text_from_tables,
                page_content,
            ]

    return text_per_page
122
+
123
def generate_audio_func(pdf_file):
    """Read the abstract of an uploaded PDF, summarize it and speak it.

    The original statements ran at module level and referenced names that
    were never defined (``pdf_file``, ``extracted_text_string``, ``text``,
    ``summarized_text``), so importing the module failed. They are wrapped
    in a function that receives the uploaded file instead.

    NOTE(review): the ``gr.Interface`` in this file still passes
    ``fn=read_pdf``; it presumably should point at this function — confirm.

    Args:
        pdf_file: Uploaded file object; only its ``name`` attribute (the
            path on disk) is used.

    Returns:
        A ``(sample_rate, samples)`` tuple with ``sample_rate`` 16000 and
        ``samples`` a NumPy array holding the synthesized speech.

    Raises:
        ValueError: If the PDF has no first page or no abstract text is
            found on it.
    """
    # Imported here because the visible part of the file does not import
    # these names at the top. NOTE(review): move to the file's import block
    # if preferred.
    from datasets import load_dataset
    from transformers import (
        SpeechT5ForTextToSpeech,
        SpeechT5HifiGan,
        SpeechT5Processor,
        pipeline,
    )

    pdf_path = pdf_file.name
    text_per_page = read_pdf(pdf_path)

    page_0 = text_per_page.get('Page_0')
    if page_0 is None:
        raise ValueError("The PDF does not contain a first page.")

    # Index 4 is the page_content list built by read_pdf. Flattening every
    # sub-list would pick up the same text twice (page_text and
    # page_content) plus the 'image'/'table' placeholders.
    page_text = "\n".join(item for item in page_0[4] if isinstance(item, str))

    # The exact position of the abstract is unknown, so identify the section
    # by its heading and stop at the introduction heading.
    abstract = 'abstract'
    intro_string = 'introduction'
    found_abstract = False
    abstract_lines = []
    for line in page_text.splitlines():
        lowered = line.lower().strip()
        if lowered == abstract:
            found_abstract = True
            continue  # do not include the heading itself
        if "1" in lowered and intro_string in lowered:
            found_abstract = False
        if found_abstract:
            abstract_lines.append(line.strip())

    extracted_abstract = " ".join(part for part in abstract_lines if part)
    if not extracted_abstract:
        raise ValueError(
            "No abstract section was found on the first page of the PDF."
        )

    # Summarize the abstract and keep the result (previously it was only
    # printed, and the next line read an undefined variable).
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    summarized_text = summarizer(
        extracted_abstract, max_length=20, min_length=10, do_sample=False
    )
    sentence = summarized_text[0]['summary_text']

    # Generate the audio for the summary
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=sentence, return_tensors="pt")
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors", split="validation"
    )
    speaker_embeddings = torch.tensor(
        embeddings_dataset[7306]["xvector"]
    ).unsqueeze(0)

    # One call with the vocoder produces the waveform directly; the separate
    # spectrogram + vocoder pass synthesized the same audio a second time.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    # Return data a Gradio audio output can play, instead of building a
    # notebook-only Audio widget whose result was discarded.
    return 16000, speech.detach().cpu().numpy()
179
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ # Creating the Gradio app
182
  input_component = gr.File(file_types=["pdf"])
183
  output_component = gr.Audio()
184
 
185
  demo = gr.Interface(
186
+ fn=read_pdf,
187
  inputs=input_component,
188
  outputs=output_component,
189
  title="Reading your abstract summary outloud",