# my space: https://huggingface.co/spaces/vividsd/practice

import gradio as gr
from transformers import pipeline
from tempfile import NamedTemporaryFile
import PyPDF2
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_pages, extract_text
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pytesseract
import os
import numpy as np
import torch
import sentencepiece
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import SpeechT5HifiGan
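
# read_pdf() below calls several helpers (text_extraction, crop_image, convert_to_images,
# image_to_text, extract_table, table_converter) that this file never defines. The versions
# here are minimal sketches based on the standard pdfminer / PyPDF2 / pdf2image / pytesseract /
# pdfplumber APIs; the temporary file names 'cropped_image.pdf' and 'PDF_image.png' simply
# match the calls made inside read_pdf().

def text_extraction(element):
    # Extract the text of the pdfminer text container
    line_text = element.get_text()
    # Collect the font name and size of every character on the line
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    line_formats.append(character.fontname)
                    line_formats.append(character.size)
    format_per_line = list(set(line_formats))
    return (line_text, format_per_line)

def crop_image(element, pageObj):
    # Crop the figure's bounding box out of the page and save it as a one-page PDF
    pageObj.mediabox.lower_left = (element.x0, element.y0)
    pageObj.mediabox.upper_right = (element.x1, element.y1)
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

def convert_to_images(input_file):
    # Render the cropped one-page PDF to a PNG image
    images = convert_from_path(input_file)
    images[0].save('PDF_image.png', 'PNG')

def image_to_text(image_path):
    # Run OCR on the rendered image
    img = Image.open(image_path)
    return pytesseract.image_to_string(img)

def extract_table(pdf_path, page_num, table_num):
    # Read the requested table of the requested page with pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        table = table_page.extract_tables()[table_num]
    return table

def table_converter(table):
    # Convert the list-of-rows table into a pipe-separated string
    table_string = ''
    for row in table:
        cleaned_row = ['None' if item is None else item.replace('\n', ' ') for item in row]
        table_string += '|' + '|'.join(cleaned_row) + '|\n'
    return table_string.rstrip('\n')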

def read_pdf(pdf_path):
  # create a PDF file object
  pdfFileObj = open(pdf_path, 'rb')
  # create a PDF reader object
  pdfReaded = PyPDF2.PdfReader(pdfFileObj)

  # Create the dictionary that will hold the extracted text for each page
  text_per_page = {}
  # We extract the pages from the PDF
  for pagenum, page in enumerate(extract_pages(pdf_path)):
      print("Elaborating Page_" +str(pagenum))
      # Initialize the variables needed for the text extraction from the page
      pageObj = pdfReaded.pages[pagenum]
      page_text = []
      line_format = []
      text_from_images = []
      text_from_tables = []
      page_content = []
      # Initialize the number of the examined tables
      table_num = 0
      first_element= True
      table_extraction_flag= False
      # Open the pdf file
      pdf = pdfplumber.open(pdf_path)
      # Find the examined page
      page_tables = pdf.pages[pagenum]
      # Find the number of tables on the page
      tables = page_tables.find_tables()


      # Find all the elements
      page_elements = [(element.y1, element) for element in page._objs]
      # Sort all the elements as they appear in the page
      page_elements.sort(key=lambda a: a[0], reverse=True)

      # Find the elements that compose the page
      for i,component in enumerate(page_elements):
          # Extract the position of the top side of the element in the PDF
          pos= component[0]
          # Extract the element of the page layout
          element = component[1]

          # Check if the element is a text element
          if isinstance(element, LTTextContainer):
              # Check if the text appeared in a table
              if table_extraction_flag == False:
                  # Use the function to extract the text and format for each text element
                  (line_text, format_per_line) = text_extraction(element)
                  # Append the text of each line to the page text
                  page_text.append(line_text)
                  # Append the format for each line containing text
                  line_format.append(format_per_line)
                  page_content.append(line_text)
              else:
                  # Omit the text that appeared in a table
                  pass

          # Check the elements for images
          if isinstance(element, LTFigure):
              # Crop the image from the PDF
              crop_image(element, pageObj)
              # Convert the cropped pdf to an image
              convert_to_images('cropped_image.pdf')
              # Extract the text from the image
              image_text = image_to_text('PDF_image.png')
              text_from_images.append(image_text)
              page_content.append(image_text)
              # Add a placeholder in the text and format lists
              page_text.append('image')
              line_format.append('image')

          # Check the elements for tables
          if isinstance(element, LTRect):
              # If the first rectangular element
              if first_element == True and (table_num+1) <= len(tables):
                  # Find the bounding box of the table
                  lower_side = page.bbox[3] - tables[table_num].bbox[3]
                  upper_side = element.y1
                  # Extract the information from the table
                  table = extract_table(pdf_path, pagenum, table_num)
                  # Convert the table information in structured string format
                  table_string = table_converter(table)
                  # Append the table string into a list
                  text_from_tables.append(table_string)
                  page_content.append(table_string)
                  # Set the flag as True to avoid the content again
                  table_extraction_flag = True
                  # Make it another element
                  first_element = False
                  # Add a placeholder in the text and format lists
                  page_text.append('table')
                  line_format.append('table')

              # Check whether we have moved past the extracted table on the page
              # (guarded so lower_side/upper_side are only read after a table was found,
              # and so the last element on a page cannot cause an index error)
              if table_extraction_flag:
                  if element.y0 >= lower_side and element.y1 <= upper_side:
                      pass
                  elif i + 1 >= len(page_elements) or not isinstance(page_elements[i+1][1], LTRect):
                      table_extraction_flag = False
                      first_element = True
                      table_num += 1


      # Create the key of the dictionary
      dctkey = 'Page_'+str(pagenum)
      # Add the list of lists as the value of the page key
      text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

  # Closing the pdf file object
  pdfFileObj.close()

  return text_per_page

# The steps below are wrapped into one function so the Gradio app can run them
# end-to-end for every uploaded PDF (the name abstract_to_audio is chosen here;
# the original script ran these steps at module level with undefined variables).

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Load the summarization and text-to-speech models once at start-up so they are
# not reloaded on every request
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def abstract_to_audio(pdf_file):
    # Gradio may hand over a tempfile-like object or a plain path
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

    text_per_page = read_pdf(pdf_path)

    # The abstract is expected on the first page
    page_0 = text_per_page['Page_0']

    # Flatten the per-page lists and normalize the whitespace
    page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)]
    for i in range(len(page_0_clean)):
        page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip()

    # Instead of cleaning an exact position as I did in my previous code (which I don't
    # know here), I identify the abstract section: collect the lines between the
    # "Abstract" heading and the "1 Introduction" heading
    abstract_marker = 'abstract'
    intro_marker = 'introduction'
    found_abstract = False
    extracted_abstract = ""

    for line in page_0_clean:
        lower_line = line.lower().strip()
        if lower_line == abstract_marker:
            found_abstract = True
            continue
        if "1" in lower_line and intro_marker in lower_line:
            found_abstract = False
        if found_abstract:
            extracted_abstract += line + " "

    extracted_abstract = extracted_abstract.replace("Abstract", "").strip()

    # Summarizing the abstract into one short sentence
    summarized_text = summarizer(extracted_abstract, max_length=20, min_length=10, do_sample=False)
    # Instead of copying and pasting the summary as in my previous code, keep the text directly
    sentence = summarized_text[0]['summary_text']

    # Generating the audio of the summary by using my previous code
    inputs = processor(text=sentence, return_tensors="pt")
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # gr.Audio accepts a (sample_rate, numpy_array) tuple; SpeechT5 produces 16 kHz audio
    return (16000, speech.numpy())


# Creating the Gradio app
input_component = gr.File(file_types=[".pdf"])
output_component = gr.Audio()

demo = gr.Interface(
    fn=abstract_to_audio,
    inputs=input_component,
    outputs=output_component,
    title="Reading your abstract summary out loud",
    description="Upload a PDF that contains an Abstract. The abstract is summarized into one sentence and read out loud. Only PDFs with an Abstract section are supported."
)

demo.launch()
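
# Note: to run this as a Space, the environment presumably needs (via requirements.txt)
# packages such as gradio, transformers, torch, datasets, sentencepiece, soundfile, PyPDF2,
# pdfminer.six, pdfplumber, pdf2image, pytesseract, Pillow and numpy, plus the system
# packages poppler-utils (for pdf2image) and tesseract-ocr (for pytesseract) in packages.txt.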