# https://huggingface.co/spaces/barser65/assessment3
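#
# This Hugging Face Space app takes an uploaded PDF of a paper, extracts the text
# of its Abstract section, summarizes it with the facebook/bart-large-cnn model,
# and reads the summary aloud with Microsoft's SpeechT5 text-to-speech model. The
# Gradio interface at the bottom wires the uploaded file into converti() and plays
# back the generated audio.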

def converti(path):
    import pip

    def install(package):
        # Install a dependency at runtime (newer pip versions moved main() to pip._internal)
        if hasattr(pip, 'main'):
            pip.main(['install', package])
        else:
            pip._internal.main(['install', package])

    install('git+https://github.com/huggingface/transformers.git')
    install('datasets')
    install('sentencepiece')
    install('PyPDF2')
    install('pdfminer.six')
    install('pdfplumber')
    # poppler-utils, tesseract-ocr and libtesseract-dev are system (apt) packages, not
    # pip packages; on a Hugging Face Space they belong in packages.txt instead.
    
    # To read the PDF
    import PyPDF2
    # To analyze the PDF layout and extract text
    from pdfminer.high_level import extract_pages, extract_text
    from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
    # To extract text from tables in PDF
    import pdfplumber
    # To remove the additional created files
    import os
    
    # Create a function to extract text
    
    def text_extraction(element):
        # Extracting the text from the in-line text element
        line_text = element.get_text()
    
        # Find the formats of the text
        # Initialize the list with all the formats that appeared in the line of text
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                # Iterating through each character in the line of text
                for character in text_line:
                    if isinstance(character, LTChar):
                        # Append the font name of the character
                        line_formats.append(character.fontname)
                        # Append the font size of the character
                        line_formats.append(character.size)
        # Find the unique font sizes and names in the line
        format_per_line = list(set(line_formats))
    
        # Return a tuple with the text in each line along with its format
        return (line_text, format_per_line)
    
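    # Walk every page of the PDF, collecting the plain text and per-line font
    # formats; the image and table slots in the result stay empty in this version.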
    def read_pdf(pdf_path):
      # create a PDF file object
      pdfFileObj = open(pdf_path, 'rb')
      # create a PDF reader object
      pdfReaded = PyPDF2.PdfReader(pdfFileObj)
    
      # Create the dictionary to extract text from each image
      text_per_page = {}
      # We extract the pages from the PDF
      for pagenum, page in enumerate(extract_pages(pdf_path)):
          print("Elaborating Page_" +str(pagenum))
          # Initialize the variables needed for the text extraction from the page
          pageObj = pdfReaded.pages[pagenum]
          page_text = []
          line_format = []
          text_from_images = []
          text_from_tables = []
          page_content = []
          # Initialize the number of the examined tables
          table_num = 0
          first_element = True
          # Note: this flag is never switched to True below, so table text is
          # neither extracted nor skipped in this trimmed-down version
          table_extraction_flag = False
          # Open the pdf file with pdfplumber to locate tables on this page
          pdf = pdfplumber.open(pdf_path)
          # Find the examined page
          page_tables = pdf.pages[pagenum]
          # Find the tables on the page (currently unused)
          tables = page_tables.find_tables()
          # Close the pdfplumber handle so file descriptors are not leaked
          pdf.close()
    
    
          # Find all the elements
          page_elements = [(element.y1, element) for element in page._objs]
          # Sort all the elements as they appear in the page
          page_elements.sort(key=lambda a: a[0], reverse=True)
    
          # Find the elements that compose the page
          for i,component in enumerate(page_elements):
              # Extract the position of the top side of the element in the PDF
              pos= component[0]
              # Extract the element of the page layout
              element = component[1]
    
              # Check if the element is a text element
              if isinstance(element, LTTextContainer):
                  # Check if the text appeared in a table
                  if table_extraction_flag == False:
                      # Use the function to extract the text and format for each text element
                      (line_text, format_per_line) = text_extraction(element)
                      # Append the text of each line to the page text
                      page_text.append(line_text)
                      # Append the format for each line containing text
                      line_format.append(format_per_line)
                      page_content.append(line_text)
                  else:
                      # Omit the text that appeared in a table
                      pass
    
          # Create the key of the dictionary
          dctkey = 'Page_'+str(pagenum)
          # Add the list of lists as the value of the page key
          text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
    
      # Closing the pdf file object
      pdfFileObj.close()
    
      return text_per_page

    # Gradio's "file" input may pass a temporary file object rather than a plain
    # path string, so resolve it to a filesystem path before reading the PDF.
    pdf_path = path.name if hasattr(path, 'name') else path
    text_per_page = read_pdf(pdf_path)

    # Look for the paper's Abstract section in the extracted text; run the search
    # once and fail loudly instead of looping forever when no abstract is found.
    abstr = ''
    for par in range(len(text_per_page)):
        for x in text_per_page['Page_' + str(par)]:
            mystring = ' '.join(map(str, x))
            idx = mystring.find('Abstract\n')
            if idx >= 0:
                # Keep the text between the "Abstract" heading and section "1"
                abstr0 = mystring[idx + len('Abstract\n'):]
                abstr = abstr0[:abstr0.find('1\n')]
                break
        if abstr:
            break
    if not abstr:
        raise ValueError("Could not find an 'Abstract' section in the PDF.")
    
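    # Summarize the extracted abstract with a pretrained BART summarization model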
    from transformers import pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(abstr, max_length=56)
    summary_text = summary[0]['summary_text']

    # Convert the summary to speech with SpeechT5 and the HiFi-GAN vocoder
    import torch
    from datasets import load_dataset
    from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Tokenize the summary text
    inputs = processor(text=summary_text, return_tensors="pt")

    # Load a speaker embedding (x-vector) that determines the voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Generate the waveform directly; passing the vocoder turns the spectrogram into audio
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Gradio's audio output expects a (sample_rate, numpy array) tuple rather than an
    # IPython Audio object; SpeechT5 produces 16 kHz audio
    return 16000, speech.numpy()
    
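# Gradio UI: upload a PDF file, get back the spoken summary as audio.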
import gradio as gr

iface = gr.Interface(fn=converti, inputs="file", outputs="audio")
iface.launch()