Spaces:
Runtime error
Runtime error
File size: 6,786 Bytes
c14f8d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import gradio as gr
import transformers
from transformers import pipeline
import PyPDF2
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import re
import torch
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
import numpy as np
from datasets import load_dataset
import sentencepiece as spm
import os
import tempfile
def text_extraction(element):
    """Return the text of a layout element plus the formats used inside it.

    Every child of ``element`` that is an ``LTTextContainer`` is scanned, and
    the font name and font size of each ``LTChar`` found in it are recorded.

    Args:
        element: A text layout element that offers ``get_text()`` and can be
            iterated to reach its children.

    Returns:
        A ``(line_text, format_per_line)`` tuple. ``format_per_line`` is a
        de-duplicated list mixing font names and font sizes; its order is
        whatever the intermediate ``set`` yields.
    """
    line_text = element.get_text()

    # Collect name/size pairs for every character, duplicates included.
    seen_formats = []
    for child in element:
        if not isinstance(child, LTTextContainer):
            continue
        for glyph in child:
            if isinstance(glyph, LTChar):
                seen_formats.extend((glyph.fontname, glyph.size))

    # Keep one entry per distinct font name / font size.
    return (line_text, list(set(seen_formats)))
def read_pdf(pdf_pathy):
    """Extract the text elements of every page of a PDF, top to bottom.

    Only pdfminer's ``extract_pages`` is needed for the extraction, so no
    other file handle or PDF reader is opened here (earlier revisions opened
    one pdfplumber document per page and never closed it).

    Args:
        pdf_pathy: Path of the PDF file to read.

    Returns:
        A dict keyed ``'Page_<n>'`` with ``n`` starting at 0. Each value is
        a three-item list ``[page_text, line_format, page_content]``:
        the text of each text element, the formats found in each text
        element (see ``text_extraction``), and a second list holding the
        same texts as ``page_text``.
    """
    text_per_pagy = {}

    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" +str(pagenum))

        page_text = []
        line_format = []

        # y1 is the top edge of an element; sorting it in descending order
        # visits the elements as they appear on the page, top first.
        for element in sorted(page, key=lambda item: item.y1, reverse=True):
            # Only text elements carry extractable text.
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)

        # page_content mirrors page_text as an independent list, keeping the
        # three-item value layout that callers index into.
        page_content = list(page_text)
        text_per_pagy['Page_' + str(pagenum)] = [page_text, line_format, page_content]

    return text_per_pagy
#performing a cleaning of the contents
import re
def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    return re.sub(r'\s+', ' ', text).strip()
def extract_abstract(text_per_pagy):
    """Pull the abstract out of the first page that contains the word "Abstract".

    For each non-empty page text, every "- " sequence is removed (treated as
    a line-break hyphen), then the text following "Abstract" plus one more
    character is taken up to the first end marker that is found. Markers are
    tried in the listed order, not by position in the text; when none is
    present the rest of the page is used.

    Args:
        text_per_pagy: Dict mapping a page key to that page's text.

    Returns:
        A single space followed by the stripped abstract, or an empty string
        when no page contains "Abstract".
    """
    heading = "Abstract"
    closing_markers = ("Introduction", "Summary", "Overview", "Background")

    for content in text_per_pagy.values():
        if not content:
            continue

        # Drop the hyphens that were used for line breaks.
        content = content.replace("- ", "")

        heading_at = content.find(heading)
        if heading_at == -1:
            continue

        # Skip the heading itself and the single character that follows it.
        body_start = heading_at + len(heading) + 1

        # First marker (in list order) that occurs after the heading wins.
        hits = (content.find(marker, body_start) for marker in closing_markers)
        body_end = next((hit for hit in hits if hit != -1), len(content))

        # Only the first page with an abstract is used.
        return " " + content[body_start:body_end].strip()

    return ""
def main_function(uploaded_filepath):
    """Summarise the abstract of an uploaded PDF and read the summary aloud.

    Args:
        uploaded_filepath: Path of the uploaded PDF, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(summary, audio_file_path)`` tuple. When no file was uploaded, or
        no abstract text could be extracted, the first item is an explanatory
        message and the second is ``None``.
    """
    # Nothing to do without an uploaded file.
    if uploaded_filepath is None:
        return "No file loaded", None

    # Read the PDF, then reduce every page to one cleaned string. A new dict
    # is built instead of rewriting the one that is being iterated.
    text_per_pagy = read_pdf(uploaded_filepath)
    cleaned_pages = {
        key: clean_text(' '.join(value[0]))
        for key, value in text_per_pagy.items()
    }
    abstract_text = extract_abstract(cleaned_pages)

    # Without any abstract text there is nothing meaningful to summarise.
    if not abstract_text.strip():
        return "No abstract found in the document", None

    # Summarise the abstract.
    # NOTE(review): both pipelines are rebuilt on every call; consider
    # creating them once if request latency matters.
    summarizer = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summary = summarizer(abstract_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']

    # Turn the summary into speech using a fixed speaker embedding.
    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding})

    # Write the audio to a unique temporary file so that simultaneous
    # requests do not overwrite each other's output. The handle is closed
    # before soundfile writes to the path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_file_path = tmp_audio.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    return summary, audio_file_path
# Gradio UI: one file upload in, the summary text and its audio out.
iface = gr.Interface(
    fn=main_function,
    # Changed from "pdf" to "file"
    inputs=gr.File(type="filepath"),
    outputs=[
        gr.Textbox(label="Summary Text"),
        gr.Audio(label="Summary Audio", type="filepath"),
    ],
)

# Start the app
if __name__ == "__main__":
    iface.launch()
|