Spaces:
Runtime error
Runtime error
import gradio as gr | |
import transformers | |
from transformers import pipeline | |
import PyPDF2 | |
import pdfplumber | |
from pdfminer.high_level import extract_pages, extract_text | |
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure | |
import re | |
import torch | |
from datasets import load_dataset | |
import soundfile as sf | |
from IPython.display import Audio | |
import numpy as np | |
from datasets import load_dataset | |
import sentencepiece as spm | |
import os | |
import tempfile | |
def text_extraction(element):
    """Return the text of a layout element together with the formats it uses.

    Args:
        element: A text element that provides ``get_text()`` and can be
            iterated to yield its lines; each line is iterated in turn to
            yield its characters.

    Returns:
        A tuple ``(line_text, format_per_line)``. ``line_text`` is whatever
        ``element.get_text()`` returns. ``format_per_line`` lists every
        distinct font name and font size found on the element's ``LTChar``
        characters, in order of first appearance.
    """
    line_text = element.get_text()

    # Font names and sizes are collected into one flat list, name first and
    # size second for every character.
    line_formats = []
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                line_formats.append(character.fontname)
                line_formats.append(character.size)

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the font names in an order that changes from run to run.
    format_per_line = list(dict.fromkeys(line_formats))
    return (line_text, format_per_line)
def read_pdf(pdf_pathy):
    """Extract the text and font formats of every page of a PDF.

    Args:
        pdf_pathy: Path of the PDF file, handed to ``extract_pages``.

    Returns:
        A dict keyed ``'Page_<n>'`` with ``n`` counting from 0. Each value is
        the list ``[page_text, line_format, page_content]`` where
        ``page_text`` and ``page_content`` both hold the text of each text
        element in top-to-bottom order and ``line_format`` holds the matching
        format list returned by ``text_extraction``.
    """
    # Only the layout pages are needed. The PyPDF2 reader and the per-page
    # pdfplumber handle were opened but never used (the pdfplumber handle
    # was also never closed), so no extra file handles are opened here.
    text_per_pagy = {}

    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))

        # A larger y1 means the element sits higher on the page, so sorting
        # in descending order walks the page from top to bottom.
        page_elements = sorted(page, key=lambda element: element.y1, reverse=True)

        page_text = []
        line_format = []
        for element in page_elements:
            # Non-text elements are skipped.
            if isinstance(element, LTTextContainer):
                line_text, format_per_line = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)

        # page_content mirrors page_text but stays a separate list object.
        page_content = list(page_text)
        text_per_pagy['Page_' + str(pagenum)] = [page_text, line_format, page_content]

    return text_per_pagy
# Clean up the extracted text
import re | |
def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    single_spaced = whitespace_run.sub(' ', text)
    return single_spaced.strip()
def extract_abstract(text_per_pagy):
    """Return the abstract found on the first page that contains one.

    Args:
        text_per_pagy: Mapping of page key to that page's text as one string.

    Returns:
        A single space followed by the abstract text, or ``""`` when no page
        contains the word "Abstract". The abstract runs from just after
        "Abstract" to the first end marker that occurs after it, trying the
        markers in the listed order, or to the end of the page if none does.
    """
    heading = "Abstract"
    section_enders = ("Introduction", "Summary", "Overview", "Background")

    for raw_page in text_per_pagy.values():
        if not raw_page:
            continue

        # Join words that were split by a hyphen at a line break.
        joined = raw_page.replace("- ", "")

        heading_at = joined.find(heading)
        if heading_at == -1:
            continue

        # Skip the heading itself plus the one character that follows it.
        body_start = heading_at + len(heading) + 1

        # Markers are tried in list order; the first one present wins.
        hits = (joined.find(ender, body_start) for ender in section_enders)
        body_end = next((hit for hit in hits if hit != -1), len(joined))

        # Only the first page with an abstract is used.
        return " " + joined[body_start:body_end].strip()

    return ""
# Models and the speaker embedding are expensive to load, so they are built
# on first use and reused by later requests.
_MODEL_CACHE = {}


def _load_once(key, factory):
    """Return the object cached under *key*, building it with *factory* on first use."""
    if key not in _MODEL_CACHE:
        _MODEL_CACHE[key] = factory()
    return _MODEL_CACHE[key]


def _build_speaker_embedding():
    """Load the x-vector passed to the speech model as its speaker embedding."""
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def main_function(uploaded_filepath):
    """Summarise the abstract of an uploaded PDF and read the summary aloud.

    Args:
        uploaded_filepath: Path of the uploaded PDF, or ``None`` when nothing
            was uploaded.

    Returns:
        A tuple ``(summary, audio_file_path)``. ``summary`` is the generated
        summary text and ``audio_file_path`` is the path of a WAV file with
        the spoken summary. When no file was uploaded or no abstract could be
        found, the first item is an explanatory message and the second is
        ``None``.
    """
    # Nothing to do without an uploaded file.
    if uploaded_filepath is None:
        return "No file loaded", None

    # Read the PDF and flatten each page into one cleaned string. A new dict
    # is built rather than overwriting the one being iterated.
    extracted = read_pdf(uploaded_filepath)
    cleaned_pages = {
        key: clean_text(' '.join(value[0]))
        for key, value in extracted.items()
    }

    abstract_text = extract_abstract(cleaned_pages)
    # Do not hand the summariser an empty input.
    if not abstract_text.strip():
        return "No abstract found in the document.", None

    # Summarise the abstract.
    summarizer = _load_once(
        "summarizer",
        lambda: pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify"),
    )
    summary = summarizer(abstract_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']

    # Turn the summary into speech.
    synthesiser = _load_once(
        "synthesiser",
        lambda: pipeline("text-to-speech", model="microsoft/speecht5_tts"),
    )
    speaker_embedding = _load_once("speaker_embedding", _build_speaker_embedding)
    speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding})

    # Each request gets its own temp file so concurrent requests cannot
    # overwrite each other's audio. The handle is closed before writing and
    # the file is kept (delete=False) so the path stays valid for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file_path = tmp.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    return summary, audio_file_path
# Input component; the original note says it was changed from "pdf" to "file".
pdf_input = gr.File(type="filepath")

# One output per value returned by main_function: the summary text, then the
# path of the audio file.
summary_outputs = [
    gr.Textbox(label="Summary Text"),
    gr.Audio(label="Summary Audio", type="filepath"),
]

iface = gr.Interface(fn=main_function, inputs=pdf_input, outputs=summary_outputs)

# Start the app
if __name__ == "__main__":
    iface.launch()