Spaces:
Runtime error
Runtime error
import gradio as gr | |
from PyPDF2 import PdfReader | |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
from gtts import gTTS | |
from io import BytesIO | |
import re | |
import os | |
# Hugging Face Hub identifier of the seq2seq checkpoint used for summarization.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# Load the tokenizer and the model once at import time so every request
# handled by the app reuses the same instances.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Whitespace that ends a sentence: it must directly follow '.', '?' or '!',
# but not an abbreviation such as "e.g." (first lookbehind) or a title such
# as "Mr." (second lookbehind).
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')


def extract_first_sentence(text: str) -> str:
    """Return the first sentence of *text*.

    A sentence ends at the first whitespace character that follows ``.``,
    ``?`` or ``!`` and is not part of a recognised abbreviation. The
    terminating punctuation is kept; the whitespace after it is not.

    Args:
        text: The text to take the first sentence from.

    Returns:
        The first sentence, or *text* unchanged when it contains no sentence
        boundary (including when it is empty).
    """
    boundary = _SENTENCE_BOUNDARY.search(text)
    if boundary is None:
        return text
    # Only the first boundary matters, so slice instead of splitting it all.
    return text[:boundary.start()]
def summarize_pdf_abstract(pdf_file):
    """Summarize a PDF's abstract in one sentence and synthesize it as speech.

    The text of the first page containing "Abstract" or "Introduction" is fed
    to the module-level ``tokenizer``/``model`` pair. The first sentence of the
    generated summary is then converted to audio with gTTS.

    Args:
        pdf_file: The uploaded file as delivered by the ``gr.File`` input; it
            is passed straight to ``PdfReader``.

    Returns:
        A ``(summary_sentence, audio_bytes)`` tuple: the one-sentence summary
        and the raw bytes gTTS wrote for it.

    Raises:
        gr.Error: If any step fails (unreadable PDF, no matching page, model
            or text-to-speech failure). It is an ``Exception`` subclass and
            carries the message of the underlying error.
    """
    try:
        reader = PdfReader(pdf_file)

        abstract_text = ""
        for page in reader.pages:
            # Text extraction is costly, so run it once per page.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        # Without a matching page the model would be asked to summarize an
        # empty string; fail with an explicit message instead.
        if not abstract_text:
            raise ValueError(
                "No page containing an Abstract or Introduction was found in the PDF."
            )

        # truncation=True caps the input at the tokenizer's configured maximum.
        inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)
        # Drop special tokens so they do not leak into the displayed summary
        # or get read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue()
    except Exception as exc:
        # gr.Error surfaces the message in the Gradio UI; chaining keeps the
        # original traceback available in the logs.
        raise gr.Error(str(exc)) from exc
# Example PDF offered in the UI; expected in the same directory as this script.
_example_pdf = os.path.join(os.path.dirname(__file__), "Hidden Technical Debt in MLSystems.pdf")

# Text shown under the app title.
_app_description = """PDF Summarization App. This app summarizes the abstract of a PDF in one sentence and generates an audio of it. Only upload PDF's with Abstracts
Please read the README.MD for information about the app and sample PDFs."""

# One file input, mapped to a text summary and an audio rendering of it.
interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[gr.Textbox(label="Summary"), gr.Audio()],
    title="PDF Summarization & Audio Tool",
    description=_app_description,
    examples=[_example_pdf],
)

# Start the web UI, requesting a public share link.
interface.launch(share=True)