mrsk1883's picture
Update app.py
cf20f36
raw
history blame
2.04 kB
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
import os
# Hugging Face checkpoint identifier shared by the model and its tokenizer.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# Both objects are created once at import time; summarize_pdf_abstract reads
# them as module globals on every call.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sentence boundary: a whitespace character that directly follows "." or "?",
# unless the period closes a dotted abbreviation ("e.g.") or a capitalised
# two-letter title ("Dr."). Compiled once so calls do not repeat the lookup.
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
)


def extract_first_sentence(text: str) -> str:
    """Return the first sentence of *text*.

    The text is cut at the first sentence boundary (see
    ``_SENTENCE_BOUNDARY``). The terminating "." or "?" is kept; the
    whitespace after it is dropped. When no boundary exists, including for
    the empty string, *text* is returned unchanged.

    Args:
        text: The text to take the first sentence from.

    Returns:
        The first sentence, or *text* itself if it has no sentence boundary.
    """
    # split() always yields at least one element, so no empty-result fallback
    # is needed; maxsplit=1 stops scanning after the first boundary because
    # only the leading piece is used.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]
def _find_abstract_page_text(reader):
    """Return the text of the first page mentioning "Abstract" or "Introduction".

    Returns an empty string when no page matches.
    """
    for page in reader.pages:
        # Extract once per page and reuse the result for both the keyword
        # check and the return value; `or ""` guards against a falsy result.
        page_text = page.extract_text() or ""
        if "Abstract" in page_text or "Introduction" in page_text:
            return page_text
    return ""


def summarize_pdf_abstract(pdf_file):
    """Summarize a PDF's abstract page in one sentence and synthesize speech.

    The first page whose text contains "Abstract" or "Introduction" is fed to
    the module-level summarization model. The first sentence of the generated
    summary is then converted to English speech with gTTS.

    Args:
        pdf_file: The uploaded PDF, passed straight to ``PdfReader``.

    Returns:
        A ``(summary_sentence, audio_bytes)`` tuple: the one-sentence summary
        and the raw bytes gTTS wrote for it.

    Raises:
        Exception: On any failure (unreadable PDF, no matching page, model or
            text-to-speech error). The message is that of the underlying
            error, which is attached as ``__cause__``.
    """
    try:
        abstract_text = _find_abstract_page_text(PdfReader(pdf_file))
        if not abstract_text:
            # Fail loudly instead of asking the model to summarize nothing.
            raise ValueError(
                "No page containing 'Abstract' or 'Introduction' was found "
                "in the PDF."
            )

        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop tokenizer control tokens so they are neither displayed in the
        # summary box nor read aloud by the speech synthesizer.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio into an in-memory buffer
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue()
    except Exception as e:
        # Keep the original contract (a plain Exception carrying the message)
        # but chain the cause so the real traceback is not lost.
        raise Exception(str(e)) from e
# Gradio UI wiring: a single PDF upload goes to summarize_pdf_abstract, whose
# two return values fill the "Summary" textbox and the audio component.
interface = gr.Interface(
fn=summarize_pdf_abstract,
inputs=[gr.File(label="Upload PDF")],
outputs=[gr.Textbox(label="Summary"), gr.Audio()],
title="PDF Summarization & Audio Tool",
description="""PDF Summarization App. This app summarizes the abstract of a PDF in one sentence and generates an audio of it. Only upload PDF's with Abstracts
Please read the README.MD for information about the app and sample PDFs.""",
# Sample PDF resolved relative to this file's directory.
# NOTE(review): cache_examples=True presumably runs the function on the
# example ahead of time, so the PDF must exist next to this file - confirm.
examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],cache_examples=True,
)
# Runs at import time (no __main__ guard).
# NOTE(review): share=True requests a public share link; hosted environments
# may ignore it - confirm whether it is still wanted.
interface.launch(share=True)