File size: 1,676 Bytes
26ce009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aff5bd1
26ce009
 
aff5bd1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re

# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# Loaded once at module import, so every call to summarize_pdf_abstract reuses
# the same model and tokenizer objects instead of reloading them per request.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Whitespace that directly follows sentence-ending punctuation (".", "?" or
# "!"). The two negative lookbehinds keep dotted abbreviations such as "e.g."
# and two-letter titles such as "Dr." from being treated as sentence ends.
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
)


def extract_first_sentence(text):
    """Return the first sentence of *text*.

    A sentence ends at the first whitespace character that follows ".", "?"
    or "!", unless that punctuation belongs to a dotted abbreviation or a
    two-letter title (see ``_SENTENCE_BOUNDARY``).

    Args:
        text: The string to cut.

    Returns:
        The text up to and including the first sentence terminator, or the
        whole of *text* (possibly empty) when no boundary is found.
    """
    # split() always yields at least one element, so indexing [0] is safe
    # even for an empty string; maxsplit=1 stops after the first boundary
    # instead of splitting the whole text.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]

def summarize_pdf_abstract(pdf_file):
    """Summarize a PDF's abstract page in one sentence and synthesize speech.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is fed to the module-level summarization model. The
    first sentence of the generated summary is kept and converted to speech
    with gTTS.

    Args:
        pdf_file: The uploaded file as delivered by the ``gr.File`` input;
            it is passed straight to ``PdfReader``.

    Returns:
        A ``(summary_sentence, audio_bytes)`` tuple: the one-sentence
        summary and the audio data written by gTTS.

    Raises:
        gr.Error: If no file was uploaded, no matching page was found, the
            summary came out empty, or any step (PDF parsing, model
            inference, speech synthesis) failed. The original exception is
            chained as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("Please upload a PDF file.")

        reader = PdfReader(pdf_file)
        abstract_text = ""
        for page in reader.pages:
            # Extract once per page instead of up to three times.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        # Without this guard an empty string would be sent to the model and
        # then to gTTS, failing later with a far less helpful message.
        if not abstract_text.strip():
            raise ValueError(
                "No page containing 'Abstract' or 'Introduction' was found "
                "in the PDF."
            )

        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens (e.g. <s>, </s>) so they are neither displayed
        # nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)
        if not summary_sentence:
            raise ValueError("The model produced an empty summary.")

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # NOTE(review): raw audio bytes are handed to gr.Audio; confirm the
        # installed Gradio version accepts bytes as an Audio output value.
        return summary_sentence, speech_bytes.getvalue()

    except Exception as e:
        # Top-level handler boundary: surface the message in the Gradio UI
        # while keeping the original exception chained for debugging.
        raise gr.Error(str(e)) from e

# Gradio UI: a single PDF upload feeds summarize_pdf_abstract, whose two return
# values (summary sentence, audio data) fill the textbox and the audio player
# in that order.
interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[gr.Textbox(label="Summary"), gr.Audio()],
    description="This app summarizes the abstract of a PDF in one sentence.",
)

# Runs at module import (there is no __main__ guard); share=True asks Gradio
# to create a public share link in addition to the local server.
interface.launch(share=True)