File size: 2,225 Bytes
9c73e01
8ce4bd9
9c73e01
8ce4bd9
81398e7
8ce4bd9
 
 
9c73e01
8ce4bd9
9c73e01
8ce4bd9
 
9c73e01
8ce4bd9
9c73e01
 
 
8ce4bd9
 
 
 
 
 
 
 
9c73e01
 
 
 
 
 
 
 
8ce4bd9
9c73e01
 
 
 
8ce4bd9
9c73e01
 
 
8ce4bd9
9c73e01
8ce4bd9
 
 
 
9c73e01
8ce4bd9
 
9c73e01
8ce4bd9
 
 
81398e7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Initial installations handled separately (not in app.py)

# Required imports
import os
import re
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile
from IPython.display import Audio
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline

# The PyMuPDF distribution installs its importable module under the name
# ``fitz``; fall back to it when no module called ``PyMuPDF`` is available.
try:
    import PyMuPDF as fitz
except ImportError:
    import fitz

# Load the summarization and text-to-speech models once, at import time;
# process_text() below reuses these module-level objects on every call.
_BART_CHECKPOINT = 'facebook/bart-large-cnn'

tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)
synthesiser = pipeline(task="text-to-speech", model="suno/bark")

# Message returned (rather than raising) when no abstract can be isolated.
_ABSTRACT_NOT_FOUND = "Abstract not found or '1 Introduction' not found in the first page."

# Case-insensitive heading matchers.  The introduction pattern also swallows a
# leading section number ("1 Introduction", "1. Introduction") so the number
# does not end up as the last token of the extracted abstract.
_ABSTRACT_RE = re.compile(r"abstract", re.IGNORECASE)
_INTRODUCTION_RE = re.compile(
    r"(?:^[ \t]*\d+\.?\s+)?introduction", re.IGNORECASE | re.MULTILINE
)


def _open_pdf(pdf_content):
    """Open *pdf_content* as a PyMuPDF document, whatever form it arrives in."""
    if pdf_content is None:
        raise ValueError("No PDF content provided.")
    if isinstance(pdf_content, (bytes, bytearray)):
        return fitz.open(stream=pdf_content, filetype="pdf")
    if isinstance(pdf_content, (str, os.PathLike)):
        return fitz.open(os.fspath(pdf_content))
    # Uploaded-file wrappers (e.g. temporary file objects) expose their
    # on-disk location through ``.name``; prefer that over reading the stream.
    path = getattr(pdf_content, "name", None)
    if isinstance(path, str) and os.path.exists(path):
        return fitz.open(path)
    if hasattr(pdf_content, "read"):
        return fitz.open(stream=pdf_content.read(), filetype="pdf")
    raise TypeError(
        f"Unsupported PDF input type: {type(pdf_content).__name__}"
    )


def _slice_abstract(page_text):
    """Return the text from "abstract" up to the next "introduction", or None."""
    heading = _ABSTRACT_RE.search(page_text)
    if heading is None:
        return None
    # Only look for the introduction *after* the abstract heading; an earlier
    # occurrence (e.g. in a title) would otherwise produce an empty slice.
    section_end = _INTRODUCTION_RE.search(page_text, heading.end())
    if section_end is None:
        return None
    return page_text[heading.start():section_end.start()].strip()


# Function to extract abstract from PDF
def extract_abstract(pdf_content):
    """Extract the abstract from the first page of a PDF.

    The abstract is taken to be the text starting at the first
    case-insensitive occurrence of "abstract" and ending just before the
    following occurrence of "introduction" (including any section number that
    prefixes it).  The "abstract" heading itself is part of the returned text.

    Args:
        pdf_content: The PDF as raw ``bytes``/``bytearray``, a filesystem path
            (``str`` or ``os.PathLike``), an object whose ``name`` attribute is
            an existing path, or a binary file-like object with ``read()``.

    Returns:
        The stripped abstract text, or a fixed explanatory message when the
        document has no pages or either heading is missing from the first page.

    Raises:
        ValueError: If ``pdf_content`` is ``None``.
        TypeError: If ``pdf_content`` is of an unsupported type.
    """
    doc = _open_pdf(pdf_content)
    try:
        if doc.page_count == 0:
            return _ABSTRACT_NOT_FOUND
        first_page = doc[0].get_text()
    finally:
        # Release the document even if text extraction fails.
        doc.close()

    abstract = _slice_abstract(first_page)
    return _ABSTRACT_NOT_FOUND if abstract is None else abstract

def _to_int16_pcm(audio_data):
    """Peak-normalize a float waveform and convert it to 16-bit PCM samples."""
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0
    # A silent (or empty) clip has no peak to normalize against; dividing by
    # it would produce NaNs, so emit silence of the same shape instead.
    if not peak > 0:
        return np.zeros(audio_data.shape, dtype=np.int16)
    return np.int16(audio_data / peak * 32767)


# Function to process text (summarize and convert to speech)
def process_text(pdf_content):
    """Summarize a PDF's abstract and synthesize the summary as speech.

    Args:
        pdf_content: The uploaded PDF, passed straight to ``extract_abstract``.

    Returns:
        A ``(summary, wav_path)`` tuple: the generated summary text and the
        path of a WAV file containing the spoken summary.  Each call writes a
        new, uniquely named temporary file; it is not deleted here because the
        caller still needs to read it.

    Raises:
        ValueError: If ``pdf_content`` is ``None`` (nothing was uploaded).
    """
    if pdf_content is None:
        raise ValueError("No PDF was uploaded.")

    # NOTE: extract_abstract returns an explanatory message instead of raising
    # when no abstract is found; that message is summarized and spoken as-is.
    abstract_text = extract_abstract(pdf_content)

    # Generate summary
    inputs = tokenizer(
        [abstract_text], max_length=1024, return_tensors='pt', truncation=True
    )
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=40,
        min_length=10,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Convert summary to speech
    speech = synthesiser(summary, forward_params={"do_sample": True})
    normalized_audio_data = _to_int16_pcm(speech["audio"].squeeze())

    # Save audio to a unique temporary file so that concurrent requests do not
    # overwrite each other's output.  The handle is closed before writing
    # because some platforms refuse to reopen a file that is still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    scipy.io.wavfile.write(
        output_file, rate=speech["sampling_rate"], data=normalized_audio_data
    )

    return summary, output_file

# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    # Use the top-level File component: the ``gr.inputs`` namespace is
    # deprecated and no longer exists in current Gradio releases.
    # ``type="binary"`` delivers the upload as raw bytes, which is the form
    # extract_abstract hands to the PDF reader; ``file_types`` limits the
    # upload dialog to PDF files.
    inputs=gr.File(label="Upload PDF", type="binary", file_types=[".pdf"]),
    outputs=["text", "audio"],
    title="Summarization and Text-to-Speech",
    description="Upload a PDF to extract, summarize its abstract, and convert to speech."
)

# Start the web UI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()