File size: 3,087 Bytes
9b3a68b
8ce4bd9
cc7fa79
8ce4bd9
 
 
 
9c73e01
8ce4bd9
 
9c73e01
8ce4bd9
9c73e01
0cdfeaa
520e96b
55e862e
8ce4bd9
 
 
 
 
 
8328d05
8ce4bd9
9c73e01
0cdfeaa
674f46b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cdfeaa
badf9d9
674f46b
 
 
 
badf9d9
674f46b
 
0cdfeaa
9c73e01
 
 
 
 
8ce4bd9
9c73e01
 
 
 
8ce4bd9
9c73e01
 
 
8ce4bd9
9c73e01
8ce4bd9
 
 
 
b303ff0
8ce4bd9
 
9c73e01
8ce4bd9
 
 
81398e7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#Required imports
import gradio as gr
import fitz  # PyMuPDF
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
import scipy.io.wavfile
import numpy as np

# Load the pretrained models once, when the module is imported.
# The BART tokenizer and model share one checkpoint name; Bark handles speech.
BART_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
synthesiser = pipeline(task="text-to-speech", model="suno/bark")

# Function to extract abstract from PDF
def extract_abstract(pdf_bytes):
    """Return the abstract found on the first page of a PDF.

    The abstract is taken to be the text from the first occurrence of
    "abstract" up to (not including) the next occurrence of "introduction"
    on the first page; both words are matched case-insensitively.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The stripped abstract text, or a fixed explanatory message when the
        document has no pages or either marker cannot be located.
    """
    not_found = "Abstract not found or 'Introduction' not found in the first page."

    # The context manager closes the document even if text extraction raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        if len(doc) == 0:
            return not_found
        first_page = doc[0].get_text()

    lowered = first_page.lower()
    start_idx = lowered.find("abstract")
    if start_idx == -1:
        return not_found

    # Search only after the abstract heading: an earlier "introduction"
    # (e.g. in a title or header) would otherwise produce an empty slice.
    end_idx = lowered.find("introduction", start_idx)
    if end_idx == -1:
        return not_found

    return first_page[start_idx:end_idx].strip()

# Helper to turn the value passed by the Gradio File input into raw bytes
def _read_uploaded_bytes(uploaded_file):
    """Return the raw bytes of an uploaded file, or None if unavailable.

    The value a Gradio ``File`` input hands to the callback differs between
    Gradio versions and component settings, so several shapes are accepted:
    raw bytes, a filesystem path, an object exposing its path as ``.name``
    (such as a temporary-file wrapper), or a dict carrying bytes under
    ``"data"`` or a path under ``"path"`` / ``"name"``.
    """
    import os  # local import: only this helper needs it

    if uploaded_file is None:
        return None
    if isinstance(uploaded_file, (bytes, bytearray)):
        return bytes(uploaded_file)

    if isinstance(uploaded_file, dict):
        data = uploaded_file.get("data")
        if isinstance(data, (bytes, bytearray)):
            return bytes(data)
        path = uploaded_file.get("path") or uploaded_file.get("name")
    elif isinstance(uploaded_file, (str, os.PathLike)):
        path = uploaded_file
    else:
        path = getattr(uploaded_file, "name", None)

    if not path or not isinstance(path, (str, os.PathLike)):
        return None
    try:
        with open(path, "rb") as handle:
            return handle.read()
    except OSError:
        # Unreadable or missing file: let the caller report it to the user.
        return None


# Function to process text (summarize and convert to speech)
def process_text(uploaded_file):
    """Summarize the abstract of an uploaded PDF and convert it to speech.

    Args:
        uploaded_file: The value supplied by the Gradio ``File`` input.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the generated
        summary and ``audio_path`` is the WAV file holding the spoken summary.
        When the upload cannot be read or no abstract is found, ``text`` is an
        explanatory message and ``audio_path`` is ``None``.
    """
    pdf_bytes = _read_uploaded_bytes(uploaded_file)
    if not pdf_bytes:
        return "File content could not be retrieved", None

    abstract_text = extract_abstract(pdf_bytes)

    # extract_abstract signals failure with this message instead of raising;
    # skip the expensive models rather than summarizing an error message.
    not_found = "Abstract not found or 'Introduction' not found in the first page."
    if not abstract_text or abstract_text == not_found:
        return not_found, None

    # Generate summary
    inputs = tokenizer(
        [abstract_text], max_length=1024, return_tensors="pt", truncation=True
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=4,
        max_length=40,
        min_length=10,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Convert summary to speech
    speech = synthesiser(summary, forward_params={"do_sample": True})
    audio_data = np.asarray(speech["audio"]).squeeze()

    # Scale to the full 16-bit range; silent output has a zero peak, so it is
    # written as-is to avoid dividing by zero.
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0
    if peak > 0:
        audio_data = audio_data / peak * 32767
    normalized_audio_data = np.int16(audio_data)

    # Save audio to temporary file
    output_file = "temp_output.wav"
    scipy.io.wavfile.write(
        output_file, rate=speech["sampling_rate"], data=normalized_audio_data
    )

    return summary, output_file

# Gradio UI: one file upload in, the result text and an audio clip out.
iface = gr.Interface(
    process_text,
    gr.components.File(label="Upload PDF"),
    ["text", "audio"],
    title="Summarization and Text-to-Speech",
    description="Upload a PDF to extract, summarize its abstract, and convert to speech.",
)

# Launch the interface only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()