File size: 4,198 Bytes
85ebbdc
 
 
3abff09
8ce4bd9
cc7fa79
8ce4bd9
 
 
 
85ebbdc
 
3abff09
 
 
9c73e01
8ce4bd9
 
9c73e01
8ce4bd9
0cdfeaa
3abff09
 
 
 
 
 
 
 
 
 
 
 
 
0cdfeaa
b178a19
 
7e93398
7e5f9d1
 
 
 
b178a19
7e5f9d1
674f46b
3abff09
 
b178a19
3abff09
 
 
badf9d9
3abff09
1842832
 
 
b178a19
 
1842832
c7b889d
1842832
 
 
 
 
3abff09
8ce4bd9
b178a19
 
 
 
 
 
78a8026
b178a19
 
 
 
 
78a8026
 
 
 
 
 
3abff09
 
8ce4bd9
3abff09
 
8ce4bd9
78a8026
3abff09
 
 
8ce4bd9
 
 
69486d7
8ce4bd9
69486d7
f0137b9
8ce4bd9
 
 
4605b3c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# https://huggingface.co/spaces/amendolajine/OPIT

# Here are the imports
import logging
import gradio as gr
import fitz  # PyMuPDF
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
import scipy.io.wavfile
import numpy as np

# Here is the code

# Initialize logging
# DEBUG level: this app logs the uploaded-file repr and extracted abstract below.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize tokenizers and models
# BART fine-tuned on CNN/DailyMail, used for abstractive summarization of the abstract.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# Bark text-to-speech pipeline used to vocalize the cleaned summary.
# NOTE: all three downloads happen at import time, before the UI starts.
synthesiser = pipeline("text-to-speech", "suno/bark")

def extract_abstract(pdf_bytes):
    """Extract the abstract section from the first page of a PDF.

    Parameters
    ----------
    pdf_bytes : bytes
        Raw contents of a PDF file.

    Returns
    -------
    str
        Text between "abstract" and the following "introduction" on the
        first page, or an explanatory message when either marker is
        missing or when extraction fails.
    """
    try:
        # Context manager closes the document handle even on error
        # (the original leaked the open fitz Document).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            first_page = doc[0].get_text()
        lowered = first_page.lower()
        start_idx = lowered.find("abstract")
        # Look for "introduction" only AFTER the abstract marker, so a
        # stray earlier occurrence (e.g. in a header) cannot invert the
        # slice and return an empty/garbled span.
        end_idx = lowered.find("introduction", start_idx + 1 if start_idx != -1 else 0)
        if start_idx != -1 and end_idx != -1:
            return first_page[start_idx:end_idx].strip()
        return "Abstract not found or 'Introduction' not found in the first page."
    except Exception as e:
        logging.error(f"Error extracting abstract: {e}")
        return "Error in abstract extraction"

def _clean_summary(summary):
    """Post-process a raw BART summary into one spoken-friendly sentence.

    Re-joins words the model split with a hyphen, replaces mid-summary
    periods with the connector " and", lowercases word-initial letters
    (leaving "and" untouched), and capitalizes the first character.
    """
    words = summary.split()
    cleaned = []
    for idx, word in enumerate(words):
        if not word:
            # Slot emptied by a previous hyphen merge — skip it so the
            # final join doesn't contain double spaces (original bug).
            continue
        if '-' in word and idx < len(words) - 1:
            # Merge a hyphen-split word with its continuation.
            word = word.replace('-', '') + words[idx + 1]
            words[idx + 1] = ""
        if '.' in word and idx != len(words) - 1:
            # Turn a mid-summary period into a spoken connector.
            cleaned.append(word.replace('.', '') + ' and')
        else:
            cleaned.append(word)

    text = ' '.join(cleaned)
    if not text:
        # Guard: indexing text[0] below would raise on an empty summary.
        return text
    # Lowercase each word's first letter first, THEN capitalize the
    # sentence start. The original applied these in the reverse order,
    # which immediately undid the capitalization of the first word.
    text = ' '.join(w if w.lower() == 'and' else w[0].lower() + w[1:]
                    for w in text.split())
    return text[0].upper() + text[1:]

def process_text(uploaded_file):
    """Gradio callback: summarize a PDF's abstract and synthesize speech.

    Parameters
    ----------
    uploaded_file : str
        Filesystem path of the uploaded PDF (gr.File supplies a temp path).

    Returns
    -------
    tuple
        (summary text or error message, path to a WAV file or None).
    """
    logging.debug(f"Uploaded file type: {type(uploaded_file)}")
    logging.debug(f"Uploaded file content: {uploaded_file}")

    try:
        with open(uploaded_file, "rb") as file:
            pdf_bytes = file.read()
    except Exception as e:
        logging.error(f"Error reading file from path: {e}")
        return "Error reading PDF file", None

    try:
        abstract_text = extract_abstract(pdf_bytes)
        logging.info(f"Extracted abstract: {abstract_text[:200]}...")
    except Exception as e:
        logging.error(f"Error in abstract extraction: {e}")
        return "Error in processing PDF", None

    try:
        # Summarize the abstract with BART (short beam-search output).
        inputs = tokenizer([abstract_text], max_length=1024, return_tensors='pt', truncation=True, padding="max_length")
        summary_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            pad_token_id=model.config.pad_token_id,
            num_beams=4,
            max_length=45,
            min_length=10,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        final_summary = _clean_summary(summary)

        # Synthesize speech and write 16-bit PCM WAV.
        speech = synthesiser(final_summary, forward_params={"do_sample": True})
        audio_data = speech["audio"].squeeze()
        peak = np.max(np.abs(audio_data)) if audio_data.size else 0
        if peak > 0:
            normalized_audio_data = np.int16(audio_data / peak * 32767)
        else:
            # Silent output: dividing by a zero peak would produce NaNs
            # (the original did not check); emit zeros instead.
            normalized_audio_data = np.int16(audio_data)

        output_file = "temp_output.wav"
        scipy.io.wavfile.write(output_file, rate=speech["sampling_rate"], data=normalized_audio_data)

        return final_summary, output_file
    except Exception as e:
        logging.error(f"Error in summary generation or TTS conversion: {e}")
        return "Error in summary or speech generation", None

# Gradio UI: one file-upload input wired to process_text, which returns
# (summary text, WAV file path) mapped to the text and audio outputs.
iface = gr.Interface(
    fn=process_text,
    inputs=gr.components.File(label="Upload a research PDF containing an abstract"),
    outputs=["text", "audio"],
    title="Summarize an abstract and vocalize it",
    description="Upload a research paper in PDF format to extract, summarize its abstract, and convert the summarization to speech. If the upload doesn't work on the first try, refresh the page (CTRL+F5) and try again."
)

# Start the web UI when executed as a script.
if __name__ == "__main__":
    iface.launch()