File size: 2,522 Bytes
8bb40ca
d11e31d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Install dependencies first (run in a shell): pip install gradio PyMuPDF

import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model once, at import time; summarize_pdf
# reads these module-level objects on every call instead of reloading them.
# NOTE(review): "fine-tuned-mt5" is handed to from_pretrained() for both —
# presumably a local checkpoint directory; confirm it is present at runtime.
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Location of the PDF. Either a path (``str`` or
            ``os.PathLike``) or an uploaded-file wrapper that exposes the
            on-disk path through a ``name`` attribute.

    Returns:
        The text of all pages joined together; an empty string when no page
        yields any text.

    Raises:
        Whatever ``fitz.open`` raises when the file is missing or is not a
        readable document.
    """
    import os

    # Depending on the Gradio version/configuration, a File input delivers
    # either a path string or a temp-file object whose ``name`` is the path.
    # Real paths are left untouched (``Path.name`` would be just the basename).
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeatedly re-allocating a growing string per page.
        return "".join(page.get_text() for page in doc)

def _trim_to_last_sentence(text):
    """Cut *text* right after its final period so it does not end mid-sentence.

    Text that contains no period, or that already ends with one, is returned
    unchanged.
    """
    last_period_index = text.rfind('.')
    if last_period_index == -1:
        return text
    return text[:last_period_index + 1]


# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text content of an uploaded PDF with the fine-tuned model.

    Args:
        pdf_file: The uploaded PDF, forwarded as-is to
            ``extract_text_from_pdf``; ``None`` when nothing was uploaded.
        max_summary_length: Upper bound for the generated summary, passed to
            ``generate`` as ``max_length`` (coerced to ``int``).

    Returns:
        The cleaned summary text. When no summary can be produced, a
        human-readable message is returned instead: no file uploaded, no
        extractable text, an empty model output, or the type and message of
        any exception raised along the way.
    """
    # The UI can submit without a file; answer with guidance, not a traceback.
    if pdf_file is None:
        return "Please upload a PDF file."

    # Extraction and tokenization live inside the try as well, so an unreadable
    # PDF is reported the same way as a generation failure.
    try:
        # Extract text from the PDF
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            # e.g. PDFs without a text layer: nothing meaningful to summarize.
            return "No text could be extracted from the PDF."

        # Tokenize the input; truncation caps it at the tokenizer's configured
        # maximum length (if one is set) so long PDFs do not overflow the model.
        tokenized_input = new_tokenizer.encode(
            input_text,
            return_tensors='pt',
            truncation=True,
        )

        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            # Sliders may deliver floats; generation lengths must be integers.
            max_length=int(max_summary_length),
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Drop any sentinel tokens that survived decoding
        cleaned_summary = ' '.join(
            token for token in summary.split()
            if not token.startswith('<extra_id_')
        )

        # Ensure the summary ends with a complete sentence
        cleaned_summary = _trim_to_last_sentence(cleaned_summary)

        return cleaned_summary if cleaned_summary else "No valid summary generated."

    except Exception as e:
        # UI boundary: surface the failure as text for debugging. The type is
        # included because some exceptions stringify to an empty or bare value.
        return f"{type(e).__name__}: {e}"

# Define the Gradio interface. The two input components are listed in the same
# order as summarize_pdf's parameters (pdf_file, max_summary_length).
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        # Slider constructed with 50, 300 and step=10; its value is the summary length cap
        gr.Slider(50, 300, step=10, label="Max summary length")
    ],
    outputs="textbox",  # A textbox for the output summary (or an error/status message)
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)

# Launch the interface with debug mode enabled.
# NOTE(review): this runs whenever the module is imported (there is no
# `if __name__ == "__main__":` guard) — confirm that is intended.
interface.launch(debug=True)