Summarizer / app.py
eevaw's picture
Update app.py
8bb40ca verified
raw
history blame
2.52 kB
# Dependencies (run in a shell, not in Python): pip install gradio PyMuPDF
import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz # PyMuPDF
# Identifier of the fine-tuned checkpoint; the tokenizer and the model below
# are both loaded from this same name.
model_name = "fine-tuned-mt5"

# Tokenizer for the checkpoint, with clean-up of tokenization spaces enabled.
new_tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=True,
)

# Conditional-generation model used to produce the summaries.
new_model = MT5ForConditionalGeneration.from_pretrained(
    model_name,
)
# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF to read, passed unchanged as the first argument of
            ``fitz.open`` (in this file it is the value delivered by the
            Gradio file input).

    Returns:
        The text of all pages joined together; no separator is inserted
        between pages.
    """
    # The context manager closes the document even if reading a page fails.
    with fitz.open(pdf_file) as doc:
        # A single join builds the result in one pass instead of re-copying
        # the accumulated string for every page, as repeated ``+=`` would.
        return "".join(page.get_text() for page in doc)
# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text of an uploaded PDF with the module-level model.

    Args:
        pdf_file: The uploaded PDF as delivered by the file input; it is
            handed unchanged to ``extract_text_from_pdf``. ``None`` is
            treated as "nothing uploaded".
        max_summary_length: Upper bound passed to ``generate`` as
            ``max_length`` (the slider value); coerced to ``int``.

    Returns:
        The cleaned summary text. When no summary can be produced, a short
        human-readable message is returned instead (missing file, no
        extractable text, empty result, or a failure during extraction or
        generation), so the UI textbox always receives a string.
    """
    # Without this guard a missing upload would be summarized as empty text
    # instead of being reported.
    if pdf_file is None:
        return "Please upload a PDF file."

    try:
        # Extract text from the PDF
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            # e.g. a scanned PDF without a text layer: nothing to summarize.
            return "No text could be extracted from the PDF."

        # The UI may deliver the slider value as a float; generate() needs an
        # int, and min_length must never exceed max_length.
        max_length = int(max_summary_length)
        min_length = min(30, max_length)

        # Tokenize the input.
        # NOTE(review): the input is not truncated, so very long PDFs are fed
        # to the model in full - confirm this is intended.
        tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')

        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_length,
            min_length=min_length,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2,
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the
        # request, and label it so it cannot be mistaken for a summary.
        return f"Summarization failed: {e}"

    # Drop any leftover sentinel tokens and normalize whitespace.
    cleaned_summary = ' '.join(
        token for token in summary.split() if not token.startswith('<extra_id_')
    )

    # Ensure the summary ends with a complete sentence: cut after the last
    # sentence terminator. '!' and '?' count too, so a summary that already
    # ends with one of them is not cut back to an earlier period. When no
    # terminator exists the text is kept as it is.
    last_sentence_end = max(cleaned_summary.rfind(mark) for mark in '.!?')
    if last_sentence_end != -1:
        cleaned_summary = cleaned_summary[:last_sentence_end + 1]

    return cleaned_summary if cleaned_summary else "No valid summary generated."
# Build the Gradio UI: a PDF upload and a length slider feed summarize_pdf,
# and the returned text is shown in a textbox.
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(minimum=50, maximum=300, step=10, label="Max summary length"),
    ],
    outputs="textbox",
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content.",
)

# Start the app with debug mode enabled.
interface.launch(debug=True)