Spaces:
Runtime error
Runtime error
# Dependencies (run in a shell, not in this file): pip install gradio PyMuPDF
import gradio as gr | |
from transformers import T5Tokenizer, MT5ForConditionalGeneration | |
import fitz # PyMuPDF | |
# Checkpoint identifier of the fine-tuned mT5 model; the tokenizer and the
# model weights are both loaded from it once, at import time.
model_name = "fine-tuned-mt5"

new_tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=True,
)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        pdf_file: Path to the PDF (``str`` or path-like), or an upload
            object that exposes the path through a ``name`` attribute
            (the temp-file wrapper some Gradio versions pass to the
            callback). ``None`` means "nothing uploaded".

    Returns:
        The text of all pages joined in page order, or ``""`` when
        ``pdf_file`` is ``None`` or the document has no extractable text.
    """
    if pdf_file is None:
        return ""

    # Real paths pass through untouched; anything else (e.g. a temp-file
    # wrapper) is resolved to the path stored in its ``name`` attribute.
    # Path-like objects are checked first because ``pathlib.Path.name`` is
    # only the final path component, not the full path.
    if isinstance(pdf_file, (str, bytes)) or hasattr(pdf_file, "__fspath__"):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string re-allocation on large documents.
        return "".join(page.get_text() for page in doc)
# Upper bound on the number of tokens fed to the encoder, so that a long PDF
# cannot exhaust memory during beam search.
# NOTE(review): 512 is assumed to match the fine-tuning input length —
# confirm against the training configuration.
_MAX_INPUT_TOKENS = 512


def _trim_to_last_sentence(text):
    """Cut *text* after its last period; return it unchanged if it has none."""
    last_period = text.rfind('.')
    if last_period == -1:
        return text
    return text[:last_period + 1]


def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text of an uploaded PDF with the fine-tuned model.

    Args:
        pdf_file: The uploaded PDF as passed by the Gradio ``File`` input,
            or ``None`` when nothing was uploaded.
        max_summary_length: Maximum length of the generated summary in
            tokens; coerced to ``int`` because UI sliders may deliver floats.

    Returns:
        The cleaned summary, trimmed to end on a complete sentence when it
        contains a period. A human-readable message is returned instead
        when no file was uploaded, the PDF holds no extractable text, the
        model produced nothing usable, or an error occurred (in which case
        the message is the exception text).
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    try:
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            # Scanned/image-only PDFs yield no text; generating from an
            # empty prompt would only produce noise.
            return "No extractable text found in the PDF."

        max_length = int(max_summary_length)

        tokenized_input = new_tokenizer.encode(
            input_text,
            return_tensors='pt',
            truncation=True,
            max_length=_MAX_INPUT_TOKENS,
        )

        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_length,
            # Keep min_length satisfiable when a caller asks for a very
            # short summary.
            min_length=min(30, max_length),
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2,
        )

        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: surface the error text in the output box instead of
        # crashing the callback.
        return str(e)

    # Drop any sentinel tokens that survived decoding and collapse whitespace.
    cleaned_summary = ' '.join(
        token for token in summary.split() if not token.startswith('<extra_id_')
    )
    cleaned_summary = _trim_to_last_sentence(cleaned_summary)

    return cleaned_summary or "No valid summary generated."
# Gradio UI: a PDF upload plus a summary-length slider wired to
# summarize_pdf, with the result shown in a text box.
interface = gr.Interface(
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content.",
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(minimum=50, maximum=300, step=10, label="Max summary length"),
    ],
    outputs="textbox",
)

# Start the app with debug mode enabled.
interface.launch(debug=True)