Spaces:
Sleeping
Sleeping
import torch | |
import gradio as gr | |
from transformers import T5ForConditionalGeneration, T5Tokenizer | |
import fitz | |
# Hugging Face Hub identifier of the checkpoint used for summarization.
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'

# Both objects are created once, when the module is imported, and are then
# reused by every summarization call.
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all of its pages as a single string.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of every page, concatenated in page order with no
            separator inserted between pages.
    """
    # The context manager closes the document after the pages are consumed.
    with fitz.open(pdf_path) as pdf_document:
        return "".join(pdf_page.get_text() for pdf_page in pdf_document)
def summarize_text(text, max_length, show_length):
    """
    Summarizes the given text using the module-level T5 model.

    Args:
        text (str): The text to summarize.
        max_length (int): The maximum length of the summary, counted in
            whitespace-separated words.
        show_length (bool): Whether to append the summary's word count.

    Returns:
        str: The summary, at most ``max_length`` words long, followed by a
            word-count line when ``show_length`` is true. An empty string
            is returned when there is no text to summarize.
    """
    # Nothing to summarize: skip the model instead of letting it generate
    # from the bare "summarize: " prefix (or crash on None).
    if not text or not text.strip():
        return ""

    # UI widgets may deliver the limit as a float; the word slice below
    # requires an int.
    max_length = int(max_length)

    # Calling the tokenizer (rather than `encode`) also returns the attention
    # mask. A single sequence needs no padding, so the encoder only processes
    # the real tokens (still truncated to 512).
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    summary_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # generate() counts tokens while the limit is in words, so allow
        # some buffer and enforce the word limit after decoding.
        max_length=max_length + 20,
        min_length=10,  # Set a reasonable minimum length
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # `max_length` is an upper bound: trim summaries that are too long, but
    # never pad short ones. Padding with words taken from a second,
    # independent generation would splice unrelated fragments onto the text.
    summary_words = summary.split()
    if len(summary_words) > max_length:
        summary = ' '.join(summary_words[:max_length])

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"
    return summary
def handle_input(input_type, text, pdf, max_length, show_length):
    """
    Handles the user input based on the selected input type.

    Args:
        input_type (str): The type of input ('Text' or 'PDF').
        text (str): The text input.
        pdf: The uploaded PDF, either a path string or an object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.
        max_length (int): The maximum length of the summary.
        show_length (bool): Whether to show the length of the summary.

    Returns:
        str: The summarized text, or a short message explaining which
            input is missing.
    """
    if input_type == 'Text':
        if not text or not text.strip():
            return "Please enter some text to summarize."
        return summarize_text(text, max_length, show_length)

    if input_type == 'PDF':
        # No upload yet: answer with a message instead of failing on
        # attribute access on None.
        if pdf is None:
            return "Please upload a PDF file to summarize."
        # Use `.name` when the upload object provides it; otherwise treat
        # the value itself as the path (plain string).
        pdf_path = getattr(pdf, "name", pdf)
        extracted_text = extract_text_from_pdf(pdf_path)
        if not extracted_text.strip():
            return "No extractable text was found in the PDF."
        return summarize_text(extracted_text, max_length, show_length)

    # Reached when no (or an unknown) input type is selected; return a
    # message rather than an implicit None.
    return "Please select an input type (Text or PDF)."
# Input widgets, listed in the same order as handle_input's parameters.
_input_components = [
    gr.Radio(['Text', 'PDF'], label='Input Type', type='value'),
    gr.Textbox(lines=10, placeholder='Enter Text Here...', label='Input Text', visible=True),
    gr.File(label='Upload PDF', type='filepath', visible=True),
    gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
    gr.Checkbox(label='Show summary length', value=False),
]
_output_component = gr.Textbox(label='Summarized Text')

# Wire the widgets to handle_input.
interface = gr.Interface(
    fn=handle_input,
    inputs=_input_components,
    outputs=_output_component,
    title='Text or PDF Summarizer using T5-finetuned-dialogue_sumxx',
)

# Launch the Gradio interface.
interface.launch()