Sibinraj's picture
Update app.py
a758efc verified
raw
history blame
3.81 kB
import torch
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz
# Load the model and tokenizer once at import time; summarize_text() reads
# these module-level instances on every call.
# NOTE(review): model_path looks like a Hugging Face Hub repo id — confirm.
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages as a single string.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of every page, concatenated in the order the pages
            are iterated.
    """
    # The document is closed as soon as every page's text has been collected.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
def _generate_text(prompt, max_tokens, min_tokens, input_token_limit=None):
    """
    Run beam-search generation on a prompt and decode the best sequence.

    Args:
        prompt (str): Text fed to the model.
        max_tokens (int): Upper bound on the generated length, in tokens.
        min_tokens (int): Lower bound on the generated length, in tokens.
        input_token_limit (int | None): When given, the prompt is truncated
            to this many tokens; when None, the prompt is not truncated.

    Returns:
        str: The decoded output with special tokens removed.
    """
    tokenizer_options = {'return_tensors': 'pt'}
    if input_token_limit is not None:
        tokenizer_options.update(max_length=input_token_limit, truncation=True)

    # Calling the tokenizer (instead of .encode) also returns the attention
    # mask, which is handed to generate() explicitly. A single sequence needs
    # no padding, so none is requested.
    encoded = tokenizer(prompt, **tokenizer_options)
    output_ids = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_tokens,
        min_length=min_tokens,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


def summarize_text(text, max_length, show_length):
    """
    Summarizes the given text using a T5 model.

    The summary is then adjusted towards ``max_length`` whitespace-separated
    words: longer summaries are cut, shorter ones are extended with words
    taken from a second generation pass over the summary itself.

    Note that ``max_length`` is used in two units: as a token bound for
    generation and as a word count for the post-processing.

    Args:
        text (str): The text to summarize.
        max_length (int): The maximum length of the summary. Float values
            are truncated to an integer.
        show_length (bool): Whether to show the length of the summary.

    Returns:
        str: The summarized text, optionally followed by a line reporting
            its word count.
    """
    # UI sliders may deliver floats; slicing and generate() need an int.
    max_length = int(max_length)

    summary = _generate_text(
        "summarize: " + text,
        max_tokens=max_length + 20,  # Allow some buffer
        min_tokens=10,  # Set a reasonable minimum length
        input_token_limit=512,
    )
    summary_words = summary.split()

    if len(summary_words) > max_length:
        summary = ' '.join(summary_words[:max_length])
    elif len(summary_words) < max_length:
        # Second pass: force a generation of exactly max_length tokens from
        # the summary, then borrow the words beyond the current word count.
        padding_text = _generate_text(
            ' '.join(summary_words),
            max_tokens=max_length,
            min_tokens=max_length,
        )
        extra_words = padding_text.split()[len(summary_words):max_length]
        # Only touch the summary when there is something to append, so no
        # stray trailing space is left behind.
        if extra_words:
            summary = ' '.join(summary_words + extra_words)

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"
    return summary
def handle_input(input_type, text, pdf, max_length, show_length):
    """
    Handles the user input based on the selected input type.

    Missing or unusable input is reported as a short message instead of
    raising or returning None, so the output box always shows something
    meaningful.

    Args:
        input_type (str): The type of input ('Text' or 'PDF').
        text (str): The text input.
        pdf (str | UploadedFile | None): The uploaded PDF, either as a path
            string or as an object exposing the path via ``.name``.
        max_length (int): The maximum length of the summary.
        show_length (bool): Whether to show the length of the summary.

    Returns:
        str: The summarized text, or a message explaining what input is
            missing.
    """
    if input_type == 'Text':
        if not text or not text.strip():
            return "Please enter some text to summarize."
        return summarize_text(text, max_length, show_length)

    if input_type == 'PDF':
        if pdf is None:
            return "Please upload a PDF file."
        # Accept both a plain path string and a file object with `.name`.
        pdf_path = getattr(pdf, 'name', pdf)
        extracted_text = extract_text_from_pdf(pdf_path)
        if not extracted_text.strip():
            return "No extractable text was found in the PDF."
        return summarize_text(extracted_text, max_length, show_length)

    # No (or an unknown) input type was selected.
    return "Please select an input type (Text or PDF)."
# Build the Gradio UI: five inputs, in the same order as handle_input's
# parameters, and a single text box for the result.
interface = gr.Interface(
    title="Text or PDF Summarizer using T5-finetuned-dialogue_sumxx",
    fn=handle_input,
    inputs=[
        gr.Radio(["Text", "PDF"], type="value", label="Input Type"),
        gr.Textbox(
            label="Input Text",
            placeholder="Enter Text Here...",
            lines=10,
            visible=True,
        ),
        gr.File(type="filepath", label="Upload PDF", visible=True),
        gr.Slider(label="Max Length", minimum=10, maximum=150, step=1),
        gr.Checkbox(value=False, label="Show summary length"),
    ],
    outputs=gr.Textbox(label="Summarized Text"),
)

# Start serving the app.
interface.launch()