Spaces:
Sleeping
Sleeping
File size: 3,808 Bytes
898d64f cbbfdce 21b05f2 a758efc 6b2f148 948b6c3 6b2f148 2a0f339 5d68fbb cbbfdce 599f557 948b6c3 599f557 6b2f148 948b6c3 5797bd3 543501a 5797bd3 948b6c3 a758efc 5797bd3 90c4eb5 5797bd3 543501a 6b2f148 90c4eb5 6b2f148 cbbfdce a758efc 948b6c3 a758efc 948b6c3 a758efc 948b6c3 a758efc 599f557 948b6c3 fd7fbfb a758efc fd7fbfb a758efc 6b2f148 599f557 fd7fbfb a758efc fd7fbfb 948b6c3 fd7fbfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import torch
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz
# Load the model and tokenizer once, at import time; the summarization code
# below reads these module-level objects on every call.
# NOTE(review): model_path looks like a Hugging Face Hub repo id — confirm it
# resolves (loading may download weights on first run).
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)
def extract_text_from_pdf(pdf_path):
    """
    Read every page of a PDF and return the concatenated page text.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of all pages, joined in page order with no separator.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
def summarize_text(text, max_length, show_length):
    """
    Summarizes the given text using a T5 model.

    The input is prefixed with "summarize: ", truncated to 512 tokens and
    decoded with beam search. The decoded summary is then capped at
    ``max_length`` whitespace-separated words. Summaries that come out shorter
    than ``max_length`` are returned as generated; they are not padded.

    Args:
        text (str): The text to summarize.
        max_length (int): The maximum length of the summary, in words.
        show_length (bool): Whether to append the summary's word count.

    Returns:
        str: The summarized text, or a short notice when no text was given.
    """
    # Nothing to summarize: avoid a TypeError on None and a pointless model run.
    if not text or not text.strip():
        return "No text was provided to summarize."

    # UI sliders may deliver a float; slicing and generate() need an int.
    max_length = int(max_length)

    # A single sequence needs no padding; pass the attention mask explicitly
    # rather than relying on generate() to infer it from pad tokens.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    summary_ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length + 20,  # Token budget with some buffer over the word cap
        min_length=10,  # Set a reasonable minimum length
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # generate() limits tokens, not words, so enforce the word cap here.
    summary_words = summary.split()
    if len(summary_words) > max_length:
        summary = ' '.join(summary_words[:max_length])

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"
    return summary
def handle_input(input_type, text, pdf, max_length, show_length):
    """
    Handles the user input based on the selected input type.

    Args:
        input_type (str): The type of input ('Text' or 'PDF').
        text (str): The text input.
        pdf (str or file-like): The uploaded PDF, either as a filesystem path
            (what a file input declared with type='filepath' passes) or as an
            object exposing the path in its ``name`` attribute. None when
            nothing was uploaded.
        max_length (int): The maximum length of the summary.
        show_length (bool): Whether to show the length of the summary.

    Returns:
        str: The summarized text, or a short prompt when the input type is not
        selected or no PDF was uploaded.
    """
    if input_type == 'Text':
        return summarize_text(text, max_length, show_length)

    if input_type == 'PDF':
        if pdf is None:
            return "Please upload a PDF file."
        # A path string has no `.name`; only unwrap file-like objects.
        pdf_path = pdf if isinstance(pdf, str) else pdf.name
        extracted_text = extract_text_from_pdf(pdf_path)
        return summarize_text(extracted_text, max_length, show_length)

    # No (or an unknown) input type selected: say so instead of returning None.
    return "Please select an input type (Text or PDF)."
# Assemble the Gradio UI. The input components are listed in the same order as
# handle_input's parameters: input type, text, PDF, max length, show length.
_input_components = [
    gr.Radio(['Text', 'PDF'], label='Input Type', type='value'),
    gr.Textbox(lines=10, placeholder='Enter Text Here...', label='Input Text', visible=True),
    gr.File(label='Upload PDF', type='filepath', visible=True),
    gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
    gr.Checkbox(label='Show summary length', value=False),
]
_output_component = gr.Textbox(label='Summarized Text')

interface = gr.Interface(
    fn=handle_input,
    inputs=_input_components,
    outputs=_output_component,
    title='Text or PDF Summarizer using T5-finetuned-dialogue_sumxx',
)

# Start serving the app.
interface.launch()
|