Spaces:
Sleeping
Sleeping
File size: 2,406 Bytes
898d64f cbbfdce 21b05f2 599f557 6b2f148 2a0f339 5d68fbb cbbfdce 599f557 6b2f148 5797bd3 543501a 5797bd3 599f557 5797bd3 90c4eb5 5797bd3 543501a 6b2f148 90c4eb5 6b2f148 cbbfdce 599f557 fd7fbfb 599f557 fd7fbfb 599f557 6b2f148 599f557 fd7fbfb 599f557 fd7fbfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import torch
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz # PyMuPDF
# Checkpoint identifier handed to both from_pretrained calls below
# (looks like a Hugging Face Hub repo id - confirm it is not a local path).
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'
# The model and its tokenizer are loaded at module top level, so this happens
# when the script starts; summarize_text reads both as module-level globals.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Pages are read in document order and their text is concatenated with no
    separator added between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
def _generate_summary(prompt, max_tokens, min_tokens):
    """Run beam-search generation on *prompt* and return the decoded text."""
    # Tokenize without padding and pass the attention mask explicitly instead
    # of padding every input to 512 tokens and leaving the mask implicit.
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    output_ids = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_tokens,
        min_length=min_tokens,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


def summarize_text(text, max_length, show_length):
    """Summarize *text* into roughly *max_length* whitespace-separated words.

    Args:
        text: Source text to summarize. Only the first 512 tokens of the
            prompt are fed to the model (longer input is truncated).
        max_length: Target word count for the summary. Coerced to ``int``
            because UI sliders may deliver the value as a float.
        show_length: When true, append a "(Summary length: N words)" footer.

    Returns:
        The summary string, or a short explanatory message when *text* is
        empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Nothing to summarize (e.g. a PDF without a text layer); skip the
        # model instead of generating output from the bare prompt prefix.
        return "No text could be extracted from the document."

    # Slicing and generate() both require an integer length.
    max_length = int(max_length)

    # Allow some token headroom above the word target; the result is trimmed
    # to the word limit below.
    summary = _generate_summary("summarize: " + text, max_length + 20, 10)
    summary_words = summary.split()

    if len(summary_words) > max_length:
        summary = ' '.join(summary_words[:max_length])
    elif len(summary_words) < max_length:
        # Top up a short summary: run a second pass over the summary itself,
        # forced to exactly max_length tokens, and append the words that lie
        # beyond the current word count.
        # NOTE(review): the appended words come from a separate generation
        # over the summary (without the "summarize: " prefix), not from the
        # source text, so the spliced tail may not read coherently. Consider
        # dropping the top-up and treating max_length purely as an upper bound.
        padded = _generate_summary(' '.join(summary_words), max_length, max_length)
        extra_words = padded.split()[len(summary_words):max_length]
        if extra_words:  # avoid appending a stray trailing space
            summary += ' ' + ' '.join(extra_words)

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"
    return summary
def handle_pdf(pdf, max_length, show_length):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        pdf: The uploaded file: either an object exposing the file path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.
        max_length: Target summary length in words, forwarded to
            ``summarize_text``.
        show_length: Whether to append the word-count footer, forwarded to
            ``summarize_text``.

    Returns:
        The summary text, or a prompt to upload a file when *pdf* is ``None``.
    """
    if pdf is None:
        # Submitting the form without a file would otherwise fail with
        # AttributeError on ``pdf.name``.
        return "Please upload a PDF file."
    # A str has no ``.name`` attribute, so a plain path falls through unchanged.
    pdf_path = getattr(pdf, 'name', pdf)
    text = extract_text_from_pdf(pdf_path)
    return summarize_text(text, max_length, show_length)
# Input widgets, listed in the positional order of handle_pdf's parameters
# (pdf, max_length, show_length), followed by the output widget.
pdf_upload = gr.File(label='Upload PDF', type='file')
max_length_slider = gr.Slider(minimum=10, maximum=150, step=1, label='Max Length')
show_length_checkbox = gr.Checkbox(label='Show summary length', value=False)
summary_output = gr.Textbox(label='Summarized Text')

# Wire the widgets to handle_pdf.
interface = gr.Interface(
    fn=handle_pdf,
    inputs=[pdf_upload, max_length_slider, show_length_checkbox],
    outputs=summary_output,
    title='PDF Text Summarizer using T5-finetuned-dialogue_sumxx',
)

# Start the Gradio app.
interface.launch()
|