Spaces:

Sibinraj
/

dialogue_Text_Summarizer

Sleeping

App Files Files Community

dialogue_Text_Summarizer / app.py

Sibinraj

Update app.py

a758efc verified about 1 year ago

raw

history blame

3.81 kB

	import torch
	import gradio as gr
	from transformers import T5ForConditionalGeneration, T5Tokenizer
	import fitz

	# Hugging Face Hub checkpoint that supplies both the tokenizer and the weights.
	model_path = "Sibinraj/T5-finetuned-dialogue_sumxx"
	# Loaded once at import time; summarize_text() reads these module-level objects.
	tokenizer = T5Tokenizer.from_pretrained(model_path)
	model = T5ForConditionalGeneration.from_pretrained(model_path)

	def extract_text_from_pdf(pdf_path):
	    """
	    Read a PDF and return the text of all of its pages as one string.

	    Args:
	        pdf_path (str): Path to the PDF file.

	    Returns:
	        str: The text of every page, concatenated in page order with no
	            separator inserted between pages.
	    """
	    with fitz.open(pdf_path) as document:
	        # Pull the text out while the document is still open.
	        page_texts = [page.get_text() for page in document]
	    return "".join(page_texts)

	def summarize_text(text, max_length, show_length):
	    """
	    Summarize text with the module-level T5 model and tokenizer.

	    The input is prefixed with "summarize: " and truncated to 512 tokens,
	    so content beyond that point does not influence the summary.

	    Args:
	        text (str): The text to summarize.
	        max_length (int | float): Maximum number of whitespace-separated
	            words in the summary. It is converted with int() because it is
	            used as a slice bound and a UI slider may deliver a float.
	        show_length (bool): Whether to append the summary's word count.

	    Returns:
	        str: The summary, at most max_length words long, optionally
	            followed by a "(Summary length: N words)" line. An empty
	            string when text is None, empty, or only whitespace.
	    """
	    # Nothing to summarize: avoids a TypeError on None and avoids asking the
	    # model to "summarize" the bare task prefix.
	    if not text or not text.strip():
	        return ""

	    word_limit = int(max_length)

	    # Calling the tokenizer (rather than .encode) also returns the attention
	    # mask, so a single input does not need to be padded out to 512 tokens.
	    encoded = tokenizer(
	        "summarize: " + text,
	        return_tensors='pt',
	        max_length=512,
	        truncation=True,
	    )

	    summary_ids = model.generate(
	        input_ids=encoded["input_ids"],
	        attention_mask=encoded["attention_mask"],
	        # generate() counts tokens, not words; the +20 leaves some slack so
	        # the word-level cut below rarely has to remove anything.
	        max_length=word_limit + 20,
	        min_length=10,
	        num_beams=5,
	        no_repeat_ngram_size=2,
	        early_stopping=True,
	    )

	    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	    # max_length is an upper bound: clip summaries that run over it, and
	    # return shorter ones unchanged. Padding a short summary with words from
	    # a second generation pass would append text that is not part of the
	    # summary, so no such padding is done.
	    summary_words = summary.split()
	    if len(summary_words) > word_limit:
	        summary = ' '.join(summary_words[:word_limit])

	    if show_length:
	        summary_length = len(summary.split())
	        summary = f"{summary}\n\n(Summary length: {summary_length} words)"

	    return summary

	def handle_input(input_type, text, pdf, max_length, show_length):
	    """
	    Route the user's input to the summarizer based on the selected type.

	    Args:
	        input_type (str): 'Text' or 'PDF'; None when nothing is selected.
	        text (str): The text input, used when input_type is 'Text'.
	        pdf: The uploaded PDF, used when input_type is 'PDF'. Either a
	            path string or an object exposing the path as `.name`; None
	            when no file was uploaded.
	        max_length (int): The maximum length of the summary.
	        show_length (bool): Whether to show the length of the summary.

	    Returns:
	        str: The summarized text, or a short message explaining which
	            input is missing.
	    """
	    if input_type == 'Text':
	        if not text or not text.strip():
	            return "Please enter some text to summarize."
	        return summarize_text(text, max_length, show_length)

	    if input_type == 'PDF':
	        if not pdf:
	            return "Please upload a PDF file."
	        # The upload may arrive as a plain path string or as a file wrapper
	        # whose path is stored in `.name`; accept both.
	        pdf_path = getattr(pdf, "name", pdf)
	        extracted_text = extract_text_from_pdf(pdf_path)
	        if not extracted_text.strip():
	            # E.g. a scanned PDF that has no text layer.
	            return "No extractable text was found in the PDF."
	        return summarize_text(extracted_text, max_length, show_length)

	    # No (or an unknown) input type: say so instead of returning None.
	    return "Please select an input type (Text or PDF)."

	# Define the Gradio interface. The order of `inputs` must match the
	# parameter order of handle_input(input_type, text, pdf, max_length, show_length).
	interface = gr.Interface(
	    fn=handle_input,
	    inputs=[
	        # Preselect 'Text' so a submission always carries an input type
	        # instead of None.
	        gr.Radio(['Text', 'PDF'], label='Input Type', type='value', value='Text'),
	        gr.Textbox(lines=10, placeholder='Enter Text Here...', label='Input Text', visible=True),
	        gr.File(label='Upload PDF', type='filepath', visible=True),
	        gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
	        gr.Checkbox(label='Show summary length', value=False),
	    ],
	    outputs=gr.Textbox(label='Summarized Text'),
	    title='Text or PDF Summarizer using T5-finetuned-dialogue_sumxx',
	)

	# Launch the Gradio interface
	interface.launch()