# Spaces: Runtime error
import pandas as pd | |
import numpy as np | |
import re | |
import gradio as gr | |
import nltk | |
from nltk.tokenize import sent_tokenize | |
# Fetch the sentence-tokenizer data that sent_tokenize loads at runtime.
# Older NLTK releases read the pickled "punkt" models, while newer releases
# read the "punkt_tab" resource instead, so both are requested here.
nltk.download('punkt')
nltk.download('punkt_tab')
def read_in_text(url, encoding="utf-8"):
    """Return the full contents of a text file as one string.

    Args:
        url: Path of the file to read. Despite the name, the value is passed
            straight to ``open``, so it must be a filesystem path.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(url, 'r', encoding=encoding) as file:
        return file.read()
def clean_text(url):
    """Lower-case the text and strip author/date bylines from it.

    Args:
        url: The raw article text. Despite the name, this is the text itself,
            not a location; it is lower-cased and searched directly.

    Returns:
        The lower-cased text with every byline match removed. Two byline
        layouts are recognised (shown after lower-casing):

        * ``by <names> - dd/dd/dd h:mm xx xx``
          e.g. ``by jane doe - 01/02/21 10:30 am et``
        * ``by <names> dd, yyyy``
          e.g. ``by jane doe, march 05, 2021``
    """
    # converting the text to all lower case
    text = url.lower()
    # removing the dates, time and name of author
    # Raw string: the pattern is full of regex escapes (\s, \w, \d) that are
    # invalid escape sequences in an ordinary string literal.
    # The pattern is unanchored, so "by" is also matched inside longer words.
    text = re.sub(
        r'(by[\s\w,|]+ - \d\d\/\d\d\/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})',
        "",
        text,
    )
    return text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
import torch | |
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device ", device)

# Model 1: T5 paraphraser. It is placed on `device` just like model 2, so that
# both models live on the same device as the input tensors built for them;
# leaving it on the CPU while inputs go to CUDA fails at generation time.
tokenizer_1 = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
model_1 = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality").to(device)

# Model 2: Pegasus paraphraser.
tokenizer_2 = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_2 = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').to(device)
# Diverse Beam search | |
def my_paraphrase(sentence, model, tokenizer):
    """Paraphrase a single sentence with diverse beam search.

    Args:
        sentence: The sentence to paraphrase.
        model: A sequence-to-sequence model providing ``eval()``,
            ``generate()`` and a ``device`` attribute.
        tokenizer: The tokenizer paired with ``model``; it is called to encode
            the prompt and its ``decode`` method turns output ids into text.

    Returns:
        The decoded text of the first sequence returned by ``generate``.
    """
    # Prompt format: task prefix, the sentence, then an explicit end marker.
    text = "paraphrase: " + sentence + " </s>"
    encoding = tokenizer(text, padding=True, return_tensors="pt", truncation=True)

    # Send the inputs to wherever this particular model lives rather than to a
    # module-wide device, so model and inputs can never end up on different
    # devices.
    target_device = model.device
    input_ids = encoding["input_ids"].to(target_device)
    attention_mask = encoding["attention_mask"].to(target_device)

    model.eval()
    diverse_beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        early_stopping=True,
        num_beams=5,
        num_beam_groups=5,
        num_return_sequences=5,
        diversity_penalty=0.70,
    )
    # Five candidates come back; only the first one is used.
    sent = tokenizer.decode(
        diverse_beam_outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return sent
def return_output(file, models):
    """Clean the input text and paraphrase it sentence by sentence.

    Args:
        file: The text to paraphrase (the raw text itself, not a path).
        models: Which paraphraser to use, either ``'T5'`` or ``'Pegasus'``.

    Returns:
        The paraphrased sentences joined with single spaces, with the
        ``paraphrasedoutput:`` marker removed and ``.<n>`` turned into a
        period followed by a newline. Empty input yields an empty string.

    Raises:
        ValueError: If ``models`` is not one of the supported names (for
            example when no model has been selected).
    """
    # Validate up front so a missing selection gives a clear message instead
    # of an unbound-variable error further down.
    if models not in ('T5', 'Pegasus'):
        raise ValueError(f"Unknown model {models!r}; choose 'Pegasus' or 'T5'.")

    # Nothing to paraphrase.
    if not file:
        return ""

    if models == 'T5':
        model, tokenizer = model_1, tokenizer_1
    else:
        model, tokenizer = model_2, tokenizer_2

    sentence = clean_text(file)
    output = " ".join(
        my_paraphrase(sent, model, tokenizer) for sent in sent_tokenize(sentence)
    )
    # Strip the marker text and convert the "<n>" line-break token.
    new_output = output.replace('paraphrasedoutput:', "")
    new_output = new_output.replace('.<n>', '.\n')
    return new_output
# Gradio UI: a free-text box plus a model selector feed return_output, and the
# result is shown in a single text box.
# NOTE(review): gr.inputs / gr.outputs and the `optional` / `default` keywords
# look like the legacy component API — confirm the pinned Gradio version still
# provides them.
demo = gr.Interface(
    fn=return_output,
    inputs=[
        gr.inputs.Textbox(label="Text", optional=False),
        gr.inputs.Dropdown(
            ['Pegasus', 'T5'],
            type="value",
            default=None,
            label="Models",
            optional=False,
        ),
    ],
    outputs=[gr.outputs.Textbox(label="Summary")],
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)