# Hugging Face Space: sentence-by-sentence text paraphraser (T5 / Pegasus)
# served through a Gradio interface.
import pandas as pd
import numpy as np
import re
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
# Sentence-splitter models for sent_tokenize. Newer nltk releases load
# 'punkt_tab' while older ones use 'punkt' — fetch both, quietly, so the
# app works either way without spamming the logs. `quiet=True` makes an
# unknown-package name a no-op (returns False) rather than printing errors.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
def read_in_text(url):
    """Read and return the full contents of the text file at *url* (a local path).

    Args:
        url: Filesystem path to a text file.

    Returns:
        The file's contents as a single string.
    """
    # Explicit encoding: the platform default may not be UTF-8 (e.g. cp1252
    # on Windows), which would mangle or reject non-ASCII articles.
    with open(url, 'r', encoding='utf-8') as file:
        return file.read()
# Byline/dateline pattern, compiled once. Two alternatives:
#   1. "by <name> - MM/DD/YY hh:mm AM ET"-style credits
#   2. "by <name> <Month> DD, YYYY"-style credits
# Raw string: the original used a plain string, so '\s', '\d' etc. were
# invalid escape sequences (SyntaxWarning on modern Python).
_BYLINE_RE = re.compile(
    r'(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})'
    r'|(by[\s\w|,]+\d\d,\s\d{4})'
)

def clean_text(url):
    """Lower-case *url* (the article text, despite the name) and strip bylines.

    Args:
        url: Raw article text. NOTE(review): the parameter is named `url`
            but callers pass the text itself; name kept for compatibility.

    Returns:
        Lower-cased text with author/date credit lines removed.
    """
    text = url.lower()
    return _BYLINE_RE.sub("", text)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Run on GPU when available; everything (models AND input tensors) must
# live on the same device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)

# Model 1: T5-based paraphraser used with diverse beam search.
tokenizer_1 = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
# BUG FIX: model_1 was never moved to `device`, while my_paraphrase() sends
# its input tensors there — on a CUDA machine the T5 path raised a
# device-mismatch RuntimeError.
model_1 = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality").to(device)

# Model 2: Pegasus paraphraser (already placed on `device`).
tokenizer_2 = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_2 = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').to(device)
# Diverse Beam search
def my_paraphrase(sentence, model, tokenizer):
    """Paraphrase a single *sentence* with diverse beam search.

    Args:
        sentence: The sentence to paraphrase.
        model: A seq2seq transformers model (T5 or Pegasus variant).
        tokenizer: The tokenizer matching *model*.

    Returns:
        The top-ranked decoded paraphrase (special tokens stripped).
    """
    # " </s>" is appended manually to match the training format of the
    # T5 paraphraser; harmless for Pegasus since it is truncated/ignored.
    text = "paraphrase: " + sentence + " </s>"
    encoding = tokenizer.encode_plus(text, padding=True, return_tensors="pt", truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    model.eval()
    # FIX: inference only — without no_grad() every generate() call built an
    # autograd graph, wasting memory and time for no benefit.
    with torch.no_grad():
        diverse_beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=512,
            early_stopping=True,
            num_beams=5,
            num_beam_groups=5,
            num_return_sequences=5,
            diversity_penalty=0.70,
        )
    # 5 candidates are generated but only the highest-scoring one is kept.
    return tokenizer.decode(
        diverse_beam_outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
def return_output(file, models):
    """Paraphrase *file* sentence-by-sentence with the selected model.

    Args:
        file: The input text (the Gradio Textbox value).
        models: Model choice — 'T5' or 'Pegasus' (the Dropdown value).

    Returns:
        The paraphrased text, one paraphrase per input sentence.

    Raises:
        ValueError: If *models* is not 'T5' or 'Pegasus' (e.g. the dropdown
            was left at its None default).
    """
    # Select the model first: previously an unknown choice fell through the
    # if/elif and crashed later with UnboundLocalError on `model`.
    if models == 'T5':
        model, tokenizer = model_1, tokenizer_1
    elif models == 'Pegasus':
        model, tokenizer = model_2, tokenizer_2
    else:
        raise ValueError("Please choose a model: 'T5' or 'Pegasus'.")

    sentence = clean_text(file)
    output = " ".join(
        my_paraphrase(sent, model, tokenizer) for sent in sent_tokenize(sentence)
    )
    # Strip the T5 paraphraser's answer prefix and restore newlines that
    # Pegasus emits as the literal token '<n>'.
    new_output = output.replace('paraphrasedoutput:', "")
    return new_output.replace('.<n>', '.\n')
# gr.inputs / gr.outputs (and their optional=/default= kwargs) were removed
# in Gradio 3 — use the top-level components instead; same UI, current API.
demo = gr.Interface(
    fn=return_output,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Dropdown(['Pegasus', 'T5'], label="Models"),
    ],
    outputs=gr.Textbox(label="Summary"),
)

if __name__ == "__main__":
    demo.launch(debug=True)