Spaces:
Runtime error
Runtime error
abdulmatinomotoso
committed on
Commit
•
a2e9054
1
Parent(s):
5742cf8
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import gradio as gr
|
5 |
+
import nltk
|
6 |
+
from nltk.tokenize import sent_tokenize
|
7 |
+
nltk.download('punkt')
|
8 |
+
|
9 |
+
def read_in_text(url):
    """Return the full contents of the text file at *url*.

    Args:
        url: path to a local text file (despite the name, this is a
            filesystem path, not a web URL).

    Returns:
        The file contents as a single string.
    """
    # Explicit UTF-8: relying on the platform default encoding (e.g.
    # cp1252 on Windows) can corrupt or fail on non-ASCII article text.
    with open(url, 'r', encoding='utf-8') as file:
        article = file.read()
    return article
|
13 |
+
|
14 |
+
def clean_text(url):
    """Read the article at *url* (a local file path) and normalize it.

    Lower-cases the whole text, then strips byline/timestamp headers of
    the form "by <name> - MM/DD/YY H:MM AM ET" or "by <name> <Month> DD, YYYY".

    Args:
        url: path to the text file to clean (passed through to
            ``read_in_text``).

    Returns:
        The cleaned, lower-cased article text.
    """
    text = read_in_text(url)
    # Converting the text to all lower case.
    text = text.lower()

    # Removing the dates, time and name of author.  Written as a raw
    # string so the \s, \w, \d escapes are real regex escapes instead of
    # invalid Python string escapes (a warning today, an error in future
    # Python versions).  The pattern itself is unchanged.
    text = re.sub(r'(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})', "", text)
    return text
|
22 |
+
|
23 |
+
|
24 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

import torch
# Run on GPU when one is available; models and encoded inputs are moved
# to this device (see my_paraphrase below).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)


# Model 1 ("T5" in the UI): T5-large fine-tuned for diverse,
# high-quality paraphrasing.  Downloaded from the Hugging Face Hub at
# import time, which can take a while on first run.
model_1 = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality").to(device)
tokenizer_1 = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")




# Model 2 ("Pegasus" in the UI): Pegasus fine-tuned for sentence
# paraphrasing.
tokenizer_2 = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_2 = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').to(device)
|
40 |
+
|
41 |
+
# Diverse Beam search
|
42 |
+
def my_paraphrase(sentence, model, tokenizer):
    """Paraphrase a single sentence with the given seq2seq model.

    Generates 5 candidates via diverse beam search (5 beams in 5 groups,
    diversity penalty 0.70) and returns only the first one.

    Args:
        sentence: the input sentence to paraphrase.
        model: a seq2seq model (T5 or Pegasus) already moved to ``device``.
        tokenizer: the tokenizer matching ``model``.

    Returns:
        The decoded paraphrase as a string (T5 output keeps its
        "paraphrasedoutput:" prefix; the caller strips it).
    """
    # NOTE(review): the explicit " </s>" suffix is a legacy T5 idiom — the
    # tokenizer appends EOS itself, so this likely just encodes a stray
    # token.  Kept to preserve the deployed behavior; confirm before removing.
    text = "paraphrase: " + sentence + " </s>"
    # tokenizer(...) is the current API; encode_plus is deprecated and
    # produces the same encoding dict.
    encoding = tokenizer(text, max_length=60, padding=True,
                         return_tensors="pt", truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    model.eval()
    # Pure inference: disable autograd so generation doesn't build a
    # gradient graph (saves memory and time).
    with torch.no_grad():
        diverse_beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=60,
            early_stopping=True,
            num_beams=5,
            num_beam_groups=5,
            num_return_sequences=5,
            diversity_penalty=0.70,
        )
    sent = tokenizer.decode(diverse_beam_outputs[0],
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=True)
    return sent
|
60 |
+
|
61 |
+
def return_output(file, models):
    """Gradio handler: paraphrase every sentence of an uploaded text file.

    Args:
        file: the uploaded file object from ``gr.inputs.File`` (its
            ``.name`` is the temp path on disk).
        models: model choice from the dropdown, ``'T5'`` or ``'Pegasus'``.

    Returns:
        The paraphrased document, sentences re-joined with spaces.

    Raises:
        ValueError: if ``models`` is not one of the two known choices
            (e.g. the dropdown was left at its ``None`` default).  The
            original code died here with an opaque UnboundLocalError.
    """
    docs = file.name
    sentence = clean_text(docs)

    if models == 'T5':
        model = model_1
        tokenizer = tokenizer_1
    elif models == 'Pegasus':
        model = model_2
        tokenizer = tokenizer_2
    else:
        raise ValueError(f"Unknown model choice: {models!r}; expected 'T5' or 'Pegasus'")

    # Paraphrase sentence-by-sentence and stitch the results back together.
    output = " ".join([my_paraphrase(sent, model, tokenizer) for sent in sent_tokenize(sentence)])
    # T5 emits a "paraphrasedoutput:" prefix; Pegasus emits "<n>" as a
    # newline marker — clean both up for display.
    new_output = output.replace('paraphrasedoutput:', "")
    new_output = new_output.replace('.<n>', '.\n')
    return new_output
|
78 |
+
|
79 |
+
# Gradio UI: upload a text file, pick a paraphrasing model, get the
# rewritten text back.
# NOTE(review): gr.inputs / gr.outputs is the legacy (pre-3.x) Gradio
# namespace, and the output label "Summary" is misleading for a
# paraphraser — both are runtime-visible and left unchanged here.
# The dropdown's default=None means the handler is called with
# models=None unless the user picks a value.
demo = gr.Interface(return_output, inputs=[gr.inputs.File(label="File", optional=False),
                    gr.inputs.Dropdown(['Pegasus', 'T5'], type="value", default=None, label="Models", optional=False),],
                    outputs=[gr.outputs.Textbox(label="Summary")])

if __name__ == "__main__":
    # debug=True surfaces handler tracebacks in the Spaces log.
    demo.launch(debug=True)
|