abdulmatinomotoso committed on
Commit
a2e9054
1 Parent(s): 5742cf8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Data handling and text utilities.
import pandas as pd
import numpy as np
import re

# Web UI and NLP tooling.
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-splitter model so sent_tokenize works at runtime.
nltk.download('punkt')
9
def read_in_text(url):
    """Read the whole text file at *url* (a local file path) and return its contents.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale-dependent default encoding, which differs on Windows.
    """
    with open(url, 'r', encoding='utf-8') as file:
        article = file.read()
    return article
13
+
14
def clean_text(url):
    """Load the article at *url* and normalise it for paraphrasing.

    Lower-cases the whole text, then strips byline/date headers such as
    "by john doe - 01/02/21 10:30 am et" or "by john doe january 02, 2021".
    """
    text = read_in_text(url)
    # converting the text to all lower case
    text = text.lower()

    # removing the dates, time and name of author. Raw string: the original
    # non-raw pattern contained invalid escape sequences (\s, \d) that raise
    # DeprecationWarning/SyntaxWarning on modern Python.
    text = re.sub(r'(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})', "", text)
    return text
22
+
23
+
24
# ---------------------------------------------------------------------------
# Model / tokenizer setup (runs once at import time).
# ---------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

import torch

# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)

# Paraphraser #1: T5-large fine-tuned for diverse, high-quality paraphrases.
model_1 = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality").to(device)
tokenizer_1 = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")

# Paraphraser #2: Pegasus fine-tuned for paraphrasing.
tokenizer_2 = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_2 = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').to(device)
40
+
41
+ # Diverse Beam search
42
# Diverse Beam search
def my_paraphrase(sentence, model, tokenizer):
    """Paraphrase a single *sentence* with the given seq2seq model/tokenizer.

    Runs diverse beam search (5 beams split into 5 groups, diversity
    penalty 0.70) and returns the top-ranked candidate as a plain string.
    """
    prompt = "paraphrase: " + sentence + " </s>"
    encoded = tokenizer.encode_plus(
        prompt, max_length=60, padding=True, return_tensors="pt", truncation=True
    )
    # Move tensors onto the same device as the model.
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    model.eval()
    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=60,
        early_stopping=True,
        num_beams=5,
        num_beam_groups=5,
        num_return_sequences=5,
        diversity_penalty=0.70,
    )
    # Keep only the highest-scoring of the five returned sequences.
    return tokenizer.decode(
        beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
60
+
61
def return_output(file, models):
    """Gradio callback: paraphrase an uploaded text file sentence by sentence.

    Parameters
    ----------
    file : uploaded-file object whose ``.name`` is a readable path on disk.
    models : str
        Which paraphraser to use, either 'T5' or 'Pegasus'.

    Returns the paraphrased article with model artefacts stripped.

    Raises
    ------
    ValueError
        If *models* is not a recognised choice. (Previously an unrecognised
        choice — e.g. the Dropdown's default of None — left ``model`` and
        ``tokenizer`` unbound and crashed with UnboundLocalError.)
    """
    docs = file.name
    sentence = clean_text(docs)

    if models == 'T5':
        model = model_1
        tokenizer = tokenizer_1
    elif models == 'Pegasus':
        model = model_2
        tokenizer = tokenizer_2
    else:
        raise ValueError(f"Unknown model choice: {models!r}; expected 'T5' or 'Pegasus'")

    output = " ".join(my_paraphrase(sent, model, tokenizer) for sent in sent_tokenize(sentence))
    # Strip the T5 model's answer prefix and convert Pegasus newline tokens.
    new_output = output.replace('paraphrasedoutput:', "")
    new_output = new_output.replace('.<n>', '.\n')
    return new_output
78
+
79
# Wire the callback into a simple Gradio UI: a required file upload plus a
# model picker, with a single text box for the paraphrased result.
demo = gr.Interface(
    return_output,
    inputs=[
        gr.inputs.File(label="File", optional=False),
        gr.inputs.Dropdown(['Pegasus', 'T5'], type="value", default=None, label="Models", optional=False),
    ],
    outputs=[gr.outputs.Textbox(label="Summary")],
)

if __name__ == "__main__":
    demo.launch(debug=True)