File size: 2,744 Bytes
a2e9054
 
 
 
 
 
 
 
 
 
 
 
 
 
8032e6a
a2e9054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94882e0
a2e9054
94882e0
a2e9054
 
 
 
 
 
 
 
 
 
 
94882e0
a2e9054
 
 
 
 
d22e3c2
a2e9054
 
 
 
 
 
 
 
 
 
 
a832dbc
a2e9054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
036b597
a2e9054
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import numpy as np
import re
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
# Download the Punkt sentence-tokenizer model required by sent_tokenize().
# Runs at import time so the app is ready before the first request.
nltk.download('punkt')

def read_in_text(url):
  """Read an entire local text file and return its contents.

  Args:
    url: path to a local text file (despite the name, this is a file
      path, not a web URL).

  Returns:
    The file's full contents as a single string.
  """
  # Explicit UTF-8 avoids the platform-dependent default encoding
  # (e.g. cp1252 on Windows) mangling non-ASCII article text.
  with open(url, 'r', encoding='utf-8') as file:
    return file.read()
    
def clean_text(url):
  """Lower-case article text and strip author/date bylines.

  Args:
    url: the raw article text (despite the name, this is the text
      itself, not a web URL).

  Returns:
    The lower-cased text with byline patterns removed.
  """
  text = url.lower()

  # Remove bylines such as "by john doe - 01/02/21 3:45 pm et" or
  # "by john doe jan 02, 2021".  The pattern is a raw string: the
  # original used a plain string, so \s, \w, \d were invalid escape
  # sequences (DeprecationWarning today, SyntaxError in a future Python).
  text = re.sub(
      r'(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})'
      r'|(by[\s\w|,]+\d\d,\s\d{4})',
      "", text)
  return text
  
  
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

import torch

# Prefer GPU when available; CPU generation with these large seq2seq
# models is very slow.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)

# T5-based diverse-beam paraphraser.
tokenizer_1 = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
# BUG FIX: the T5 model was never moved to `device`, while my_paraphrase()
# moves its input tensors there — on a CUDA machine that mismatch raises a
# device-placement error. Move it exactly like model_2 below.
model_1 = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality").to(device)

# Pegasus paraphraser.
tokenizer_2 = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_2 = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').to(device)

# Diverse Beam search
def my_paraphrase(sentence, model, tokenizer):
  """Paraphrase one sentence with diverse beam search.

  Prefixes the T5-style "paraphrase:" task marker, generates five
  diverse beams, and returns the decoded top-ranked candidate.
  """
  prompt = "paraphrase: " + sentence + " </s>"
  encoded = tokenizer.encode_plus(prompt, padding=True, return_tensors="pt", truncation=True)
  ids = encoded["input_ids"].to(device)
  mask = encoded["attention_mask"].to(device)

  # Inference mode: disable dropout etc. before generating.
  model.eval()
  candidates = model.generate(
      input_ids=ids,
      attention_mask=mask,
      max_length=512,
      early_stopping=True,
      num_beams=5,
      num_beam_groups=5,
      num_return_sequences=5,
      diversity_penalty=0.70,
  )
  # Only the first (highest-scoring) of the five returned sequences is used.
  return tokenizer.decode(candidates[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
  
def return_output(file, models):
  """Gradio handler: clean the input text and paraphrase it sentence by sentence.

  Args:
    file: raw input text from the Textbox (despite the name, a string).
    models: model choice from the Dropdown — 'T5' or 'Pegasus'.

  Returns:
    The paraphrased text, one paraphrase per input sentence.

  Raises:
    ValueError: if `models` is not a supported choice (e.g. the dropdown
      was left unselected, which passes None).
  """
  sentence = clean_text(file)

  if models == 'T5':
    model, tokenizer = model_1, tokenizer_1
  elif models == 'Pegasus':
    model, tokenizer = model_2, tokenizer_2
  else:
    # BUG FIX: previously an unrecognized choice fell through both branches
    # and crashed with UnboundLocalError on `model`; fail clearly instead.
    raise ValueError(f"Unknown model choice: {models!r}; expected 'T5' or 'Pegasus'.")

  output = " ".join(my_paraphrase(sent, model, tokenizer) for sent in sent_tokenize(sentence))
  # Strip the task prefix the T5 model echoes in its generations, and
  # restore the newlines Pegasus emits as the literal token '<n>'.
  new_output = output.replace('paraphrasedoutput:', "")
  new_output = new_output.replace('.<n>', '.\n')
  return new_output
  
# Wire the paraphraser into a simple Gradio UI: free-text input plus a
# model selector, single text output.
# NOTE(review): gr.inputs / gr.outputs is the legacy Gradio 2.x API that was
# removed in Gradio 3+ (modern code uses gr.Textbox / gr.Dropdown directly)
# — confirm the pinned gradio version before upgrading.
# NOTE(review): default=None with optional=False means an unselected
# dropdown passes None into return_output, which has no branch for it —
# verify intended behavior.
demo = gr.Interface(return_output, inputs=[gr.inputs.Textbox(label="Text", optional=False),
                                           gr.inputs.Dropdown(['Pegasus', 'T5'], type="value", default=None, label="Models", optional=False),],
                                                 outputs=[gr.outputs.Textbox(label="Summary")])

if __name__ == "__main__":
    demo.launch(debug=True)