import gradio as gr
import os
import string
import datetime

from huggingface_hub import login
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import nltk
nltk.download('stopwords')  # stopword list used by rake_nltk
nltk.download('punkt')      # sentence tokenizer used by rake_nltk
from rake_nltk import Rake

login(os.environ["HF_TOKEN"])
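# HF_TOKEN must be provided in the environment (e.g. as a Space secret);
# the os.environ lookup above raises KeyError if it is missing.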



# dt stores the current date and time
dt = datetime.datetime.now()
print(dt)
print("loading models")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
original_model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')
untethered_model = GPT2LMHeadModel.from_pretrained('zmbfeng/untethered_20240225_epochs_500')
question_generation_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
# This checkpoint is T5-based, so load it with the seq2seq auto class
# (AutoModelWithLMHead is deprecated in recent transformers releases).
question_generation_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

# For local development, every from_pretrained call above can take a cache_dir,
# e.g. cache_dir="G:\\My Drive\\Avatar\\language_models_windows" or
# cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows".
default_temperature = 0.01
default_seed = 43
def create_response_question_generation(input_str, max_length=64):
    # The model expects "answer: <answer>  context: <context>"; the input serves as both here.
    input_text = "answer: %s  context: %s </s>" % (input_str, input_str)
    print(f"create question input_text={input_text}")
    features = question_generation_tokenizer([input_text], return_tensors='pt')

    output = question_generation_model.generate(input_ids=features['input_ids'],
                                                attention_mask=features['attention_mask'],
                                                max_length=int(max_length))

    return question_generation_tokenizer.decode(output[0], skip_special_tokens=True)
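
# Example (hypothetical output; the exact question depends on the checkpoint):
#   create_response_question_generation("Paris is the capital of France")
#   might return something like "question: What is the capital of France?"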

def create_response_paraphrase(input_str, max_length, num_return_sequences):
    text = "paraphrase: " + input_str + " </s>"

    encoding = paraphrase_tokenizer(text, padding="max_length", return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]
    num_return_sequences = int(num_return_sequences)  # gr.Number delivers floats
    max_length = int(max_length)
    outputs = paraphrase_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=max_length,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        num_return_sequences=num_return_sequences,
        repetition_penalty=1.5
    )
    result_output_str = ""
    for output in outputs:
        line = paraphrase_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        result_output_str = result_output_str + line + "<br/>"
    return result_output_str
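
# Example (sampling is enabled, so outputs vary run to run):
#   create_response_paraphrase("The meeting was postponed.", 64, 2)
#   returns two paraphrases joined with "<br/>" for HTML display.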
def contains_digit_or_punctuation(s):
    return any(char.isdigit() or char in string.punctuation for char in s)
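
# Example: contains_digit_or_punctuation("covid-19") -> True (digit and hyphen);
#          contains_digit_or_punctuation("cosmic paradox") -> False
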
rake = Rake()
def create_response_keywords_extraction(input_str):
    rake.extract_keywords_from_text(input_str)
    keywords_with_scores = rake.get_ranked_phrases_with_scores()
    filtered_keywords = []
    seen_keywords = set()
    for score, keyword in keywords_with_scores:
        # Keep phrases scoring above 1 that contain no digits or punctuation, deduplicated
        if score > 1 and not contains_digit_or_punctuation(keyword) and keyword not in seen_keywords:
            filtered_keywords.append((score, keyword))
            seen_keywords.add(keyword)
    output_string = ""
    for score, keyword in filtered_keywords:
        output_string = output_string + f"Score: {score}, Keyword: {keyword} <br/>"

    return output_string
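
# Example: on the default "cosmic paradox" passage, RAKE tends to surface
# multi-word phrases such as "great cosmic paradox" with scores above 1,
# while single words typically score 1.0 and are filtered out.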

def create_response_intention_classification(input_str):
    labels = ["dialogue", "long content generation"]

    # Zero-shot classification; labels come back sorted by descending score
    output_string = ""
    result = classifier(input_str, labels)
    for label, score in zip(result["labels"], result["scores"]):
        output_string = output_string + f"Label: {label}, Score: {score:.4f} <br/>"

    return output_string
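
# Example: "How are you?" should lean toward "dialogue", while the default
# quantum-computing question should lean toward "long content generation"
# (the gap between the two scores is modest with this model).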

interface_question_generation = gr.Interface(fn=create_response_question_generation,
    title="Question Generation",
    description="Enter a statement such as \"Paris is the capital of France\"",
    inputs=[
        gr.Textbox(label="input text here", lines=3, value="Paris is the capital of France"),
        gr.Number(label="max length", value=64),
    ],
    outputs="html"
    )



interface_paraphrase = gr.Interface(fn=create_response_paraphrase,
    title="Paraphrase",
    description="Paraphrase sentences",
    inputs=[
        gr.Textbox(label="input text here", lines=3, value="It is truly a great cosmic paradox that one of the best teachers in all of life turns out to be death. No person or situation could ever teach you as much as death has to teach you."),
        gr.Number(label="max length", value=512),
        gr.Number(label="num of responses", value=2)
    ],
    outputs="html"
    )
interface_extract_keywords = gr.Interface(fn=create_response_keywords_extraction,
    title="Extract Keywords",
    description="Extract keywords from text",
    inputs=[
        gr.Textbox(label="input text here", lines=3, value="It is truly a great cosmic paradox that one of the best teachers in all of life turns out to be death. No person or situation could ever teach you as much as death has to teach you."),
    ],
    outputs="html"
    )
interface_intention_classification = gr.Interface(fn=create_response_intention_classification,
    title="Intention Classification",
    description="Determine whether a question calls for short dialogue or long content generation, e.g. \"How are you?\" versus \"What are the implications of quantum computing on global security?\" (the difference is not very dramatic as of now)",
    inputs=[
        gr.Textbox(label="input text here", lines=3, value="What are the implications of quantum computing on global security?"),
    ],
    outputs="html"
    )


demo = gr.TabbedInterface(
    [interface_question_generation, interface_paraphrase, interface_extract_keywords, interface_intention_classification],
    ["Question Generation", "Paraphrase", "Keywords Extraction", "Intention Classification"])

demo.launch()
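
# To expose a temporary public URL when running locally, demo.launch(share=True)
# can be used; on Hugging Face Spaces the plain launch() above is sufficient.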