import gradio as gr
import random
import os
import copy
import torch
from huggingface_hub import login
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel, set_seed
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import datetime
import nltk
# RAKE keyword extraction needs the NLTK stopwords and punkt data
# ('punctuation' is not a valid NLTK package, so it is not downloaded)
nltk.download('stopwords')
nltk.download('punkt')
from rake_nltk import Rake
login(os.environ["HF_TOKEN"])
#https://huggingface.co/facebook/opt-1.3b
#generator = pipeline('text-generation', model="microsoft/DialoGPT-medium")
# dt stores the current date and time
dt = datetime.datetime.now()
print(dt)
print("loading models") | |
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium') | |
original_model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium') | |
untethered_model = GPT2LMHeadModel.from_pretrained('zmbfeng/untethered_20240225_epochs_500') | |
question_generation_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap") | |
question_generation_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap") | |
paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws") | |
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws") | |
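# Optional sketch (not part of the original app): everything above runs on CPU.
# If a GPU is available, inference could be sped up by moving the models to it
# and moving input tensors to the same device before each generate() call:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# for m in (original_model, untethered_model, question_generation_model, paraphrase_model):
#     m.to(device)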
# For local development, the same models can be cached on disk by passing
# cache_dir to each from_pretrained call, e.g.:
# tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium', cache_dir="G:\\My Drive\\Avatar\\language_models_windows")
default_temperature = 0.01
default_seed = 43
def create_response_question_generation(input_str, max_length=64):
    # The model expects "answer: <answer> context: <context>"; here the input
    # statement serves as both the answer and the context
    input_text = "answer: %s context: %s </s>" % (input_str, input_str)
    print(f"create question input_text={input_text}")
    features = question_generation_tokenizer([input_text], return_tensors='pt')
    output = question_generation_model.generate(input_ids=features['input_ids'],
                                                attention_mask=features['attention_mask'],
                                                max_length=max_length)
    # skip_special_tokens=True drops the <pad>/</s> markers from the decoded question
    return question_generation_tokenizer.decode(output[0], skip_special_tokens=True)
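# Usage sketch (illustrative; the exact wording of the output will vary):
# create_response_question_generation("Paris is the capital of France")
# -> something like "question: What is the capital of France?"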
def create_response_paraphrase(input_str, max_length, num_return_sequences):
    text = "paraphrase: " + input_str + " </s>"
    # padding="max_length" replaces the deprecated pad_to_max_length=True
    encoding = paraphrase_tokenizer.encode_plus(text, padding="max_length", truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]
    num_return_sequences = int(num_return_sequences)  # Gradio Number inputs arrive as floats
    max_length = int(max_length)
    outputs = paraphrase_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=max_length,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=num_return_sequences,
        repetition_penalty=1.5
    )
    # Join all sampled paraphrases into one HTML string
    result_output_str = ""
    for output in outputs:
        line = paraphrase_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        result_output_str = result_output_str + line + "<br/>"
    return result_output_str
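# Usage sketch (illustrative): returns num_return_sequences sampled rewrites
# separated by <br/> for the HTML output component, e.g.:
# create_response_paraphrase("Death is one of life's best teachers.", 64, 2)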
def create_response(input_str,
                    temperature,
                    seed,
                    model_name):
    print("input_str=" + input_str)
    print("model_name=" + str(model_name))
    print("temperature=" + str(temperature))
    seed = int(seed)
    print("seed=" + str(seed))
    input_ids = tokenizer.encode(input_str + tokenizer.eos_token, return_tensors="pt")
    # A seed of -1 means "do not seed"; any other value makes sampling reproducible
    if seed != -1:
        set_seed(seed)
    # do_sample=True is required for temperature to take effect; without it,
    # generate() falls back to greedy decoding and ignores the setting
    if model_name == "original_model":
        output = original_model.generate(input_ids, max_length=100, do_sample=True, temperature=temperature, pad_token_id=tokenizer.eos_token_id)
    else:
        output = untethered_model.generate(input_ids, max_length=100, do_sample=True, temperature=temperature, pad_token_id=tokenizer.eos_token_id)
    outputs = model_name + " generated <br>"
    sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    outputs = outputs + sentence + "<br/>"
    return outputs
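# Usage sketch (illustrative), tying in the otherwise-unused defaults above:
# create_response("What is death?", default_temperature, default_seed, "untethered_model")
# With default_temperature=0.01 the output is nearly deterministic even with sampling on.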
# Shared sample prompts (defined here but not wired into the interfaces below)
common_examples_string = "<br/>Sample Inputs:<br/>What is death?<br/>One of the best teachers in all of life turns out to be what?<br/>what is your most meaningful relationship?<br/>What actually gives life meaning?<br/>"
interface_original = gr.Interface(fn=create_response_question_generation,
                                  title="Question Generation",
                                  description="Enter a statement like Paris is the capital of France",
                                  inputs=[
                                      gr.Textbox(label="input text here", lines=3),
                                      gr.Number(
                                          label="max length",
                                          value=64),
                                  ],
                                  outputs="html"
                                  )
interface_untethered_model = gr.Interface(fn=create_response_paraphrase,
                                          title="Paraphrase",
                                          description="Paraphrase sentences",
                                          inputs=[
                                              gr.Textbox(label="input text here", lines=3, value="It is truly a great cosmic paradox that one of the best teachers in all of life turns out to be death. No person or situation could ever teach you as much as death has to teach you. "),
                                              gr.Number(
                                                  label="max length",
                                                  value=512),
                                              gr.Number(
                                                  label="num of responses",
                                                  value=2)
                                          ],
                                          outputs="html"
                                          )
demo = gr.TabbedInterface([interface_original, interface_untethered_model], ["Question Generation", "Paraphrase"])
demo.launch()
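# Note (assumption, not in the original code): on Hugging Face Spaces, launch()
# picks up the host and port automatically; for local testing,
# demo.launch(share=True) would also create a temporary public link.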