import argparse
import csv
import json
import os
import time
from pathlib import Path

import gradio as gr
import openai
import pandas as pd
from vllm import LLM, SamplingParams

# Set up the OpenAI API client.
# SECURITY: the API key must never be hard-coded in source control — a key
# committed here is effectively public and must be revoked. Read it from the
# environment instead.
api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = api_key

# Completion model used by get_topic_result.
model_engine = "text-davinci-003"
def parse_args(argv=None):
    """Parse command-line arguments for the server.

    Args:
        argv: optional list of argument strings. Defaults to ``None``,
            in which case argparse reads ``sys.argv[1:]`` — so existing
            callers of ``parse_args()`` are unaffected.

    Returns:
        argparse.Namespace with:
            model (str): path to the model checkpoint.
            n_gpu (int): number of GPUs to use (default 1).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model path")
    parser.add_argument("--n_gpu", type=int, default=1, help="number of GPUs")
    return parser.parse_args(argv)
def echo(message, history, system_prompt, temperature, max_tokens):
    """Debug handler: stream back a summary of the received parameters.

    Yields progressively longer prefixes of the summary string (one extra
    character every 0.05 s) to simulate token-by-token streaming, stopping
    after at most ``max_tokens`` characters.
    """
    reply = f"System prompt: {system_prompt}\n Message: {message}. \n Temperature: {temperature}. \n Max Tokens: {max_tokens}."
    limit = min(len(reply), int(max_tokens))
    for end in range(1, limit + 1):
        time.sleep(0.05)
        yield reply[:end]
def align_data(data):
    """Given dict with lists, creates aligned strings
    Adapted from Assignment 3 of CS224N
    Args:
        data: (dict) data["x"] = ["I", "love", "you"]
              (dict) data["y"] = ["O", "O", "O"]
    Returns:
        data_aligned: (dict) data_align["x"] = "I love you"
                           data_align["y"] = "O O O "
    """
    first_key = next(iter(data))
    n_columns = len(data[first_key])
    # Column width = longest token at that position across all sequences.
    widths = [
        max(len(seq[col]) for seq in data.values())
        for col in range(n_columns)
    ]
    data_aligned = {}
    for key, tokens in data.items():
        # Pad each token to its column width plus one separating space.
        padded = [tok.ljust(width + 1) for tok, width in zip(tokens, widths)]
        data_aligned[key] = "".join(padded)
    return data_aligned
def get_llm_result(input_data, input_domain):
    """Generate text with the local vLLM chat model.

    Args:
        input_data: caption/title string submitted by the caller.
            NOTE(review): currently unused — ``predict`` is called below with
            an empty prompt, so the model receives no user input. Confirm
            whether ``input_data`` should be passed as the message.
        input_domain: domain/topic string.
            NOTE(review): unused in the active code path; only referenced by
            the commented-out OpenAI prompt below.

    Returns:
        (response, response): the generated text, duplicated so callers that
        expect an (article, title) pair (see get_model_api) still unpack two
        values.

    Raises:
        Exception: re-raised after printing if generation fails.
    """
    # data is file path of topic result
    ori_caption = input_data
    # replace the static path as your azcopy target folder like: "C:\Users\zhengkai\PycharmProjects\pythonProject\sync_data"
    # topic_file_path = "C:\\Users\zhengkai\PycharmProjects\pythonProject\sync_data\PreprocessData\\" + str(ori_caption) + "\step10_cook_json_file"
    # prompt = (
    #     f"I want you to act as an Science Question Answering asker, ask in a Science Question style. I will speak to you \
    #     use a caption of an image you will mining the probable Science question and improved version of the problem in \
    #     Science Question style, in English. Keep the meaning same, but make them more science. I want you to only reply \
    #     the question and nothing else, do not write explanations. My first caption sentence is \"{ori_caption}\""
    #     # f"Tell me which scenarios in creator tool could improvement by creators of MSN through leverage ChatGPT."
    # )
    # (Chinese, translated): "Using this sentence as the title, write an article of at
    # least 1000 words in the style of a {input_domain} expert; add details and stories
    # to each paragraph to improve readability."
    # prompt = f"{ori_caption},以这个句子为标题写一篇不少于1000字的{input_domain}专家风格的文章。每个段落多加一些细节和故事,增加文章的可读性。"
    prompt = ""

    def predict(message, history, system_prompt, temperature, max_tokens):
        # NOTE(review): the vLLM engine is constructed on every call, which
        # reloads the full checkpoint per request — presumably this should be
        # hoisted to module/startup scope; confirm before changing.
        model_path = "/workspaceblobstore/caxu/trained_models/13Bv2_497kcontinueroleplay_dsys_2048_e4_2e_5/checkpoint-75"
        llm = LLM(model=model_path, tensor_parallel_size=1)
        # Vicuna-style conversation template: system preamble, then
        # alternating USER/ASSISTANT turns terminated by </s>.
        instruction = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. "
        for human, assistant in history:
            instruction += 'USER: '+ human + ' ASSISTANT: '+ assistant + '</s>'
        instruction += 'USER: '+ message + ' ASSISTANT:'
        problem = [instruction]
        # Stop generation when the model starts emitting a new turn marker.
        stop_tokens = ["Question:", "Question", "USER:", "USER", "ASSISTANT:", "ASSISTANT", "Instruction:", "Instruction", "Response:", "Response"]
        sampling_params = SamplingParams(temperature=temperature, top_p=1, max_tokens=max_tokens, stop=stop_tokens)
        completions = llm.generate(problem, sampling_params)
        # Only one prompt is submitted, so this returns on the first output.
        for output in completions:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            return generated_text
        # for idx in range(len(generated_text)):
        #     yield generated_text[:idx+1]
    try:
        # completion = openai.Completion.create(
        #     engine=model_engine,
        #     prompt=prompt,
        #     max_tokens=3000,
        #     n=1,
        #     stop=None,
        #     temperature=0.5,
        # )
        #
        # response = completion.choices[0].text
        # shorten_response = response.replace("\n", "").strip()
        # len_response = len(shorten_response.split(" "))
        # if len_response >= 3500:
        #     shorten_response = "".join(shorten_response.split(" ")[:3500])
        # print("X"*10)
        # print(f"shorten_response is {shorten_response}")
        # list_shorten = shorten_response.split(" ")
        # print(list_shorten)
        # print(f"length is {len(list_shorten)}")
        # title_prompt = f"{shorten_response},给这个文章写一个头条号风格的标题。增加标题的吸引力,可读性。"
        # title_completion = openai.Completion.create(
        #     engine=model_engine,
        #     prompt=title_prompt,
        #     max_tokens=200,
        #     n=1,
        #     stop=None,
        #     temperature=0.5,
        # )
        # title_response = title_completion.choices[0].text
        # NOTE(review): empty history/prompt/system_prompt mean the model is
        # asked to continue an empty conversation — ``input_data`` is ignored.
        history = ""
        prompt = ""
        system_prompt = ""
        response = predict(prompt, history, system_prompt, 0.5, 3000)
        print(response)
        # if not os.path.isdir(topic_file_path):
        #     print("File folder not exist")
        # topic_result_file = ""
        # topic_file_name_pattern = "step10_json_filestep9_merge_rewrite_"
        # for filename in os.listdir(topic_file_path):
        #     if filename.startswith(topic_file_name_pattern):
        #         topic_result_file = os.path.join(topic_file_path, filename)
        #
        # data_aligned = dict()
        # output_dir_name = "."
        # output_dir = os.path.join(output_dir_name, "result_topic_file")
        # Path(output_dir).mkdir(parents=True, exist_ok=True)
        # write_file_name = "save_server_" + topic_file_path.split("\\")[-1]
        # write_output_file_path = os.path.join(output_dir, write_file_name)
        #
        # with open(topic_result_file, encoding="utf8") as f:
        #     json_data = json.load(f)
        # return json_data
        return response, response
    except Exception as ex:
        # NOTE(review): message is misleading — no file is involved on this
        # path; the exception more likely comes from model loading/generation.
        print("File not exist")
        raise ex
def get_topic_result(input_data, input_domain):
    """Generate an article and a headline for it via the OpenAI Completion API.

    Args:
        input_data: caption/title sentence the article is written from.
        input_domain: domain whose expert writing style the article imitates.

    Returns:
        (response, title_response): the generated article text and a
        headline generated from (a possibly truncated copy of) it.

    Raises:
        Exception: re-raised after logging if either API call fails.
    """
    ori_caption = input_data
    # Prompt (Chinese): "Using this sentence as the title, write an article of
    # at least 1000 words in the style of a {input_domain} expert; add details
    # and stories to each paragraph to improve readability."
    prompt = f"{ori_caption},以这个句子为标题写一篇不少于1000字的{input_domain}专家风格的文章。每个段落多加一些细节和故事,增加文章的可读性。"
    try:
        completion = openai.Completion.create(
            engine=model_engine,
            prompt=prompt,
            max_tokens=3000,
            n=1,
            stop=None,
            temperature=0.5,
        )
        response = completion.choices[0].text
        shorten_response = response.replace("\n", "").strip()
        words = shorten_response.split(" ")
        # Cap the follow-up prompt at 3500 whitespace-separated tokens so the
        # headline request stays within the model's context window.
        if len(words) >= 3500:
            # BUGFIX: this was "".join(...), which concatenated the words with
            # no separators and destroyed the text fed to the title prompt.
            shorten_response = " ".join(words[:3500])
        # Prompt (Chinese): "Write a Toutiao-style headline for this article;
        # make the headline more attractive and readable."
        title_prompt = f"{shorten_response},给这个文章写一个头条号风格的标题。增加标题的吸引力,可读性。"
        title_completion = openai.Completion.create(
            engine=model_engine,
            prompt=title_prompt,
            max_tokens=200,
            n=1,
            stop=None,
            temperature=0.5,
        )
        title_response = title_completion.choices[0].text
        return response, title_response
    except Exception as ex:
        # Report the actual failure (the old "File not exist" message was
        # misleading — no file is read on this path) and preserve traceback.
        print(f"get_topic_result failed: {ex}")
        raise
def get_model_api():
    """Returns the inference callable used by the web API."""
    def model_api(input_title, input_domain):
        """
        Args:
            input_title: raw title string submitted to the API
            input_domain: domain/topic string used to steer generation
        Returns:
            output_data: dict with the original title, the generated
            article ("output"), and its generated headline ("title_output")
        """
        # Alternative OpenAI-based backend:
        # preds, title_preds = get_topic_result(input_title, input_domain)
        preds, title_preds = get_llm_result(input_title, input_domain)
        output_data = {"input_title": input_title, "output": preds, "title_output": title_preds}
        return output_data
    return model_api
# config = Config() | |
# model = NERModel(config) | |