import argparse
import ast
import concurrent.futures
import json
import os
import random
import re
import threading
import time

import pandas as pd
import tqdm

# Serializes appends to the shared output file across worker threads.
LOCK = threading.RLock()
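
# Judge prompt: asks the model to report, as a Python-style list, which of the
# seven criteria (specificity, domain knowledge, complexity, problem-solving,
# creativity, technical accuracy, real-world application) an input prompt satisfies.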
SYSTEM_PROMPT = "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\nFor the input prompt, please analyze it based on the following 7 criteria.\n1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n4. Problem-Solving Skills: Does the prompt directly involve the AI in demonstrating active problem-solving skills, such as systematically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\nYou must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."
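
# Configuration for the judge endpoint; the "-" values are placeholders for a
# real OpenAI-compatible api_base and api_key.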
ENDPOINT_INFO = {
    "model_name": "META-LLAMA/LLAMA-3-70B-CHAT-HF",
    "name": "llama-3-70b-instruct",
    "endpoints": [{"api_base": "-", "api_key": "-"}],
    "parallel": 8,
    "temperature": 0.0,
    "max_token": 512,
}
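
# Maps each criterion number to the tag name used in the output "criteria_tag" field.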
TAGS = {
    1: "specificity",
    2: "domain_knowledge",
    3: "complexity",
    4: "problem_solving",
    5: "creativity",
    6: "technical_accuracy",
    7: "real_world",
}
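
# Retry policy for judge API calls and the sentinel returned when all retries fail.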
API_MAX_RETRY = 3
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"


def get_endpoint(endpoint_list):
    """Pick one endpoint at random from the configured list (None if unset)."""
    if endpoint_list is None:
        return None
    return random.choice(endpoint_list)
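

# Matches a Python-style list of single-digit criteria numbers, e.g. "[1, 3, 7]".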
pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")


def get_score(judgment):
    """Extract the list of criteria numbers from the judge's reply.

    Returns [] if no list is found or if multiple conflicting lists appear.
    """
    matches = pattern.findall(judgment)
    matches = [m for m in matches if m != ""]
    if len(set(matches)) == 0:
        return []
    elif len(set(matches)) == 1:
        try:
            return ast.literal_eval(matches[0])
        except (SyntaxError, ValueError):
            print(matches[0])
            return []
    else:
        return []


def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    """Query an OpenAI-compatible chat endpoint with basic retry handling."""
    import openai

    if api_dict:
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            print(messages)
            print(type(e), e)
            break
        except openai.APIConnectionError as e:
            print(messages)
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.InternalServerError as e:
            print(messages)
            print(type(e), e)
            time.sleep(1)
        except KeyError as e:
            print(type(e), e)
            break

    return output


def get_answer(
    question: pd.Series,
    max_tokens: int,
    temperature: float,
    answer_file: str,
    api_dict: dict,
):
    """Ask the judge which criteria the prompt satisfies and append the tagged row."""
    conv = []
    conv.append({"role": "system", "content": SYSTEM_PROMPT})
    conv.append({"role": "user", "content": question["prompt"]})
    output = chat_completion_openai(
        model=ENDPOINT_INFO["model_name"],
        messages=conv,
        temperature=temperature,
        max_tokens=max_tokens,
        api_dict=api_dict,
    )

    criteria = get_score(output)

    question["criteria_tag"] = {name: i in criteria for i, name in TAGS.items()}
    question = question.drop("prompt")

    with LOCK:
        with open(answer_file, "a") as fout:
            fout.write(json.dumps(question.to_dict()) + "\n")
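

# Example invocation (the script name here is illustrative):
#   python label_criteria.py --input-file data.json --output-file tags.jsonl --convert-to-json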
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--cache-file", type=str, default=None)
    parser.add_argument("--output-file", type=str, required=True)
    parser.add_argument("--convert-to-json", action="store_true")
    args = parser.parse_args()

    print("loading input data (might take a minute)")
    input_data = pd.read_json(args.input_file)
    print(f"{len(input_data)} input rows loaded")
    if args.cache_file:
        print("loading cache data")
        cache_data = pd.read_json(args.cache_file)
        print(f"{len(cache_data)} cache rows loaded")

        assert "criteria_tag" in cache_data.columns and len(
            cache_data["criteria_tag"].dropna()
        ) == len(cache_data)

        not_labeled = input_data[
            ~input_data["question_id"].isin(cache_data["question_id"])
        ].copy()
    else:
        not_labeled = input_data.copy()

    if os.path.isfile(args.output_file):
        print("loading existing output")
        output_data = pd.read_json(args.output_file, lines=True)
        print(f"{len(output_data)} existing output rows loaded")

        assert "criteria_tag" in output_data.columns and len(
            output_data["criteria_tag"].dropna()
        ) == len(output_data)

        not_labeled = not_labeled[
            ~not_labeled["question_id"].isin(output_data["question_id"])
        ]

    print(f"{len(not_labeled)} rows still need to be labeled")
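
    # Build the text sent to the judge by joining the even-indexed (typically user)
    # turns of conversation_a into a single prompt.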
    not_labeled["prompt"] = not_labeled.conversation_a.map(
        lambda convo: "\n".join([convo[i]["content"] for i in range(0, len(convo), 2)])
    )
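
    # Label the remaining rows concurrently; each worker appends its result to the
    # output file as soon as it finishes.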
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=ENDPOINT_INFO["parallel"]
    ) as executor:
        futures = []
        for index, row in tqdm.tqdm(not_labeled.iterrows()):
            future = executor.submit(
                get_answer,
                row,
                ENDPOINT_INFO["max_token"],
                ENDPOINT_INFO["temperature"],
                args.output_file,
                get_endpoint(ENDPOINT_INFO["endpoints"]),
            )
            futures.append(future)
        for future in tqdm.tqdm(
            concurrent.futures.as_completed(futures), total=len(futures)
        ):
            future.result()

    if args.convert_to_json:
        # Re-read the JSONL output and write it as a JSON array, dropping the
        # trailing "l" from the file name (e.g. ".jsonl" -> ".json").
        temp = pd.read_json(args.output_file, lines=True)
        temp.to_json(
            args.output_file[:-1], orient="records", indent=4, force_ascii=False
        )