""" |
|
Usage: |
|
python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4 --num-prompts 100 |
|
python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model azure-gpt-4-32k --num-prompts 200 |
|
""" |
|
import argparse
import pickle

import pandas as pd

from fastchat.llm_judge.common import (
    chat_completion_openai,
    chat_completion_openai_azure,
    chat_completion_anthropic,
)
from fastchat.conversation import get_conv_template


def truncate_string(s, l):
    # Keep only the first and last l // 2 characters when s is longer than l.
    half = int(l // 2)
    return (s[:half] + s[-half:]) if len(s) > l else s


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo")
    parser.add_argument("--num-prompts", type=int, default=100)
    args = parser.parse_args()

    model = args.model

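    # The input pickle is expected to contain one tuple per cluster:
    # (num_samples, topk_prompts, random_prompts), as unpacked in the loop below.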
    with open(args.input_file, "rb") as fin:
        cluster_infos = pickle.load(fin)
    num_total_prompts = sum(x[0] for x in cluster_infos)

    topics = []
    percentages = []
    for i, info in enumerate(cluster_infos):
        num_samples, topk_prompts, random_prompts = info
        percentage = num_samples / num_total_prompts
        print(
            f"cluster {i}, #prompts {num_samples}, percentage: {percentage * 100:.2f}%"
        )
        instruct = "Given a list of user messages, use less than 8 words to summarize a central topic for all messages in English. Your output should only include a single line. Try to be specific."
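        # Build the judge prompt from roughly 80% top-k prompts and 20% random
        # prompts of this cluster, truncating each message to 200 characters.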
        split = int(args.num_prompts * 0.8)
        prompt = "\n".join(
            [truncate_string(x, l=200) for x in topk_prompts[:split]]
            + [
                truncate_string(x, l=200)
                for x in random_prompts[: args.num_prompts - split]
            ]
        )
        prompt = "BEGIN OF THE MESSAGE LIST\n" + prompt + "\nEND OF THE MESSAGE LIST."

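        # Pick the conversation template and completion backend that match the
        # judge model name.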
if "azure-" in model: |
|
template_name = "chatgpt" |
|
completion_func = chat_completion_openai_azure |
|
elif "gpt" in model: |
|
template_name = "chatgpt" |
|
completion_func = chat_completion_openai |
|
elif "claude" in model: |
|
template_name = "claude" |
|
completion_func = chat_completion_anthropic |
|
|
|
        conv = get_conv_template(template_name)
        conv.set_system_message(instruct)
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)

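        # Ask the judge model for a short topic label; temperature 0 keeps the
        # output deterministic.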
        topic = completion_func(model, conv, temperature=0, max_tokens=256)
        print(topic)

        topics.append(topic)
        percentages.append(round(percentage, 6))

    print()
    print(f"topics: {topics}")
    print(f"percentages: {percentages}")

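    # Save one JSONL record per cluster with its topic and its share of all prompts.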
    df = pd.DataFrame()
    df["topic"] = topics
    df["percentage"] = percentages

    df.to_json(f"cluster_summary_{len(df)}.jsonl", lines=True, orient="records")
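    # The summaries can be reloaded for inspection with, e.g.,
    #   pd.read_json(f"cluster_summary_{len(df)}.jsonl", lines=True)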