File size: 2,911 Bytes
6dc0c9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
"""
Usage:
python3 summarize_cluster.py --input-file results_c20_kmeans_cluster.pkl --model gpt-4 --num-prompts 100
python3 summarize_cluster.py --input-file results_c20_kmeans_cluster.pkl --model azure-gpt-4-32k --num-prompts 200
"""
import argparse
import pickle
import pandas as pd
from fastchat.llm_judge.common import (
chat_completion_openai,
chat_completion_openai_azure,
chat_completion_anthropic,
)
from fastchat.conversation import get_conv_template
def truncate_string(s, l):
    """Shorten *s* to at most *l* characters by dropping its middle.

    Strings no longer than *l* are returned unchanged. Longer strings keep
    their first ``l // 2`` and last ``l // 2`` characters, so for an odd *l*
    the result has ``l - 1`` characters.

    Args:
        s: The string to shorten.
        l: Maximum length of the result.

    Returns:
        The possibly shortened string.
    """
    if len(s) <= l:
        return s
    half = l // 2
    if half <= 0:
        # With half == 0, ``s[-0:]`` is the *whole* string, so the head+tail
        # expression below would return ``s`` untruncated. Keep at most ``l``
        # leading characters instead (none for l <= 0).
        return s[: max(l, 0)]
    return s[:half] + s[-half:]
if __name__ == "__main__":
    # Script entry point: load pickled per-cluster statistics, ask a chat
    # model for a short topic label for each cluster, print the results and
    # write the topic/percentage pairs to a JSON-lines file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo")
    parser.add_argument("--num-prompts", type=int, default=100)
    args = parser.parse_args()

    model = args.model

    # Pick the conversation template and completion backend once, up front.
    # The choice depends only on the model name, so an unsupported model is
    # rejected here instead of surfacing as a NameError inside the loop.
    # "azure-" is tested first because azure model names (e.g.
    # "azure-gpt-4-32k") also contain "gpt".
    if "azure-" in model:
        template_name = "chatgpt"
        completion_func = chat_completion_openai_azure
    elif "gpt" in model:
        template_name = "chatgpt"
        completion_func = chat_completion_openai
    elif "claude" in model:
        template_name = "claude"
        completion_func = chat_completion_anthropic
    else:
        parser.error(
            f"unsupported model {model!r}: expected a name containing "
            "'azure-', 'gpt' or 'claude'"
        )

    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only pass input files that come from a trusted source.
    with open(args.input_file, "rb") as fin:
        cluster_infos = pickle.load(fin)

    # Each entry unpacks as (num_samples, topk_prompts, random_prompts).
    num_total_prompts = sum(x[0] for x in cluster_infos)

    # Loop-invariant request settings.
    instruct = "Given a list of user messages, use less than 8 words to summarize a central topic for all messages in English. Your output should only include a single line. Try to be specific."
    # 80% of the prompt budget is taken from topk_prompts, the remainder
    # from random_prompts.
    split = int(args.num_prompts * 0.8)

    topics = []
    percentages = []
    for i, info in enumerate(cluster_infos):
        num_samples, topk_prompts, random_prompts = info
        # Guard against a ZeroDivisionError when every cluster is empty.
        percentage = num_samples / num_total_prompts if num_total_prompts else 0.0
        print(
            f"cluster {i}, #prompts {num_samples}, percentage: {percentage * 100:.2f}%"
        )

        # One message per line, each cut to at most 200 characters.
        selected = topk_prompts[:split] + random_prompts[: args.num_prompts - split]
        prompt = "\n".join(truncate_string(x, l=200) for x in selected)
        prompt = "BEGIN OF THE MESSAGE LIST\n" + prompt + "\nEND OF THE MESSAGE LIST."

        conv = get_conv_template(template_name)
        conv.set_system_message(instruct)
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)

        # The value returned by the completion helper is used as the topic.
        topic = completion_func(model, conv, temperature=0, max_tokens=256)
        print(topic)

        topics.append(topic)
        percentages.append(round(percentage, 6))

    print()
    print(f"topics: {topics}")
    print(f"percentages: {percentages}")

    # Save the summary, one JSON record per cluster.
    df = pd.DataFrame()
    df["topic"] = topics
    df["percentage"] = percentages
    df.to_json(f"cluster_summary_{len(df)}.jsonl", lines=True, orient="records")
|