File size: 2,911 Bytes
6dc0c9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Usage:
python3 summarize_cluster.py --input-file results_c20_kmeans_cluster.pkl --model gpt-4 --num-prompts 100
python3 summarize_cluster.py --input-file results_c20_kmeans_cluster.pkl --model azure-gpt-4-32k --num-prompts 200
"""
import argparse
import pickle

import pandas as pd

from fastchat.llm_judge.common import (
    chat_completion_openai,
    chat_completion_openai_azure,
    chat_completion_anthropic,
)
from fastchat.conversation import get_conv_template


def truncate_string(s, l):
    """Shorten *s* to at most *l* characters by keeping its head and tail.

    Strings whose length does not exceed *l* are returned unchanged.
    Longer strings are reduced to their first ``l // 2`` and last
    ``l // 2`` characters, dropping the middle (so an odd *l* yields
    ``l - 1`` characters).

    Args:
        s: The string to shorten.
        l: Maximum allowed length.

    Returns:
        The original string, or its head and tail concatenated.
    """
    if len(s) <= l:
        return s

    half = int(l // 2)
    if half <= 0:
        # With half == 0, s[-0:] is the *whole* string, so the head/tail
        # formula would return s untruncated. Fall back to a plain prefix
        # (empty for non-positive limits).
        return s[: max(int(l), 0)]
    return s[:half] + s[-half:]


if __name__ == "__main__":
    # Script entry point: load pickled cluster information, ask a chat model
    # for a short topic label per cluster, and write the labels together with
    # each cluster's share of prompts to a JSON-lines file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo")
    parser.add_argument("--num-prompts", type=int, default=100)
    args = parser.parse_args()

    model = args.model

    # Pick the conversation template and completion backend once, before any
    # work is done. The substring checks are ordered so that "azure-gpt-*"
    # names reach the Azure client rather than the plain OpenAI one.
    if "azure-" in model:
        template_name = "chatgpt"
        completion_func = chat_completion_openai_azure
    elif "gpt" in model:
        template_name = "chatgpt"
        completion_func = chat_completion_openai
    elif "claude" in model:
        template_name = "claude"
        completion_func = chat_completion_anthropic
    else:
        # Previously an unrecognized name surfaced as a NameError inside the
        # loop; fail fast with a usage error instead.
        parser.error(
            f"unsupported model {model!r}: expected a name containing "
            "'azure-', 'gpt' or 'claude'"
        )

    # NOTE(security): unpickling can execute arbitrary code; only pass
    # input files that come from a trusted source.
    with open(args.input_file, "rb") as f:
        cluster_infos = pickle.load(f)
    # The first element of every cluster entry is its prompt count.
    num_total_prompts = sum(x[0] for x in cluster_infos)

    # Loop-invariant request settings.
    instruct = "Given a list of user messages, use less than 8 words to summarize a central topic for all messages in English. Your output should only include a single line. Try to be specific."
    # 80% of the prompt budget comes from `topk_prompts`, the rest from
    # `random_prompts`.
    split = int(args.num_prompts * 0.8)

    topics = []
    percentages = []
    for i, info in enumerate(cluster_infos):
        num_samples, topk_prompts, random_prompts = info
        percentage = num_samples / num_total_prompts
        print(
            f"cluster {i}, #prompts {num_samples}, percentage: {percentage * 100:.2f}%"
        )

        # Each message is capped at 200 characters to keep the request small.
        selected = topk_prompts[:split] + random_prompts[: args.num_prompts - split]
        prompt = "\n".join(truncate_string(x, l=200) for x in selected)
        prompt = "BEGIN OF THE MESSAGE LIST\n" + prompt + "\nEND OF THE MESSAGE LIST."

        conv = get_conv_template(template_name)
        conv.set_system_message(instruct)
        conv.append_message(conv.roles[0], prompt)
        # Empty second-role turn for the model to fill in.
        conv.append_message(conv.roles[1], None)

        topic = completion_func(model, conv, temperature=0, max_tokens=256)
        print(topic)

        topics.append(topic)
        percentages.append(round(percentage, 6))

    print()
    print(f"topics: {topics}")
    print(f"percentages: {percentages}")

    # Save the summary: one JSON record per cluster, named after the number
    # of clusters.
    df = pd.DataFrame()
    df["topic"] = topics
    df["percentage"] = percentages

    df.to_json(f"cluster_summary_{len(df)}.jsonl", lines=True, orient="records")