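# Tag near-duplicate conversations in a conversation-log JSON file: each
# record's user turns are concatenated into a key, keys more frequent than
# the given percentile cutoff are marked high_freq, and only a fixed-size
# random sample of each such group keeps sampled=True.
#
# Example invocation (file names are illustrative, not from the source):
#   python dedup.py --input_file conversations.json --output_dir output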
import argparse
import json
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default="output")
    parser.add_argument("--model", type=str, default=None)
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--percentile", type=float, default=0.9999)
    args = parser.parse_args()
    output_dir = args.output_dir
    input_file = args.input_file

    with open(input_file) as f:
        data = json.load(f)
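    # data is expected to be a list of records, each carrying a
    # "conversation_a" list of {"role", "content"} turns (schema inferred
    # from the preprocessing loop below).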

    os.makedirs(output_dir, exist_ok=True)

    # Preprocessing: concatenate each conversation's user turns into one
    # string, truncated to 10,000 characters, to serve as the dedup key
    all_convs_new = []
    for row in data:
        conv = ""
        for turn in row["conversation_a"]:
            if turn["role"] == "user":
                conv += f"{turn['content']}\n"

        row["post_process_conv"] = conv[:10000]
        all_convs_new.append(row)

    df = pd.DataFrame(all_convs_new)
    print("Number of conversations: ", len(df))

    prompt_counts = df["post_process_conv"].value_counts()
    # Select the top 20 most frequent prompts
    top_prompts = prompt_counts.head(20)
    print(top_prompts)

    # Frequency count at the requested percentile; prompts above it are
    # treated as over-represented
    percentile_cutoff = prompt_counts.quantile(args.percentile)
    print(f"{args.percentile * 100:g}th percentile count: {percentile_cutoff}")

    # Prompts whose count exceeds the cutoff are flagged as high frequency
    high_frequency_prompts = prompt_counts[prompt_counts > percentile_cutoff].index
    print(
        f"Number of high frequency prompts: {len(high_frequency_prompts)}/{len(prompt_counts)}"
    )

    # Initialize every row's tag as low-frequency and sampled; rows in
    # high-frequency groups are retagged in the loop below
    dedup_tags = np.array(
        [{"high_freq": False, "sampled": True} for _ in range(len(df))]
    )
    high_freq_groups = df.groupby("post_process_conv")
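    # For each over-represented prompt, keep a reproducible random sample
    # (random_state=42) of size equal to the cutoff and mark the rest of
    # the group as unsampled.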
    for prompt in tqdm(high_frequency_prompts):
        df_high_freq = high_freq_groups.get_group(prompt)
        sampled_indices = df_high_freq.sample(
            n=int(percentile_cutoff), random_state=42
        ).index
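        # Each assignment broadcasts a single shared dict across the selected
        # rows; the tags are never mutated afterwards, so sharing is harmless.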
        dedup_tags[df_high_freq.index] = {"high_freq": True, "sampled": False}
        dedup_tags[sampled_indices] = {"high_freq": True, "sampled": True}

    df["dedup_tag"] = dedup_tags

    # Drop the intermediate dedup-key column (post_process_conv) before writing
    df = df.drop(columns=["post_process_conv"])

    df.to_json(
        os.path.join(output_dir, "dedup.json"),
        orient="records",
        indent=4,
        force_ascii=False,
    )