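"""Extract per-vote timing data from FastChat conversation logs.

Each "chat" record is cached under a canonical serialization of its
conversation state; a later vote record (leftvote, rightvote, tievote,
or bothbad_vote) looks up both of its conversation states in that cache
and emits a combined timing record to an output JSONL file.
"""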
import datetime
import glob
import json
import traceback
from collections import deque

import tqdm


def _serialize_json(data):
    # Canonical serialization (sorted keys, no whitespace) so that identical
    # states always map to the same dictionary key.
    return json.dumps(data, sort_keys=True, separators=(",", ":")).encode("utf-8")


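# All record types expected in the logs; only "chat" and the pairwise vote
# types are processed further, but anything else trips the assert below.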
types = {
    "share",
    "chat",
    "flag",
    "bothbad_vote",
    "downvote",
    "leftvote",
    "rightvote",
    "upvote",
    "tievote",
}

# Maps canonical chat state -> timing metadata of the matching "chat" record.
chat_dict = {}
# FIFO of chat_dict keys in insertion order, used to evict the oldest entries.
cache_queue = deque()


def process_record(r):
    ip = r.pop("ip", None)
    tstamp = r.pop("tstamp")
    mtype = r.pop("type")
    start = r.pop("start", None)
    finish = r.pop("finish", None)

    # Garbage-collect the oldest cached chats to bound memory usage.
    while len(cache_queue) > 100_000:
        outdated = cache_queue.popleft()
        popped_item = chat_dict.pop(outdated["key"], None)
        if popped_item is None:
            # TODO: this sometimes happens and needs investigation; in theory
            # chat_dict should stay in sync with the queue, unless duplicate
            # keys were inserted.
            print("Error: Key to GC does not exist.")

    assert mtype in types
    if mtype == "chat":
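        # Cache this chat's timing under its serialized state so that a later
        # vote on the same state can be matched back to it.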
        key = _serialize_json(r["state"])
        # TODO: add the string length of the last reply for analyzing voting time per character.
        chat_dict[key] = {
            "timestamp": tstamp,
            "start": start,
            "finish": finish,
            "conv_id": r["state"]["conv_id"],
        }
        cache_queue.append({"key": key, "timestamp": tstamp})
    elif mtype in ("leftvote", "rightvote", "bothbad_vote", "tievote"):
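        # A vote references two conversation states (left and right); both
        # must have been cached from earlier "chat" records for their timing
        # data to be available.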
        left_key = _serialize_json(r["states"][0])
        right_key = _serialize_json(r["states"][1])
        if left_key not in chat_dict:
            # TODO: this sometimes happens; we have the vote but cannot find
            # the preceding chat record. Needs investigation.
            print(
                f'WARNING: Cannot find vote context for conversation {r["states"][0]["conv_id"]}'
            )
            return
        if right_key not in chat_dict:
            print(
                f'WARNING: Cannot find vote context for conversation {r["states"][1]["conv_id"]}'
            )
            return
        vote_time_data = {
            "timestamp": tstamp,
            "type": mtype,
            "left": chat_dict[left_key],
            "right": chat_dict[right_key],
            "ip": ip,
        }
        return vote_time_data

    return None


def process_file(infile: str, outfile: str):
    with open(infile) as f:
        records = []
        for line in f:
            line = line.strip()
            if line:
                try:
                    r = json.loads(line)
                    if r.get("tstamp") is not None:
                        records.append(r)
                except Exception:
                    # Skip lines that are not valid JSON.
                    pass
        # Sort the records in case they were logged out of order.
        records.sort(key=lambda x: x["tstamp"])

        with open(outfile, "a") as out_f:
            for r in records:
                try:
                    output = process_record(r)
                    if output is not None:
                        out_f.write(json.dumps(output) + "\n")
                except Exception as e:
                    print("Error:", e)
                    traceback.print_exc()


today = datetime.datetime.today().isoformat().split("T", 1)[0]
# Sort so that each server's files are processed in date order.
filelist = sorted(glob.glob("/mnt/disks/data/fastchat_logs/server*/202*-*-*-conv.json"))
# Skip today's logs because the file may still be partially written.
filelist = [f for f in filelist if today not in f]

# TODO: change this to select a different range of data.
filelist = [f for f in filelist if "2024-03-" in f]

for f in tqdm.tqdm(filelist):
    process_file(f, "output.jsonl")