|
import datetime |
|
import glob |
|
import json |
|
from collections import deque |
|
import tqdm |
|
|
|
|
|
def _serialize_json(data): |
|
|
|
return json.dumps(data, sort_keys=True, separators=(",", ":")).encode("utf-8") |
|
|
|
|
|
types = { |
|
"share", |
|
"chat", |
|
"flag", |
|
"bothbad_vote", |
|
"downvote", |
|
"leftvote", |
|
"rightvote", |
|
"upvote", |
|
"tievote", |
|
} |
|
|
|
chat_dict = {} |
|
cache_queue = deque() |
|
|
|
|
|
def process_record(r): |
|
ip = r.pop("ip", None) |
|
tstamp = r.pop("tstamp") |
|
mtype = r.pop("type") |
|
start = r.pop("start", None) |
|
finish = r.pop("finish", None) |
|
|
|
|
|
while len(cache_queue) > 100000: |
|
outdated = cache_queue.popleft() |
|
poped_item = chat_dict.pop(outdated["key"], None) |
|
if poped_item is None: |
|
|
|
print("Error: Key to GC does not exist.") |
|
|
|
assert mtype in types |
|
if mtype == "chat": |
|
key = _serialize_json(r["state"]) |
|
|
|
chat_dict[key] = { |
|
"timestamp": tstamp, |
|
"start": start, |
|
"finish": finish, |
|
"conv_id": r["state"]["conv_id"], |
|
} |
|
cache_queue.append({"key": key, "timestamp": tstamp}) |
|
elif mtype in ("leftvote", "rightvote", "bothbad_vote", "tievote"): |
|
left_key = _serialize_json(r["states"][0]) |
|
right_key = _serialize_json(r["states"][1]) |
|
if left_key not in chat_dict: |
|
|
|
print( |
|
f'WARNING: Cannot find vote context for conversation {r["states"][0]["conv_id"]}' |
|
) |
|
return |
|
if right_key not in chat_dict: |
|
print( |
|
f'WARNING: Cannot find vote context for conversation {r["states"][1]["conv_id"]}' |
|
) |
|
return |
|
vote_time_data = { |
|
"timestamp": tstamp, |
|
"type": mtype, |
|
"left": chat_dict[left_key], |
|
"right": chat_dict[right_key], |
|
"ip": ip, |
|
} |
|
return vote_time_data |
|
|
|
return None |
|
|
|
|
|
def process_file(infile: str, outfile: str): |
|
with open(infile) as f: |
|
records = [] |
|
for l in f.readlines(): |
|
l = l.strip() |
|
if l: |
|
try: |
|
r = json.loads(l) |
|
if r.get("tstamp") is not None: |
|
records.append(r) |
|
except Exception: |
|
pass |
|
|
|
records.sort(key=lambda x: x["tstamp"]) |
|
|
|
with open(outfile, "a") as outfile: |
|
for r in records: |
|
try: |
|
output = process_record(r) |
|
if output is not None: |
|
outfile.write(json.dumps(output) + "\n") |
|
except Exception as e: |
|
import traceback |
|
|
|
print("Error:", e) |
|
traceback.print_exc() |
|
|
|
|
|
today = datetime.datetime.today().isoformat().split("T", 1)[0] |
|
|
|
filelist = sorted(glob.glob("/mnt/disks/data/fastchat_logs/server*/202*-*-*-conv.json")) |
|
filelist = [ |
|
f for f in filelist if today not in f |
|
] |
|
|
|
|
|
filelist = [f for f in filelist if "2024-03-" in f] |
|
|
|
for f in tqdm.tqdm(filelist): |
|
process_file(f, "output.jsonl") |
|
|