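"""Attach the latest LLM completions to each record of an output JSONL file.

For every record in the JSONL file given on the command line, the most recent
completion is loaded from <output_dir>/llm_completions/<instance_id>/ and stored
under the 'raw_completions' key. If anything changed, a gzip-compressed copy is
written to output.with_completions.jsonl.gz in the same directory.
"""
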
import argparse
import gzip
import json
import os
from glob import glob

from tqdm import tqdm


def load_completions(output_dir: str, instance_id: str):
    """Return the latest saved completion for an instance, or None if there is none."""
    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
    files = sorted(glob(glob_path))

    try:
        # Take the last file in sorted order and treat it as the latest completion.
        file_path = files[-1]
    except IndexError:
        # No completion files were saved for this instance.
        return None

    with open(file_path, 'r') as f:
        result = json.load(f)

    # Append the model's final response to the recorded conversation history.
    messages = result['messages']
    messages.append(result['response']['choices'][0]['message'])
    tools = result['kwargs']['tools']
    return {
        'messages': messages,
        'tools': tools,
    }


parser = argparse.ArgumentParser()
parser.add_argument('jsonl_path', type=str)
args = parser.parse_args()

output_dir = os.path.dirname(args.jsonl_path)
output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz')

# First pass: check whether any instance's completions differ from what is
# already stored, so an unchanged file is not rewritten.
needs_update = False
with open(args.jsonl_path, 'r') as f_in:
    for line in tqdm(f_in, desc='Checking for changes'):
        data = json.loads(line)
        new_completions = load_completions(output_dir, data['instance_id'])
        current_completions = data.get('raw_completions')
        if current_completions != new_completions:
            needs_update = True
            break

if not needs_update:
    print('No updates required. Skipping file update.')
    exit(0)

if os.path.exists(output_path):
    print(f'Output file already exists at {output_path}, overwriting? (y/n)')
    if input() != 'y':
        print('Exiting...')
        exit(0)

# Second pass: attach the latest completions to every record and write the result
# as a gzip-compressed JSONL file next to the input.
with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out:
    for line in tqdm(f_in):
        data = json.loads(line)
        data['raw_completions'] = load_completions(output_dir, data['instance_id'])
        f_out.write(json.dumps(data) + '\n')

print(f'Saved compressed output to {output_path}')