In [1]:
import os

import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# 1. Load raw data and convert to training data

In [None]:
import gzip
import json

from tqdm import tqdm

FILE_PATHS = [
    'YOURPATH-no-hint-train-t05-run_1/output.with_completions.jsonl.gz',
    'YOURPATH-no-hint-train-t05-run_2/output.with_completions.jsonl.gz',
]

# Columns extracted from every JSON line. Declared once so the DataFrame has
# the expected schema even when no record is read.
RECORD_COLUMNS = ['resolved', 'messages', 'git_patch', 'tools']

# More memory efficient for large files: keep one small dict per line and
# build the DataFrame a single time after all files were read.
data = []

# Read every file line by line
for FILE_PATH in FILE_PATHS:
    # Binary mode: json.loads accepts the raw bytes of each line directly.
    with gzip.open(FILE_PATH, 'rb') as f:
        for line in tqdm(f, desc=f'Processing {FILE_PATH.split("/")[-1]}'):
            # Parse only the fields we need
            raw_data = json.loads(line)
            # `raw_completions` may be None; in that case both `messages`
            # and `tools` are recorded as None.
            raw_completions = raw_data['raw_completions']
            has_completions = raw_completions is not None
            data.append(
                {
                    'resolved': raw_data['report']['resolved'],
                    'messages': raw_completions['messages']
                    if has_completions
                    else None,
                    'git_patch': raw_data['test_result'].get('git_patch', ''),
                    # A missing 'tools' key is recorded as None as well.
                    'tools': raw_completions.get('tools')
                    if has_completions
                    else None,
                }
            )

# Convert to DataFrame after collecting all data
df = pd.DataFrame(data, columns=RECORD_COLUMNS)
print(f'#total amount of data={len(df)}')
# Drop rows without messages. `.copy()` makes the result an independent frame
# so that later cells can add columns without a SettingWithCopyWarning.
df = df[~df['messages'].isna()].copy()
print(f'#total amount of data after removing nan={len(df)}')

## Filter

In [None]:
def _contains_multiple_tool_calls(messages: list[dict]) -> bool:
 return any(
 message.get('tool_calls') and len(message['tool_calls']) > 1
 for message in messages
 )


# Flag every trajectory in which some message issues more than one tool call,
# then show the sum of the `resolved` flags on each side of that flag.
multi_call_flags = df['messages'].apply(_contains_multiple_tool_calls)
df['contains_multiple_tool_calls'] = multi_call_flags
resolved_per_group = df.groupby(['contains_multiple_tool_calls'])['resolved'].sum()
display(resolved_per_group)

In [None]:
import copy

# Convert function calling messages to non-function calling messages
from openhands.llm.fn_call_converter import (
 FunctionCallConversionError,
 convert_fncall_messages_to_non_fncall_messages,
 convert_from_multiple_tool_calls_to_single_tool_call_messages,
)

from typing import Optional

# Number of trajectories for which the conversion raised
# FunctionCallConversionError. Reset whenever this cell is (re-)run.
total_failed = 0


def _convert_messages(messages: list[dict], tools: list[dict]) -> Optional[list[dict]]:
    """Convert function-calling messages to non-function-calling messages.

    The input is deep-copied first, so the caller's messages are never
    modified. Any message whose ``content`` is missing or ``None`` gets an
    empty string before the copy is handed to
    ``convert_fncall_messages_to_non_fncall_messages``.

    Args:
        messages: Conversation messages (one dict per message).
        tools: Tool definitions forwarded unchanged to the converter.

    Returns:
        The converter's result, or ``None`` when it raises
        ``FunctionCallConversionError``. Each such failure increments the
        module-level ``total_failed`` counter.
    """
    global total_failed
    message_copy = copy.deepcopy(messages)
    for message in message_copy:
        # `.get` also covers messages that have no 'content' key at all.
        if message.get('content') is None:
            message['content'] = ''
    try:
        return convert_fncall_messages_to_non_fncall_messages(
            message_copy, tools, add_in_context_learning_example=False
        )
    except FunctionCallConversionError:
        # Best effort: count the failure and let the caller drop the row.
        total_failed += 1
        return None


# Step 1: run every raw conversation through
# convert_from_multiple_tool_calls_to_single_tool_call_messages. Only the
# 'messages' column is needed, so a column-wise apply is used.
df['converted_messages'] = df['messages'].apply(
    lambda messages: convert_from_multiple_tool_calls_to_single_tool_call_messages(
        messages, ignore_final_tool_result=True
    )
)
# Step 2: convert to non-function-calling messages. `_convert_messages`
# returns None for conversations that could not be converted. Building the
# Series explicitly (object dtype, same index) also works on an empty frame.
df['nonfncall_messages'] = pd.Series(
    [
        _convert_messages(messages, tools)
        for messages, tools in zip(df['converted_messages'], df['tools'])
    ],
    index=df.index,
    dtype=object,
)
print('total nan', df['nonfncall_messages'].isna().sum())
# Drop failed conversions. `.copy()` makes the result an independent frame so
# later cells can add columns without a SettingWithCopyWarning.
df = df[~df['nonfncall_messages'].isna()].copy()
print(f'Total failed: {total_failed}')

## Tokenization

In [None]:
from pandarallel import pandarallel
from transformers import AutoTokenizer

# Set before the pandarallel workers are started, so they inherit the setting.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
pandarallel.initialize(progress_bar=True, verbose=1, nb_workers=16)
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-7B-Instruct')
# Count the tokens of the conversations that are exported at the end of the
# notebook ('nonfncall_messages'). The previously referenced 'rm_conv' column
# is never created in this notebook and raised a KeyError on a fresh run.
# NOTE(review): assumes each conversation is accepted by the tokenizer's chat
# template as-is — confirm on a sample.
df['n_tokens'] = df['nonfncall_messages'].parallel_apply(
    lambda x: len(tokenizer.apply_chat_template(x))
)

In [None]:
# Keep only the conversations whose token count is strictly below
# 131072 (= 128 * 1024) and report the row count before and after.
print(f'BEFORE: #total={len(df)}')
within_context = df['n_tokens'] < 131072
df_selected = df[within_context]
print(f'AFTER(truncated to 128k): #total={len(df_selected)}')

In [None]:
# Summary statistics of the token counts of the retained conversations.
selected_n_tokens = df_selected['n_tokens']
selected_n_tokens.describe()

In [None]:
# ecdf of n_tokens
import matplotlib.pyplot as plt
import seaborn as sns

def _show_n_tokens_by_resolved(frame: pd.DataFrame, title: str) -> None:
    """Display per-`resolved` statistics of `n_tokens` and plot their ECDF.

    Args:
        frame: DataFrame with the columns 'resolved' and 'n_tokens'.
        title: Title of the figure, so consecutive plots can be told apart.
    """
    display(frame.groupby(['resolved'])['n_tokens'].describe())
    # Explicit figure/axes: each call draws on its own, titled axes.
    fig, ax = plt.subplots()
    sns.ecdfplot(x='n_tokens', data=frame, hue='resolved', ax=ax)
    ax.set_title(title)
    plt.show()


_show_n_tokens_by_resolved(df, 'ECDF of n_tokens (all conversations)')

print(f'#total={len(df)}')
df_selected = df[df['n_tokens'] < 131072]
print(f'#selected={len(df_selected)}')
_show_n_tokens_by_resolved(df_selected, 'ECDF of n_tokens (n_tokens < 131072)')

In [None]:
# Token-count statistics restricted to the rows whose `resolved` flag is False.
df_selected.loc[~df_selected['resolved'], 'n_tokens'].describe()

In [None]:
# Number of retained conversations per value of the `resolved` flag.
resolved_flags = df_selected['resolved']
resolved_flags.value_counts()

In [None]:
# Token-count statistics of the retained conversations, split by `resolved`.
n_tokens_by_resolved = df_selected.groupby(['resolved'])['n_tokens']
n_tokens_by_resolved.describe()

# 2. Save Resolved Messages for SFT

In [None]:
# Export the converted messages of the resolved trajectories as JSON lines,
# one record per trajectory with the single key 'messages'.
OUTPUT_FOLDER = 'YOUR_OUTPUT_FOLDER'

resolved_messages = df_selected.loc[
    df_selected['resolved'], ['nonfncall_messages']
].rename(columns={'nonfncall_messages': 'messages'})
# The number of exported rows goes into the file name. Using len() avoids the
# KeyError that `value_counts()[True]` raises when nothing is resolved.
n_resolved = len(resolved_messages)

# `to_json` does not create missing directories.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
resolved_messages.to_json(
    os.path.join(
        OUTPUT_FOLDER,
        f'policy_traj_128k_swegym_{n_resolved}i.jsonl',
    ),
    lines=True,
    orient='records',
)