|
import argparse
|
|
|
|
import pandas as pd
|
|
from datasets import load_dataset
|
|
|
|
# Command-line interface: one required positional output path, plus optional
# selectors for which dataset and which split to fetch.
parser = argparse.ArgumentParser()
parser.add_argument('output_filepath', type=str, help='Path to save the output file')
parser.add_argument(
    '--dataset_name',
    type=str,
    help='Name of the dataset to download',
    default='princeton-nlp/SWE-bench_Lite',
)
parser.add_argument('--split', type=str, help='Split to download', default='test')
args = parser.parse_args()

output_filepath = args.output_filepath

# Announce the work *before* the blocking load_dataset call, so the user sees
# what is being fetched while the fetch is in progress rather than after it
# has already finished.
print(
    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
)
dataset = load_dataset(args.dataset_name, split=args.split)

# Keep only the instance identifier and the row's 'patch' field, re-keyed as
# 'model_patch' in the output records.
patches = [
    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
]
print(f'{len(patches)} gold patches loaded')

# orient='records' together with lines=True writes one JSON object per line.
pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
print(f'Patches saved to {output_filepath}')
|
|
|