File size: 4,412 Bytes
246d201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
import argparse
import os

import pandas as pd
from termcolor import colored

# Command line: two positional JSONL paths plus an optional flag that turns on
# printing of visualization paths.
parser = argparse.ArgumentParser(
    description='Compare two swe_bench output JSONL files and print the resolved diff'
)
for _positional in ('input_file_1', 'input_file_2'):
    parser.add_argument(_positional, type=str)
parser.add_argument(
    '--show-paths',
    help='Show visualization paths for failed instances',
    action='store_true',
)
args = parser.parse_args()


def _load_output_jsonl(path):
    """Read one line-delimited JSON output file into a DataFrame."""
    return pd.read_json(path, orient='records', lines=True)


df1 = _load_output_jsonl(args.input_file_1)
df2 = _load_output_jsonl(args.input_file_2)


# Keep only instances present in both files. Columns that exist on both sides
# other than the join key get pandas' default `_x` (file 1) / `_y` (file 2)
# suffixes.
df = df1.merge(df2, on='instance_id', how='inner')


def _get_resolved(report):
    if report is None:
        return False
    if isinstance(report, float):
        return False
    else:
        return report.get('resolved', False)


def _resolution_differs(row):
    """Tell whether the two runs disagree on the resolved flag of one merged row."""
    return row['resolved_x'] != row['resolved_y']


# Derive one resolved flag per side (x = first file, y = second file) from the
# corresponding report column.
for _side in ('x', 'y'):
    df['resolved_' + _side] = df['report_' + _side].apply(_get_resolved)
df['diff'] = df.apply(_resolution_differs, axis=1)

# Rows where the two runs disagree, ordered by both flags descending.
_disagreeing = df[df['diff']]
df_diff = _disagreeing.sort_values(
    by=['resolved_x', 'resolved_y'], ascending=[False, False]
)
# Drop rows where either flag is null; per the original note this means one of
# the evals is not finished yet.
_x_known = df_diff['resolved_x'].notna()
_y_known = df_diff['resolved_y'].notna()
df_diff = df_diff[_x_known & _y_known]

print(f'X={args.input_file_1}')
print(f'Y={args.input_file_2}')
print(f'# diff={df_diff.shape[0]}')
# Only these columns are needed for the reports that follow.
_kept_columns = ['instance_id', 'resolved_x', 'resolved_y', 'report_x', 'report_y']
df_diff = df_diff[_kept_columns]

# Split the disagreeing rows into the two directions and print each group,
# ordered by instance id.
_x_flag = df_diff['resolved_x']
_y_flag = df_diff['resolved_y']
_report_columns = ['instance_id', 'report_x', 'report_y']

# Direction 1: resolved in X, not resolved in Y.
print('-' * 100)
_x_only_mask = _x_flag & ~_y_flag
df_diff_x_only = df_diff[_x_only_mask].sort_values(by='instance_id')
print(f'# x resolved but y not={df_diff_x_only.shape[0]}')
print(df_diff_x_only[_report_columns])

# Direction 2: resolved in Y, not resolved in X.
print('-' * 100)
_y_only_mask = ~_x_flag & _y_flag
df_diff_y_only = df_diff[_y_only_mask].sort_values(by='instance_id')
print(f'# y resolved but x not={df_diff_y_only.shape[0]}')
print(df_diff_y_only[_report_columns])
# Group the x-only and y-only instance ids by repo, then rank the repos by how
# lopsided the two groups are.


def _group_by_repo(instance_ids):
    """Bucket instance ids by repo key, i.e. the text before the first '__'."""
    grouped = {}
    for instance_id in instance_ids:
        grouped.setdefault(instance_id.split('__')[0], []).append(instance_id)
    return grouped


x_only_by_repo = _group_by_repo(df_diff_x_only['instance_id'].tolist())
y_only_by_repo = _group_by_repo(df_diff_y_only['instance_id'].tolist())

print('-' * 100)
print(
    colored('Repository comparison (x resolved vs y resolved):', 'cyan', attrs=['bold'])
)
all_repos = sorted(set(x_only_by_repo) | set(y_only_by_repo))

# For each repo, the magnitude of (x-only count minus y-only count).
repo_diffs = [
    (
        repo,
        abs(len(x_only_by_repo.get(repo, [])) - len(y_only_by_repo.get(repo, []))),
    )
    for repo in all_repos
]

# Largest magnitude first; ties broken alphabetically by repo name.
repo_diffs.sort(key=lambda entry: (-entry[1], entry[0]))

# Significance cutoff: 1.5x the mean magnitude, but never lower than 3.
if repo_diffs:
    _scaled_mean = sum(magnitude for _, magnitude in repo_diffs) / len(repo_diffs) * 1.5
else:
    _scaled_mean = 0
threshold = max(3, _scaled_mean)

# Visualization files are looked up in an `output.viz` folder that sits next to
# the first input file.
_x_run_dir = os.path.dirname(args.input_file_1)
x_input_file_folder = os.path.join(_x_run_dir, 'output.viz')

# One report section per repo, in the ranked order computed above.
for repo, diff in repo_diffs:
    x_instances = x_only_by_repo.get(repo, [])
    y_instances = y_only_by_repo.get(repo, [])

    # Repos at or above the threshold are highlighted in red, the rest yellow.
    is_significant = diff >= threshold
    if is_significant:
        repo_color = 'red'
    else:
        repo_color = 'yellow'

    repo_heading = colored(repo, repo_color, attrs=['bold'])
    print(f'\n{repo_heading}:')
    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))

    print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
    if x_instances:
        print('  ' + str(x_instances))

    print(colored(f'Y resolved but X failed: ({len(y_instances)} instances)', 'red'))
    if not y_instances:
        continue
    print('  ' + str(y_instances))

    # Paths are listed only on request, and only for instances X failed on.
    if not args.show_paths:
        continue
    print(colored('    Visualization path for X failed:', 'cyan', attrs=['bold']))
    for instance_id in y_instances:
        instance_file = os.path.join(x_input_file_folder, f'false.{instance_id}.md')
        print(f'    {instance_file}')