File size: 5,323 Bytes
51ff9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python3
import argparse
import os
import subprocess

import pandas as pd
from termcolor import colored

# Command-line interface. The two positional files are labelled X (first) and
# Y (second) in everything this script prints, so the help text says so too.
parser = argparse.ArgumentParser(
    description='Compare two swe_bench output JSONL files and print the resolved diff'
)
parser.add_argument(
    'input_file_1',
    type=str,
    help='First output JSONL file (reported as X)',
)
parser.add_argument(
    'input_file_2',
    type=str,
    help='Second output JSONL file (reported as Y)',
)
parser.add_argument(
    '--show-paths',
    action='store_true',
    help='Show visualization paths for failed instances',
)
parser.add_argument(
    '--only-x-instances',
    action='store_true',
    help='Only show instances that are ran by X',
)
# Parsed at import time: this module is a script, not a library.
args = parser.parse_args()

# Load both JSONL outputs (one JSON record per line) into data frames.
df1, df2 = (
    pd.read_json(jsonl_path, orient='records', lines=True)
    for jsonl_path in (args.input_file_1, args.input_file_2)
)

# Optionally restrict Y to the instances that also appear in X.
if args.only_x_instances:
    instance_ids_1 = set(df1['instance_id'].tolist())
    keep_mask = df2['instance_id'].isin(instance_ids_1)
    print(
        f'Before removing instances not in X={args.input_file_1}: Y={len(df2)} instances'
    )
    df2 = df2[keep_mask]
    print(
        f'After removing instances not in X={args.input_file_1}: Y={len(df2)} instances'
    )


# Add summarization step for each input file
def summarize_file(file_path):
    """Print a summary of one output file by running ``summarize_outputs.py``.

    The helper script is looked up in the directory that contains this file
    and is run as a child process with ``file_path`` as its only argument.
    Its output is framed by a header line and two 80-character ``=`` rules.

    Args:
        file_path: Path of the output file to summarize.

    Raises:
        subprocess.CalledProcessError: If the helper script exits with a
            non-zero status (``check=True``).
    """
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    summarize_script = os.path.join(script_dir, 'summarize_outputs.py')

    # Use the interpreter that is running this script instead of whatever
    # ``python`` happens to resolve to on PATH (it may be missing or belong to
    # another environment). Fall back to ``python`` if it cannot be determined.
    interpreter = sys.executable or 'python'

    print(f'\nSummary for {file_path}:')
    # Flush before handing stdout to the child so the header is not emitted
    # after the child's output when stdout is buffered (e.g. redirected).
    print('=' * 80, flush=True)
    subprocess.run([interpreter, summarize_script, file_path], check=True)
    print('=' * 80)


# Print a summary for each input file, X first and then Y.
for _input_path in (args.input_file_1, args.input_file_2):
    summarize_file(_input_path)


# Keep only the instances present in both files; columns that exist in both
# frames get the pandas default suffixes ``_x`` and ``_y``.
df = df1.merge(df2, on='instance_id', how='inner')


def _get_resolved(report):
    if report is None:
        return False
    if isinstance(report, float):
        return False
    else:
        return report.get('resolved', False)


# Resolution status of every shared instance in each run (X and Y).
df['resolved_x'] = df['report_x'].apply(_get_resolved)
df['resolved_y'] = df['report_y'].apply(_get_resolved)
# Compare the two columns directly instead of using a row-wise ``df.apply``.
# Besides being faster, this always yields a boolean Series; a row-wise apply
# on an empty frame (the two files share no instance_id) returns a DataFrame,
# which cannot be assigned to a single column.
# Rows where either value is null compare as "different" here; they are
# dropped by the ``notna`` filter below.
df['diff'] = df['resolved_x'] != df['resolved_y']

# Instances whose outcome differs, with the X-resolved ones first.
df_diff = df[df['diff']].sort_values(
    by=['resolved_x', 'resolved_y'], ascending=[False, False]
)
# skip if any of the resolved is nan, which means one of the eval is not finished yet
df_diff = df_diff[df_diff['resolved_x'].notna() & df_diff['resolved_y'].notna()]

print(f'X={args.input_file_1}')
print(f'Y={args.input_file_2}')
print(f'# diff={df_diff.shape[0]}')
# Only these columns are needed by the rest of the report.
df_diff = df_diff[['instance_id', 'resolved_x', 'resolved_y', 'report_x', 'report_y']]

# Split the differing instances by which run resolved them.
_x_ok = df_diff['resolved_x']
_y_ok = df_diff['resolved_y']
_shown_columns = ['instance_id', 'report_x', 'report_y']

# Resolved by X, not by Y.
print('-' * 100)
df_diff_x_only = df_diff[_x_ok & ~_y_ok].sort_values(by='instance_id')
print(f'# x resolved but y not={len(df_diff_x_only)}')
print(df_diff_x_only[_shown_columns])

# Resolved by Y, not by X.
print('-' * 100)
df_diff_y_only = df_diff[~_x_ok & _y_ok].sort_values(by='instance_id')
print(f'# y resolved but x not={len(df_diff_y_only)}')
print(df_diff_y_only[_shown_columns])
# get instance_id from df_diff_y_only

def _bucket_by_repo(instance_ids):
    """Group instance ids by the text before their first ``__``."""
    buckets = {}
    for one_id in instance_ids:
        repo_key = one_id.split('__')[0]
        buckets.setdefault(repo_key, []).append(one_id)
    return buckets


# Repository -> instances resolved by only one of the two runs.
x_only_by_repo = _bucket_by_repo(df_diff_x_only['instance_id'].tolist())
y_only_by_repo = _bucket_by_repo(df_diff_y_only['instance_id'].tolist())

print('-' * 100)
print(
    colored('Repository comparison (x resolved vs y resolved):', 'cyan', attrs=['bold'])
)
all_repos = sorted(set(x_only_by_repo) | set(y_only_by_repo))

# Per repository: (number resolved only by Y) - (number resolved only by X),
# ordered by that signed difference (largest first), ties broken by repo name.
repo_diffs = sorted(
    (
        (
            repo_name,
            len(y_only_by_repo.get(repo_name, []))
            - len(x_only_by_repo.get(repo_name, [])),
        )
        for repo_name in all_repos
    ),
    key=lambda entry: (-entry[1], entry[0]),
)

# A repo is flagged when its difference reaches 1.5x the mean difference,
# with a floor of 3.
if repo_diffs:
    mean_diff = sum(delta for _, delta in repo_diffs) / len(repo_diffs)
    threshold = max(3, mean_diff * 1.5)
else:
    threshold = max(3, 0)

# Folder next to the X input file that the visualization paths point into.
x_input_file_folder = os.path.join(os.path.dirname(args.input_file_1), 'output.viz')

for repo, diff in repo_diffs:
    x_instances = x_only_by_repo.get(repo, [])
    y_instances = y_only_by_repo.get(repo, [])

    # Red marks a repo whose difference reaches the threshold, yellow the rest.
    repo_color = 'red' if diff >= threshold else 'yellow'
    repo_heading = colored(repo, repo_color, attrs=['bold'])

    print(f'\n{repo_heading}:')
    print(
        colored(
            f'Difference: {diff} instances! (Larger diff = Y better)',
            repo_color,
            attrs=['bold'],
        )
    )

    print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
    if x_instances:
        print(f'  {x_instances}')

    print(colored(f'Y resolved but X failed: ({len(y_instances)} instances)', 'red'))
    if not y_instances:
        continue
    print(f'  {y_instances}')

    # Paths are listed only on request, and only for instances where X failed.
    if not args.show_paths:
        continue
    print(colored('    Visualization path for X failed:', 'cyan', attrs=['bold']))
    for failed_id in y_instances:
        viz_path = os.path.join(x_input_file_folder, f'false.{failed_id}.md')
        print(f'    {viz_path}')