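"""Summarise OpenHands evaluation results and classify failed cases with an LLM.

Reads a JSONL results file, splits instances into passed and failed, asks an
LLM to assign each failure one of five error categories (E1-E5), and appends
the labelled failures to a sibling `*_failed.jsonl` file so an interrupted
run can be resumed.

Example invocation (script and config names are illustrative):

    python summarise_results.py --json_file_path output.jsonl --llm-config eval_gpt4
"""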
import json
import os
import pprint

import tqdm

from openhands.core.config import get_llm_config_arg, get_parser, load_openhands_config
from openhands.core.logger import openhands_logger as logger
from openhands.llm.llm import LLM

config = load_openhands_config()
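
# Each line of the input JSONL is assumed to carry at least these fields
# (inferred from the accesses below): instance_id, repo, instruction,
# metadata, history, eval_script, eval_exit_code, eval_output, and
# metrics.success / metrics.accumulated_cost.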
def extract_test_results(res_file_path: str) -> tuple[list[dict], list[dict], list[float]]:
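    """Split a JSONL results file into passed and failed instances.

    Skips duplicate `instance_id`s, rewrites the input file sorted by
    `instance_id`, and returns `(passed, failed, costs)`, where `costs`
    collects the accumulated cost of every kept instance.
    """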
    passed = []
    failed = []
    costs = []
    instance_ids = set()
    instances = []
    with open(res_file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            success = data['metrics']['success']
            if data['instance_id'] in instance_ids:
                print(f'WARNING: Duplicate instance_id found: {data["instance_id"]}')
                continue
            instance_ids.add(data['instance_id'])
            instances.append(data)
            if success:
                passed.append(
                    {
                        'instance_id': data['instance_id'],
                        'repo': data['repo'],
                        'instruction': data['instruction'],
                        'eval_script': data['eval_script'],
                        'eval_exit_code': data['eval_exit_code'],
                        'eval_output': data['eval_output'],
                        'accumulated_cost': data['metrics']['accumulated_cost'],
                    }
                )
            else:
                failed.append(
                    {
                        'instance_id': data['instance_id'],
                        'repo': data['repo'],
                        'instruction': data['instruction'],
                        'metadata': data['metadata'],
                        'history': data['history'],
                        'eval_script': data['eval_script'],
                        'eval_exit_code': data['eval_exit_code'],
                        'eval_output': data['eval_output'],
                        'accumulated_cost': data['metrics']['accumulated_cost'],
                    }
                )
            costs.append(data['metrics']['accumulated_cost'])

    # sort by instance_id
    instances.sort(key=lambda x: x['instance_id'])
    with open(res_file_path, 'w') as file:
        for instance in instances:
            file.write(json.dumps(instance) + '\n')
    return passed, failed, costs


def classify_error(llm: LLM, failed_case: dict) -> str:
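    """Ask the LLM for one of the E1-E5 labels for a failed case.

    Falls back to manual (interactive) labelling if the LLM call fails,
    and raises `ValueError` if the final label is not a valid category.
    """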
    prompt = f"""
Please classify the error for the following failed case based on the history and eval_output:

Instruction:
{failed_case['instruction']}

Eval Script:
{failed_case['eval_script']}

History:
{failed_case['history']}

Eval Output:
{failed_case['eval_output']}

The error categories are:
E1: Hallucination Errors - The model misinterpreted the user's intention, misplaced Python code and bash script, or generated random or irrelevant code.
E2: Lack of Knowledge or Information - The model lacks sufficient information or domain-specific knowledge to satisfy the user's requirements.
E3: Knowledge Manipulation - The model failed to integrate or manipulate information properly.
E4: Syntax Errors - The model generated code with syntax errors.
E5: Operational Error - The model gave up easily or exited without finishing the tasks.

Please provide only the error category (E1, E2, E3, E4, or E5) without any explanation.
"""
    try:
        response = llm.completion(messages=[{'content': prompt, 'role': 'user'}])
        # Strip whitespace so labels like 'E1\n' pass the validation below.
        error_category = response.choices[0].message['content'].strip()
    except Exception as e:
        logger.error(
            f'Failed to classify the error for the failed case: {failed_case["instance_id"]}'
        )
        logger.error(e)
        # Fall back to manual labelling when the LLM call fails.
        error_category = input(
            failed_case['instruction']
            + ': '
            + failed_case['eval_script']
            + ' - '
            + failed_case['eval_output']
        )
    if error_category not in ['E1', 'E2', 'E3', 'E4', 'E5']:
        raise ValueError(f'Invalid error category: {error_category}')
    return error_category


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--json_file_path',
        type=str,
        required=True,
        help='Path to the JSONL file containing the evaluation results',
    )
    args, _ = parser.parse_known_args()
    # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/swe_bench/README.md#configure-openhands-and-your-llm
    # for details of how to set `llm_config`
    specified_llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else None
    if specified_llm_config is None:
        raise ValueError(f'Could not load a valid LLM config for: {args.llm_config}')
    # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
    specified_llm_config.modify_params = False
    config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')
    llm = LLM(llm_config=specified_llm_config)
    passed, new_failed, costs = extract_test_results(args.json_file_path)

    failed_path = args.json_file_path.replace('.jsonl', '_failed.jsonl')
    failed = []
    if os.path.exists(failed_path):
        with open(failed_path, 'r') as file:
            for line in file:
                failed.append(json.loads(line.strip()))
        print(f'Loaded {len(failed)} failed cases from {failed_path}')

    # Skip cases that were already classified in a previous run.
    classified_ids = {case['instance_id'] for case in failed}
    for failed_case in tqdm.tqdm(new_failed):
        if failed_case['instance_id'] in classified_ids:
            continue
        error_category = classify_error(llm, failed_case)
        failed_case['error_category'] = error_category
        failed.append(failed_case)
        with open(failed_path, 'a') as file:
            file.write(json.dumps(failed_case) + '\n')
    # Print the summary
    print('Summary:')
    print(f'Passed: {len(passed)}')
    print(f'Failed: {len(failed)}')
    print(f'Costs: {costs}')
    print('Failed cases:')
    error_categories = {}
    for case in failed:
        error_category = case['error_category']
        error_categories[error_category] = error_categories.get(error_category, 0) + 1
    pprint.pprint(error_categories)