# Provenance: last updated by dtcxzyw in commit 913f726 ("Update", unverified).
import glob
import json
import os
from dataclasses import dataclass
from src.display.formatting import make_hyperlink
from src.display.utils import AutoEvalColumn
@dataclass
class EvalResult:
    """Represents one full evaluation run, aggregated from a single result JSON file.

    Instances are normally created with :meth:`init_from_json_file`, which
    folds the per-bug entries of the file's ``fixes`` list into the counters
    below, and rendered for the leaderboard with :meth:`to_dict`.
    """

    method_name: str
    method_url: str
    model_name: str
    model_url: str
    with_hint: bool
    attempts: int  # number of entries in the "fixes" list
    fast_pass_count: int  # entries with a truthy "fast_check_pass"
    full_pass_count: int  # entries with a truthy "full_check_pass"
    full_pass_count_crash: int  # full passes whose "bug_type" is "crash"
    full_pass_count_hang: int  # full passes whose "bug_type" is "hang"
    full_pass_count_miscompilation: int  # full passes whose "bug_type" is "miscompilation"
    build_count: int
    build_failure_count: int
    mttr: float  # mean "wall_time" per full pass divided by 60 (presumably minutes; confirm unit)
    sample_count: float  # mean fast+full check count per full pass
    fixed_bug_ids: list[str]  # "bug_id" of every full pass
    fixed_bug_ids_fast: list[str]  # "bug_id" of every fast pass

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an EvalResult from one model result file.

        Args:
            json_filepath: Path to a JSON file holding the run metadata
                (``method_name``, ``method_url``, ``base_model_name``,
                ``base_model_url``, ``with_hint``) and a ``fixes`` list with
                one entry per attempted bug. Missing keys fall back to
                empty/zero defaults.

        Returns:
            A new instance with all counters aggregated over ``fixes``. The
            averages ``mttr`` and ``sample_count`` are 0 when no fix fully
            passed.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        fixes = data.get("fixes", [])

        fast_pass_count = 0
        full_pass_count = 0
        full_pass_count_cat = {}
        build_count = 0
        build_failure_count = 0
        ttr_sum = 0
        sample_count = 0
        fixed_bug_ids = []
        fixed_bug_ids_fast = []

        for fix in fixes:
            bug_type = fix.get("bug_type", "")
            bug_id = fix.get("bug_id", "")

            if fix.get("fast_check_pass", False):
                fast_pass_count += 1
                fixed_bug_ids_fast.append(bug_id)

            if fix.get("full_check_pass", False):
                full_pass_count += 1
                full_pass_count_cat[bug_type] = full_pass_count_cat.get(bug_type, 0) + 1
                ttr_sum += fix.get("wall_time", 0)
                fixed_bug_ids.append(bug_id)
                # NOTE(review): samples are tallied for fully passing fixes only,
                # matching the per-full-pass average computed below -- confirm.
                sample_count += fix.get("fast_check_count", 0) + fix.get("full_check_count", 0)

            # Build statistics cover every attempt, successful or not.
            build_count += fix.get("build_count", 0)
            build_failure_count += fix.get("build_failure_count", 0)

        # Averages are only defined when at least one fix fully passed.
        if full_pass_count > 0:
            mttr = round(ttr_sum / full_pass_count / 60, 1)
            mean_sample_count = round(sample_count / full_pass_count, 1)
        else:
            mttr = 0
            mean_sample_count = 0

        return cls(
            method_name=data.get("method_name", ""),
            method_url=data.get("method_url", ""),
            model_name=data.get("base_model_name", ""),
            model_url=data.get("base_model_url", ""),
            with_hint=data.get("with_hint", False),
            attempts=len(fixes),
            fast_pass_count=fast_pass_count,
            full_pass_count=full_pass_count,
            full_pass_count_crash=full_pass_count_cat.get("crash", 0),
            full_pass_count_hang=full_pass_count_cat.get("hang", 0),
            full_pass_count_miscompilation=full_pass_count_cat.get("miscompilation", 0),
            build_count=build_count,
            build_failure_count=build_failure_count,
            mttr=mttr,
            fixed_bug_ids=fixed_bug_ids,
            fixed_bug_ids_fast=fixed_bug_ids_fast,
            sample_count=mean_sample_count,
        )

    @staticmethod
    def _percentage(part, whole):
        """Return ``part / whole`` as a percentage rounded to one decimal.

        Returns 0.0 when ``whole`` is zero so that runs without any builds
        (or an empty issue set) do not raise ``ZeroDivisionError``.
        """
        if not whole:
            return 0.0
        return round(part * 100.0 / whole, 1)

    def to_dict(self, total_issues):
        """Convert the result to a dict compatible with the dataframe display.

        Args:
            total_issues: Number of issues the score is measured against;
                the score is ``full_pass_count`` as a percentage of it.

        Returns:
            A dict keyed by the ``AutoEvalColumn`` display names, plus the raw
            ``fixed_bug_ids``, ``fixed_bug_ids_fast`` and ``method_id`` entries.
        """
        successful_builds = self.build_count - self.build_failure_count
        return {
            AutoEvalColumn.method_name.name: make_hyperlink(self.method_url, self.method_name),
            AutoEvalColumn.model_name.name: make_hyperlink(self.model_url, self.model_name),
            AutoEvalColumn.with_hint.name: "w/ hint" if self.with_hint else "w/o hint",
            AutoEvalColumn.score.name: self._percentage(self.full_pass_count, total_issues),
            AutoEvalColumn.attempts.name: self.attempts,
            AutoEvalColumn.fast_pass_count.name: self.fast_pass_count,
            AutoEvalColumn.full_pass_count.name: self.full_pass_count,
            AutoEvalColumn.full_pass_count_crash.name: self.full_pass_count_crash,
            AutoEvalColumn.full_pass_count_hang.name: self.full_pass_count_hang,
            AutoEvalColumn.full_pass_count_miscompilation.name: self.full_pass_count_miscompilation,
            AutoEvalColumn.build_success_rate.name: self._percentage(successful_builds, self.build_count),
            AutoEvalColumn.mttr.name: self.mttr,
            "fixed_bug_ids": self.fixed_bug_ids,
            "fixed_bug_ids_fast": self.fixed_bug_ids_fast,
            "method_id": self.method_name + "(" + self.model_name + ")",
            AutoEvalColumn.sample_count.name: self.sample_count,
        }
def get_raw_eval_results(requests_path: str) -> list[EvalResult]:
    """Load an EvalResult for every ``.json`` file found anywhere under ``requests_path``.

    The directory tree is walked recursively with ``os.walk``; files are
    parsed in the order the walk yields them, and files with any other
    extension are ignored.
    """
    # Lazy generator: each file is parsed as soon as the walk reaches it.
    json_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(requests_path)
        for filename in filenames
        if filename.endswith(".json")
    )
    return [EvalResult.init_from_json_file(json_path) for json_path in json_paths]