"""Summarise per-dataset evaluation scores read from a log on stdin.

Only lines containing the text 'final display' are used:

* a line that also contains '-baseline-' or '-ret-' starts a new system;
  its text (with every '-' and the 'final display' marker removed) becomes
  the row label;
* any other 'final display' line is split on single spaces; field 2 is the
  dataset name and the fields from 3 onwards are its score(s).

One header line is printed first, then one space-separated row per system
that has at least one score.
"""
import sys

import numpy as np

# Not referenced by any code in this file; kept so the module-level name
# still exists for anything that might import it.
DATASETS1 = [
    "qmsum",
    "qasper",
    "quality",
    'musique',
    'hotpotqa',
    'multifieldqa_en'
]

# Column order of the per-dataset scores in every printed row.
DATASETS = [
    "qmsum",
    "qasper",
    "quality",
    'musique',
    'hotpotqa',
    'multifieldqa_en',
]


def average(data2res):
    """Return the arithmetic mean of the values stored in *data2res*.

    Args:
        data2res: mapping of dataset name to numeric score.

    Raises:
        AssertionError: if *data2res* is empty.
    """
    assert len(data2res) > 0, "cannot average an empty result mapping"
    # Plain left-to-right accumulation, so the result matches the original
    # implementation exactly.
    total = 0.0
    for value in data2res.values():
        total += value
    return total / len(data2res)


def collect(value_list, outrow, data2res):
    """Print one summary row for a system.

    The row is *outrow* followed by: the mean of *value_list* (every score
    parsed for the system), the mean of the values in *data2res* (one score
    per dataset name), and then the score of each entry of ``DATASETS`` in
    order, with "NA" for datasets missing from *data2res*.  Both means are
    rounded to 4 decimals.  Leading/trailing whitespace is stripped before
    printing.

    Args:
        value_list: list of float scores.
        outrow: row prefix (the system label, normally ending in a space).
        data2res: mapping of dataset name to float score.

    Raises:
        AssertionError: if *data2res* is empty (raised by ``average``).
    """
    avg = round(np.mean(value_list), 4)
    avg2 = round(average(data2res), 4)
    per_dataset = [str(data2res.get(name, "NA")) for name in DATASETS]
    row = outrow + ' '.join([str(avg), str(avg2)] + per_dataset)
    print(row.strip())


def main(stream=None):
    """Read log lines from *stream* and print the summary table.

    Args:
        stream: iterable of text lines; defaults to ``sys.stdin``.

    A system is only printed if it has a label and at least one parsed
    score, so empty input (or a trailing system header without scores)
    produces no row instead of crashing.
    """
    if stream is None:
        stream = sys.stdin

    print('system avg6 avg6 ' + ' '.join(DATASETS))

    outrow = ''
    data2res = {}
    value_list = []

    for aline in stream:
        aline = aline.strip()
        if 'final display' not in aline:
            continue

        if '-baseline-' in aline or '-ret-' in aline:
            # A new system starts: flush the previous one, if it has data.
            if outrow and data2res:
                collect(value_list, outrow, data2res)
            data2res = {}
            value_list = []
            label = aline.replace('-', '').replace('final display', '')
            outrow = label + ' '
            continue

        cols = aline.split(' ')
        adata = cols[2]
        scores = cols[3:]
        if len(scores) == 3:
            # Three scores (R1/R2/RL) are collapsed into one geometric mean.
            r1, r2, rl = [float(item) for item in scores]
            geo_mean = (r1 * r2 * rl) ** (1.0 / 3.0)
            ares = str(round(geo_mean, 4))
        else:
            ares = '/'.join(scores)
        # A later line for the same dataset overwrites the dict entry, but
        # every score is still appended to value_list.
        data2res[adata] = float(ares)
        value_list.append(float(ares))

    # Flush the last system with the same guard as inside the loop.
    if outrow and data2res:
        collect(value_list, outrow, data2res)


if __name__ == "__main__":
    main()