Spaces:
Runtime error
Runtime error
File size: 1,867 Bytes
d6585f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import json
from tqdm import tqdm
# Second column of every qrels line written below; kept only because the
# TREC format still carries it (TREC format legacy).
QUERY_TYPE = "Q0"

# Relevance label written as the last column of every qrels line.
RELEVANCE_SCORE = 1

# A passage id is kept only when it contains one of these markers;
# WAPO passages are deliberately left out (disable WAPO).
PASSAGE_ID_VALID_PREFIX = [
    "MARCO",
    "KILT",
]
### read: cast 22 eval json
eval_json_path = "/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"

# Load the whole evaluation file into memory. The code below iterates `data`
# as a sequence of conversations, each with a 'number' and a list of 'turn's.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the
# result does not depend on the machine's locale default.
with open(eval_json_path, 'r', encoding='utf-8') as fr:
    data = json.load(fr)
# write: qrels.txt, one relevance judgment per line in TREC qrels layout:
#   {qid} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}
# where qid is "{conversation number}_{turn number}".
eval_qrels_path = "/root/Corpus/CAsT22_eval_queries/cqr_qrels.txt"

# (qid, passage_id) pairs already written. The evaluation file can list the
# same pair more than once; a set keeps the duplicate check O(1) per passage.
seen_qid_pid_pairs = set()

with open(eval_qrels_path, 'w', encoding='utf-8') as fw:
    for sample in tqdm(data):
        conv_id = sample['number']
        for turn in sample['turn']:
            turn_id = turn['number']
            q_id = f"{conv_id}_{turn_id}"

            # Turns without provenance contribute no judgments; report them.
            if "provenance" not in turn:
                print('no provenance for turn')
                print(turn)
                continue

            for passage_id in turn["provenance"]:
                # NOTE: this is a substring test, not a strict prefix test.
                # It runs before whitespace is removed, so ids with stray
                # spaces are still recognised.
                if not any(valid_prefix in passage_id
                           for valid_prefix in PASSAGE_ID_VALID_PREFIX):
                    print(f"exclude passage id: {passage_id}")
                    continue

                # Normalise ids that contain spaces before de-duplicating,
                # so "A B" and "AB" count as the same passage.
                if ' ' in passage_id:
                    print(f"delete whitespace in passage_id: {passage_id}")
                    passage_id = passage_id.replace(' ', '')

                qid_pid_pair = (q_id, passage_id)
                if qid_pid_pair in seen_qid_pid_pairs:
                    print(f"skip appending duplicate qid&pid pair: qid = {q_id}, p_id = {passage_id}")
                    continue

                seen_qid_pid_pairs.add(qid_pid_pair)
                fw.write(f"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}\n")
|