# File size: 841 Bytes
# d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import argparse
import xml.etree.ElementTree as ET
import re
from ftfy import fix_text

def load_topic_trec(args):
    """Convert an XML topic file into a ``qid<TAB>query`` TSV file.

    Every direct child of the XML root is treated as one topic: its
    ``number`` attribute becomes the query id and its text becomes the
    query. The text is passed through ftfy's ``fix_text`` and every run of
    whitespace (newlines, tabs, repeated spaces) is collapsed to a single
    space so that each topic occupies exactly one two-column TSV line.

    Args:
        args: Object with two string attributes: ``topics`` (path of the
            XML file to read) and ``queries`` (path of the TSV file to
            write; overwritten if it exists).

    Raises:
        KeyError: If a child element has no ``number`` attribute.
        xml.etree.ElementTree.ParseError: If the topic file is not
            well-formed XML.
    """
    root = ET.parse(args.topics).getroot()
    # Compiled once; a raw string avoids the invalid "\s" escape warning.
    whitespace_run = re.compile(r'\s+')

    # Write UTF-8 explicitly so non-ASCII queries do not depend on the
    # platform's default locale encoding.
    with open(args.queries, 'w', encoding='utf-8') as fout:
        for child in root:
            qid = child.attrib['number']
            raw = child.text
            # Element.text is None for an element without text; emit an
            # empty query rather than a stringified None.
            topic = fix_text(raw) if raw else ''
            # Work on the real text (not its repr) so backslashes, quotes
            # and tabs are not turned into literal escape sequences, and an
            # embedded tab cannot break the TSV layout.
            topic = whitespace_run.sub(' ', topic)
            fout.write(f"{qid}\t{topic}\n")

if __name__ == '__main__':
    # Command-line entry point: both options are mandatory string paths.
    cli = argparse.ArgumentParser()
    option_help = (
        ("--topics", 'topic file'),
        ('--queries', 'convert to qid\\tquery tsv format'),
    )
    for flag, description in option_help:
        cli.add_argument(flag, required=True, type=str, help=description)

    # Parse the command line and run the conversion in one step.
    load_topic_trec(cli.parse_args())

    print('Done!')