File size: 2,830 Bytes
d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import glob
import argparse
import xml.etree.ElementTree as ET
import re
import json
from pathlib import Path
from tqdm import tqdm
from ftfy import fix_text

def convert_collection(args):
    """Convert every XML file under ``args.input_dir`` into one JSON-lines file.

    Each ``*.xml`` file found recursively under the input directory is parsed
    with ``parse_xml`` and written as a single JSON object on its own line to
    ``trec21.json`` inside ``args.output_dir``.

    Args:
        args: Object exposing ``input_dir`` and ``output_dir`` attributes
            (the parsed command-line namespace).
    """
    print('converting collection....')
    xml_list = list(Path(args.input_dir).rglob('*.xml'))
    output_path = os.path.join(args.output_dir, 'trec21.json')
    # The context manager guarantees the output file is flushed and closed
    # even when parsing one of the input files raises part-way through.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as output_json_file:
        for xml_path in tqdm(xml_list):
            (doc_id, title, condition, summary,
             detailed_description, eligibility) = parse_xml(xml_path)
            result_dict = {
                'id': doc_id,
                # 'contents' is the space-joined concatenation of all text fields.
                'contents': f'{title} {condition} {summary} {detailed_description} {eligibility}',
                'title': title,
                'condition': condition,
                'summary': summary,
                'detailed_description': detailed_description,
                'eligibility': eligibility,
            }
            output_json_file.write(json.dumps(result_dict) + '\n')


def parse_xml(file_dir):
    """Extract the indexed text fields from one trial XML file.

    Args:
        file_dir: Path of the XML file to parse.

    Returns:
        A list ``[doc_id, title, condition, summary, detailed_description,
        eligibility]`` of strings. ``title`` is taken from ``official_title``
        and falls back to ``brief_title`` when the former is missing or empty.
        Optional fields that are absent from the document become ``''``.
        In every field, runs of two or more whitespace characters are
        collapsed to a single space and the result is passed through
        ``fix_text``.

    Raises:
        ValueError: If the document contains no ``nct_id`` element.
    """
    tree = ET.parse(file_dir)

    def text_at(path):
        """Return the concatenated text under the first match of *path*, or ''."""
        node = tree.find(path)
        # Compare against None explicitly: an Element without child elements
        # is falsy, so a plain truth test would discard leaf elements such as
        # <official_title> or <condition> even though they carry text.
        if node is None:
            return ''
        return ''.join(node.itertext())

    id_node = tree.find('.//nct_id')
    if id_node is None:
        raise ValueError(f'no <nct_id> element found in {file_dir}')
    doc_id = ''.join(id_node.itertext())

    title = text_at('.//official_title') or text_at('.//brief_title')
    fields = [
        doc_id,
        title,
        text_at('.//condition'),
        text_at('.//brief_summary'),
        text_at('.//detailed_description'),
        text_at('.//eligibility/criteria'),
    ]

    # Collapse whitespace runs first, then let fix_text repair the text.
    return [fix_text(re.sub(r'\s\s+', ' ', field)) for field in fields]

if __name__ == '__main__':
    # Command-line entry point: read the input/output directories, make sure
    # the output directory exists, then run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", required=True, help='input directory to trec xml data files')
    parser.add_argument('--output_dir', required=True, help='output folder for json files')

    args = parser.parse_args()

    # exist_ok=True creates the directory when it is missing without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(args.output_dir, exist_ok=True)

    convert_collection(args)
    print('Done!')