Spaces:
Runtime error
Runtime error
File size: 2,830 Bytes
d6585f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
import glob
import argparse
import xml.etree.ElementTree as ET
import re
import json
from pathlib import Path
from tqdm import tqdm
from ftfy import fix_text
def convert_collection(args):
    """Convert every XML file under ``args.input_dir`` into one JSON-lines file.

    All ``*.xml`` files found recursively below the input directory are parsed
    with ``parse_xml`` and written, one JSON object per line, to
    ``trec21.json`` inside ``args.output_dir``.

    Args:
        args: Object exposing ``input_dir`` and ``output_dir`` attributes
            (the argparse namespace built by the script entry point).
    """
    print('converting collection....')
    xml_list = list(Path(args.input_dir).rglob('*.xml'))
    output_path = os.path.join(args.output_dir, 'trec21.json')
    # The context manager guarantees the output file is flushed and closed
    # even when parsing or serialising one of the documents raises.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as output_json_file:
        for xml_path in tqdm(xml_list):
            (
                doc_id,
                title,
                condition,
                summary,
                detailed_description,
                eligibility,
            ) = parse_xml(xml_path)
            result_dict = {
                'id': doc_id,
                # 'contents' is the space-joined concatenation of all text fields.
                'contents': f'{title} {condition} {summary} {detailed_description} {eligibility}',
                'title': title,
                'condition': condition,
                'summary': summary,
                'detailed_description': detailed_description,
                'eligibility': eligibility,
            }
            output_json_file.write(json.dumps(result_dict) + '\n')
# Matches runs of two or more whitespace characters (single ones are kept).
_MULTI_WHITESPACE = re.compile(r'\s\s+')


def _element_text(tree, path):
    """Return the concatenated text under the first match of *path*, or ''."""
    node = tree.find(path)
    # Compare against None explicitly: an Element without child elements is
    # falsy even when it carries text, so a plain truthiness test would
    # discard leaf elements.
    if node is None:
        return ''
    return ''.join(node.itertext())


def _clean(text):
    """Collapse whitespace runs to one space, then repair the text with ftfy."""
    return fix_text(_MULTI_WHITESPACE.sub(' ', text))


def parse_xml(file_dir):
    """Extract the indexed text fields from one XML document.

    Args:
        file_dir: Path (or file object) of the XML file, passed to
            ``ET.parse``.

    Returns:
        A list ``[doc_id, title, condition, summary, detailed_description,
        eligibility]`` of strings. Every field except ``doc_id`` is ``''``
        when its element is absent. ``title`` uses ``official_title`` and
        falls back to ``brief_title`` when the former is missing or empty.

    Raises:
        AttributeError: If the document has no ``nct_id`` element.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    xml = ET.parse(file_dir)

    # The id is mandatory: a missing nct_id still raises AttributeError here.
    doc_id = ''.join(xml.find('.//nct_id').itertext())

    title = (
        _element_text(xml, './/official_title')
        or _element_text(xml, './/brief_title')
    )
    condition = _element_text(xml, './/condition')
    summary = _element_text(xml, './/brief_summary')
    detailed_description = _element_text(xml, './/detailed_description')
    eligibility = _element_text(xml, './/eligibility/criteria')

    fields = [doc_id, title, condition, summary, detailed_description, eligibility]
    return [_clean(field) for field in fields]
if __name__ == '__main__':
    # Script entry point: read the input/output directories from the command
    # line, make sure the output directory exists, then run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", required=True, help='input directory to trec xml data files')
    parser.add_argument('--output_dir', required=True, help='output folder for json files')
    args = parser.parse_args()
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() before calling os.makedirs().
    os.makedirs(args.output_dir, exist_ok=True)
    convert_collection(args)
    print('Done!')
|