File size: 802 Bytes
ec2195e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#!/usr/bin/env python3
"""
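Preprocess a raw UMLS RRF/TSV export into a simple CSV for indexing.

The input must be a delimited text file with a header row. The CUI, STR,
DEF and SAB columns are renamed to cui, name, definition and source; rows
without a cui or name are dropped, and duplicate (cui, name) pairs are
removed before the result is written as CSV.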
"""
import argparse
import pandas as pd

def main():
    """Convert a delimited UMLS export into a de-duplicated concept CSV.

    Command-line arguments:
        --input   Path of the delimited source file. It is read with a
                  header row; the ``CUI`` and ``STR`` columns are required,
                  ``DEF`` and ``SAB`` are optional.
        --output  Path of the CSV file to write.
        --sep     Field separator of the input (default: tab).

    The ``CUI``/``STR``/``DEF``/``SAB`` columns are renamed to
    ``cui``/``name``/``definition``/``source``. Rows with an empty ``cui``
    or ``name`` are dropped, empty ``definition``/``source`` values become
    empty strings, and only the first row of each ``(cui, name)`` pair is
    kept. A one-line summary is printed once the file has been written.

    Raises:
        KeyError: if the input has no ``CUI`` or ``STR`` column.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--input', required=True)
    p.add_argument('--output', required=True)
    p.add_argument('--sep', default='\t')
    args = p.parse_args()

    # Treat only empty fields as missing. With pandas' default NA list,
    # literal values such as "NA", "NULL" or "nan" would be parsed as
    # missing even with dtype=str, and concepts with those names would be
    # silently dropped below.
    df = pd.read_csv(
        args.input,
        sep=args.sep,
        dtype=str,
        keep_default_na=False,
        na_values=[''],
    )
    df = df.rename(
        columns={
            'CUI': 'cui',
            'STR': 'name',
            'DEF': 'definition',
            'SAB': 'source',
        }
    )
    df = df.dropna(subset=['cui', 'name'])

    # The definition/source columns are optional in the input: fill the
    # gaps when present, otherwise create them empty so the output schema
    # is always the same.
    for column in ('definition', 'source'):
        if column in df.columns:
            df[column] = df[column].fillna('')
        else:
            df[column] = ''

    df = df.drop_duplicates(subset=['cui', 'name'])
    df.to_csv(args.output, index=False)
    print(f'Wrote {len(df)} concepts to {args.output}')

# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()