mgbam committed on
Commit
ec2195e
·
verified ·
1 Parent(s): cad86c8

Create utils/preprocess_umls_data.py

Browse files
Files changed (1) hide show
  1. utils/preprocess_umls_data.py +25 -0
utils/preprocess_umls_data.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess raw UMLS RRF/TSV into a simple CSV for indexing.
4
+ """
5
+ import argparse
6
+ import pandas as pd
7
+
8
def main(argv=None):
    """Convert a delimited UMLS export (with a header row) into a CSV.

    The input is read with every column as a string. The CUI, STR, DEF and
    SAB columns are renamed to cui, name, definition and source; rows with
    no cui or no name are dropped; missing definitions and sources become
    empty strings; rows are de-duplicated on (cui, name); and the result is
    written to the output path without the index. Any other input columns
    pass through unchanged.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        KeyError: If the input has neither a CUI/cui nor a STR/name column.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--input', required=True)
    p.add_argument('--output', required=True)
    p.add_argument('--sep', default='\t')
    args = p.parse_args(argv)

    df = pd.read_csv(args.input, sep=args.sep, dtype=str)
    df = df.rename(
        columns={'CUI': 'cui', 'STR': 'name', 'DEF': 'definition', 'SAB': 'source'}
    )

    # Fail with a readable message rather than an opaque pandas KeyError
    # when the identifying columns are absent.
    missing = [col for col in ('cui', 'name') if col not in df.columns]
    if missing:
        raise KeyError(
            f'input is missing required column(s): {", ".join(missing)}'
        )
    df = df.dropna(subset=['cui', 'name'])

    # DEF and SAB are optional: not every export carries them, so create
    # the column as empty strings instead of crashing on the lookup.
    for col in ('definition', 'source'):
        if col in df.columns:
            df[col] = df[col].fillna('')
        else:
            df[col] = ''

    df = df.drop_duplicates(subset=['cui', 'name'])
    df.to_csv(args.output, index=False)
    print(f'Wrote {len(df)} concepts to {args.output}')
23
+
24
# Run the preprocessing only when executed as a script, not on import.
if __name__ == '__main__':
    main()