UMLS / utils /preprocess_umls_data.py
mgbam's picture
Update utils/preprocess_umls_data.py
36083ff verified
raw
history blame
1.51 kB
"""
Preprocess raw UMLS data into a simplified CSV for FAISS indexing.

Accepts a TSV/CSV file with the columns CUI, STR (string), DEF, and SAB (source).
Outputs a clean CSV with the headers: cui,name,definition,source.

Usage:
    python preprocess_umls_data.py --input raw_umls.tsv --output processed/concepts.csv

The input separator defaults to a tab; pass --sep ',' for comma-separated input.
"""
import argparse
from pathlib import Path

import pandas as pd
# Raw UMLS column names mapped onto the simplified output schema.
_COLUMN_RENAMES = {
    'CUI': 'cui',
    'STR': 'name',
    'DEF': 'definition',
    'SAB': 'source',
}
# A row is meaningless without these; the input must provide them.
_REQUIRED_COLUMNS = ('cui', 'name')
# These may be absent from the input or empty per row; they default to ''.
_OPTIONAL_COLUMNS = ('definition', 'source')
# Exact header order of the CSV that is written out.
_OUTPUT_COLUMNS = ['cui', 'name', 'definition', 'source']


def _decode_separator(sep):
    """Return *sep*, turning the two-character text backslash-t into a tab.

    A shell passes ``--sep '\\t'`` through as a backslash followed by ``t``.
    Handing that to pandas unchanged makes it a regex separator, which forces
    the slow Python engine and disables quote handling.
    """
    return '\t' if sep == r'\t' else sep


def _clean_concepts(df):
    """Normalise a raw UMLS frame to the cui/name/definition/source schema.

    Args:
        df: Frame read from the raw file, with string-typed columns.

    Returns:
        A new frame holding only the four output columns, without rows that
        lack a CUI or name and without duplicate (cui, name) pairs.

    Raises:
        KeyError: If the CUI/STR columns (or their already-renamed forms
            ``cui``/``name``) are not present.
    """
    concepts = df.rename(columns=_COLUMN_RENAMES)

    missing = [col for col in _REQUIRED_COLUMNS if col not in concepts.columns]
    if missing:
        raise KeyError(
            f"input is missing required column(s) {missing}; "
            f"found {list(df.columns)}"
        )

    # Optional columns: blank out missing values, or create the column when
    # the raw file does not carry it at all.
    for col in _OPTIONAL_COLUMNS:
        if col in concepts.columns:
            concepts[col] = concepts[col].fillna('')
        else:
            concepts[col] = ''

    concepts = concepts.dropna(subset=list(_REQUIRED_COLUMNS))
    concepts = concepts.drop_duplicates(subset=list(_REQUIRED_COLUMNS))

    # Emit exactly the documented schema, dropping any extra raw columns.
    return concepts[_OUTPUT_COLUMNS]


def main(argv=None):
    """Command-line entry point: read raw UMLS data and write the clean CSV.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default) the arguments are taken from ``sys.argv``.

    Raises:
        KeyError: If the input lacks the CUI or STR column.
    """
    parser = argparse.ArgumentParser(description="Preprocess UMLS raw data.")
    parser.add_argument('--input', required=True,
                        help='Raw UMLS file (TSV/CSV)')
    parser.add_argument('--output', required=True,
                        help='Output CSV for concepts')
    parser.add_argument('--sep', default='\t',
                        help='Separator for input file')
    args = parser.parse_args(argv)

    # Only truly empty fields count as missing. The pandas defaults would
    # also turn real concept strings such as "None", "NA" or "NULL" into NaN
    # and silently drop those rows.
    df = pd.read_csv(
        args.input,
        sep=_decode_separator(args.sep),
        dtype=str,
        keep_default_na=False,
        na_values=[''],
    )

    df = _clean_concepts(df)

    # The documented usage writes into a sub-directory that may not exist yet.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(args.output, index=False)
    print(f"Processed {len(df)} concepts to {args.output}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()