"""
Preprocess raw UMLS data into a simplified CSV for FAISS indexing.

Accepts a TSV/CSV with columns CUI, STR (string), DEF, SAB (source).
Outputs a clean CSV with headers: cui,name,definition,source.

Usage:
    python preprocess_umls_data.py \
        --input raw_umls.tsv \
        --output processed/concepts.csv \
        --sep '\t'
"""
|
import argparse
from pathlib import Path

import pandas as pd

|
def _normalize_separator(sep):
    """Return *sep*, translating a literal backslash-t into a real tab.

    A shell passes ``--sep '\\t'`` through as the two characters ``\\`` and
    ``t``. pandas would treat that as a regular expression and fall back to
    its slower Python parsing engine, so it is mapped to a single tab here.
    Any other separator is returned unchanged.
    """
    return '\t' if sep == '\\t' else sep


def _clean_concepts(df):
    """Reduce a raw UMLS frame to unique ``cui,name,definition,source`` rows.

    Args:
        df: DataFrame read from the raw file, with all values as strings.

    Returns:
        A new DataFrame holding exactly the columns ``cui``, ``name``,
        ``definition`` and ``source``. Rows lacking a CUI or a name are
        dropped, missing definitions/sources become empty strings, and only
        the first row of each (cui, name) pair is kept.

    Raises:
        ValueError: If the input has no CUI or no STR column.
    """
    concepts = df.rename(columns={
        'CUI': 'cui',
        'STR': 'name',
        'DEF': 'definition',
        'SAB': 'source',
    })

    # CUI and STR identify a concept; without them nothing can be produced.
    required = {'cui': 'CUI', 'name': 'STR'}
    missing = [raw for col, raw in required.items()
               if col not in concepts.columns]
    if missing:
        raise ValueError(
            f"input is missing required column(s): {', '.join(missing)}"
        )

    # DEF and SAB are optional in practice; treat an absent column as empty
    # instead of failing with a KeyError further down.
    for optional in ('definition', 'source'):
        if optional not in concepts.columns:
            concepts[optional] = ''

    # Keep only the documented output columns, in the documented order.
    concepts = concepts[['cui', 'name', 'definition', 'source']]
    concepts = concepts.dropna(subset=['cui', 'name'])
    concepts = concepts.fillna({'definition': '', 'source': ''})
    return concepts.drop_duplicates(subset=['cui', 'name'])


def main():
    """Command-line entry point: read raw UMLS data and write a concept CSV.

    Parses ``--input``, ``--output`` and ``--sep`` from ``sys.argv``, reads
    the input file, cleans it, writes the result to the output path
    (creating its parent directory if needed) and prints a one-line summary.
    Exits with an argparse usage error if the input lacks a CUI or STR
    column.
    """
    parser = argparse.ArgumentParser(description="Preprocess UMLS raw data.")
    parser.add_argument('--input', required=True,
                        help='Raw UMLS file (TSV/CSV)')
    parser.add_argument('--output', required=True,
                        help='Output CSV for concepts')
    parser.add_argument('--sep', default='\t',
                        help='Separator for input file')
    args = parser.parse_args()

    # Only empty fields count as missing. With pandas' default NA list,
    # genuine concept names such as "NA", "None" or "null" would be parsed
    # as NaN and then silently dropped.
    raw = pd.read_csv(
        args.input,
        sep=_normalize_separator(args.sep),
        dtype=str,
        keep_default_na=False,
        na_values=[''],
    )

    try:
        concepts = _clean_concepts(raw)
    except ValueError as err:
        parser.error(str(err))

    # The documented usage writes into a sub-directory that may not exist.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    concepts.to_csv(args.output, index=False)
    print(f"Processed {len(concepts)} concepts to {args.output}")

|
# Run the preprocessing only when executed as a script, not when imported.
if __name__ == "__main__":
    main()