""" Preprocess raw UMLS data into a simplified CSV for FAISS indexing. Accepts a TSV/CSV with columns CUI, STR (string), DEF, SAB (source). Outputs a clean CSV with headers: cui,name,definition,source. Usage: python preprocess_umls_data.py \ --input raw_umls.tsv \ --output processed/concepts.csv \ --sep '\t' """ import argparse import pandas as pd def main(): parser = argparse.ArgumentParser(description="Preprocess UMLS raw data.") parser.add_argument('--input', required=True, help='Raw UMLS file (TSV/CSV)') parser.add_argument('--output', required=True, help='Output CSV for concepts') parser.add_argument('--sep', default='\t', help='Separator for input file') args = parser.parse_args() # Read raw file df = pd.read_csv(args.input, sep=args.sep, dtype=str) # Expect columns: CUI, STR, DEF, SAB df = df.rename(columns={ 'CUI': 'cui', 'STR': 'name', 'DEF': 'definition', 'SAB': 'source' }) # Drop rows missing CUI or name df = df.dropna(subset=['cui', 'name']) # Fill missing definitions/sources df['definition'] = df['definition'].fillna('') df['source'] = df['source'].fillna('') # Deduplicate by CUI + name df = df.drop_duplicates(subset=['cui', 'name']) # Output df.to_csv(args.output, index=False) print(f"Processed {len(df)} concepts to {args.output}") if __name__ == '__main__': main()