"""Streamlit app that recommends medicines via semantic search over their uses.

The script loads a medicine CSV, embeds each medicine's combined "uses" text
with a sentence-transformer model, indexes the embeddings in FAISS, and serves
a small UI for symptom search and side-by-side drug comparison.
"""
import logging

import faiss
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from symspellpy import SymSpell, Verbosity

logger = logging.getLogger(__name__)

# Number of nearest neighbours requested from the FAISS index per query.
TOP_K = 5


# ----------------------
# Data Preparation
# ----------------------
def preprocess_data(file_path):
    """Load the medicine CSV and build the combined, cleaned text columns.

    Columns whose names start with ``use``, ``substitute`` and ``sideEffect``
    are merged (skipping missing values) into the comma-separated ``uses``,
    ``substitutes`` and ``side_effects`` columns. ``name``, ``uses``,
    ``Chemical Class`` and ``Therapeutic Class`` are lower-cased and stripped
    of every character that is neither a word character nor whitespace.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A DataFrame with the columns ``id``, ``name``, ``uses``,
        ``substitutes``, ``side_effects``, ``Habit Forming``,
        ``Therapeutic Class`` and ``Action Class``.
    """
    df = pd.read_csv(file_path)

    def combine_columns(row, columns):
        # Join the non-missing values of the given columns for one row.
        return ', '.join(
            str(row[col]) for col in columns if pd.notna(row[col])
        )

    combined_targets = (
        ('uses', 'use'),
        ('substitutes', 'substitute'),
        ('side_effects', 'sideEffect'),
    )
    for target, prefix in combined_targets:
        # Resolve the matching columns once per target instead of once per row.
        columns = [col for col in df.columns if col.startswith(prefix)]
        df[target] = df.apply(
            lambda row: combine_columns(row, columns), axis=1
        )

    # Clean text. The pattern is a raw string so that ``\w`` and ``\s`` are
    # regex classes rather than (invalid) Python string escapes.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        df[col] = df[col].str.lower().str.replace(
            r'[^\w\s]', '', regex=True
        )

    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']]


# ----------------------
# Embedding & FAISS Setup
# ----------------------
def setup_faiss(df):
    """Embed the ``uses`` column and index the vectors with FAISS.

    Args:
        df: DataFrame with a ``uses`` text column; row positions become the
            ids stored in the index.

    Returns:
        A ``(model, index)`` tuple: the ``SentenceTransformer`` used for the
        embeddings and a ``faiss.IndexFlatL2`` holding one vector per row.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df['uses'].tolist(), show_progress_bar=True)
    # FAISS requires C-contiguous float32 input.
    embeddings = np.ascontiguousarray(embeddings, dtype='float32')

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return model, index


# ----------------------
# Spelling Correction
# ----------------------
def setup_spell_checker(dictionary_path='frequency_dictionary_en_82_765.txt'):
    """Create a SymSpell instance loaded with a frequency dictionary.

    Args:
        dictionary_path: Path of the frequency dictionary (term in column 0,
            count in column 1).

    Returns:
        The ``SymSpell`` instance. If the dictionary cannot be loaded the
        instance is still returned (lookups will then find no suggestions)
        and a warning is logged.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    loaded = sym_spell.load_dictionary(
        dictionary_path, term_index=0, count_index=1
    )
    if not loaded:
        # load_dictionary reports a missing file by returning False.
        logger.warning(
            "Spelling dictionary %r could not be loaded; "
            "spelling correction is disabled.",
            dictionary_path,
        )
    return sym_spell


def _correct_query(sym_spell, query):
    """Return *query* with each word replaced by its closest dictionary term.

    ``SymSpell.lookup`` works on single words, so the query is corrected word
    by word; words without a suggestion are kept unchanged.
    """
    corrected_words = []
    for word in query.split():
        suggestions = sym_spell.lookup(
            word, Verbosity.CLOSEST, max_edit_distance=2
        )
        corrected_words.append(suggestions[0].term if suggestions else word)
    return ' '.join(corrected_words)


# ----------------------
# Severity Analysis
# ----------------------
SEVERITY_RANK = {
    'vomiting': 3, 'nausea': 3, 'diarrhea': 3,
    'dizziness': 2, 'headache': 2, 'palpitations': 2,
    'rash': 1, 'itching': 1, 'fatigue': 1
}


def severity_score(side_effects):
    """Sum the ``SEVERITY_RANK`` weights of a comma-separated effect list.

    Args:
        side_effects: Comma-separated side-effect names. Matching is
            case-insensitive and ignores surrounding whitespace; effects not
            present in ``SEVERITY_RANK`` count as 0.

    Returns:
        The total severity as an int; 0 for empty or non-string input.
    """
    if not isinstance(side_effects, str):
        return 0
    return sum(
        SEVERITY_RANK.get(effect.strip().lower(), 0)
        for effect in side_effects.split(',')
        if effect.strip()
    )


# ----------------------
# Drug Comparison
# ----------------------
def compare_drugs(df, drug1, drug2):
    """Build a side-by-side comparison table for two drugs.

    Args:
        df: DataFrame with ``name``, ``uses``, ``substitutes``,
            ``side_effects`` and ``Therapeutic Class`` columns.
        drug1: Name of the first drug (matched case-insensitively).
        drug2: Name of the second drug (matched case-insensitively).

    Returns:
        A DataFrame with an ``Attribute`` column plus one column per drug, or
        an empty DataFrame when either drug is not found. When both names are
        identical the two drug columns collapse into one.
    """
    names = df['name'].str.lower()
    first_matches = df[names == drug1.lower()]
    second_matches = df[names == drug2.lower()]
    if first_matches.empty or second_matches.empty:
        return pd.DataFrame()

    # Use the first matching row when a name occurs more than once.
    d1 = first_matches.iloc[0]
    d2 = second_matches.iloc[0]

    return pd.DataFrame({
        'Attribute': ['Uses', 'Substitutes', 'Side Effects',
                      'Therapeutic Class'],
        drug1: [d1['uses'], d1['substitutes'], d1['side_effects'],
                d1['Therapeutic Class']],
        drug2: [d2['uses'], d2['substitutes'], d2['side_effects'],
                d2['Therapeutic Class']]
    })


# ----------------------
# Streamlit App
# ----------------------
@st.cache_resource
def _load_resources(file_path):
    """Load the dataset, embedding model, FAISS index and spell checker once.

    Streamlit re-executes the script on every interaction; caching keeps the
    CSV parsing and the embedding of the whole dataset from being repeated on
    each rerun. The returned objects are shared between reruns and must not
    be mutated by callers.
    """
    df = preprocess_data(file_path)
    model, faiss_index = setup_faiss(df)
    sym_spell = setup_spell_checker()
    return df, model, faiss_index, sym_spell


def _search(model, faiss_index, df, query, k=TOP_K):
    """Return a copy of the rows of *df* closest to *query* in the index."""
    # Never ask for more neighbours than the index holds.
    k = min(k, faiss_index.ntotal)
    if k <= 0:
        return df.iloc[0:0].copy()

    query_embedding = np.ascontiguousarray(
        model.encode([query]), dtype='float32'
    )
    _distances, neighbours = faiss_index.search(query_embedding, k)
    # FAISS pads missing neighbours with -1, which iloc would otherwise
    # interpret as "last row".
    positions = [int(pos) for pos in neighbours[0] if pos >= 0]
    return df.iloc[positions].copy()


def main():
    """Render the search and drug-comparison UI."""
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # Load data and models
    df, model, faiss_index, sym_spell = _load_resources(
        'medicine_dataset.csv'
    )

    # User input section
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )

    # Process query and show results
    query = query.strip() if query else query
    if query:
        # Spelling correction: only announce it when the text changed.
        corrected = _correct_query(sym_spell, query)
        if corrected and corrected != query:
            query = corrected
            st.info(f"Did you mean: '{query}'?")

        # Semantic search
        results = _search(model, faiss_index, df, query)

        if therapeutic_class != 'All':
            results = results[
                results['Therapeutic Class'] == therapeutic_class
            ]

        # Add severity analysis
        results['severity'] = results['side_effects'].apply(severity_score)
        results = results.sort_values('severity', ascending=True)

        # Display results
        st.subheader("Recommended Medicines")
        if results.empty:
            st.info("No matching medicines found for the selected filters.")
        for _, row in results.iterrows():
            with st.expander(
                f"💊 {row['name']} (Severity: {row['severity']})"
            ):
                cols = st.columns(3)
                cols[0].write(f"**Uses:** {row['uses']}")
                cols[1].write(f"**Substitutes:** {row['substitutes']}")
                cols[2].write(f"**Side Effects:** {row['side_effects']}")

                cols2 = st.columns(2)
                cols2[0].write(
                    f"Therapeutic Class: {row['Therapeutic Class']}"
                )
                cols2[1].write(f"Habit Forming: {row['Habit Forming']}")

    # Drug comparison section
    st.subheader("🔍 Drug Comparison Tool")
    # Missing names cannot be selected or compared, so drop them up front.
    drug_list = df['name'].dropna().unique().tolist()
    if not drug_list:
        st.warning("One or both selected drugs not found in database")
        return

    col1, col2 = st.columns(2)
    with col1:
        drug1 = st.selectbox("Select first drug:", drug_list, index=0)
    with col2:
        drug2 = st.selectbox(
            "Select second drug:", drug_list,
            index=1 if len(drug_list) > 1 else 0
        )

    comparison_df = compare_drugs(df, drug1, drug2)
    if not comparison_df.empty:
        st.table(comparison_df.style.set_properties(**{
            'white-space': 'pre-wrap',
            'text-align': 'left'
        }))
    else:
        st.warning("One or both selected drugs not found in database")


if __name__ == "__main__":
    main()