# MedSearch / app.py
# Author: shukdevdatta123
# Commit: 6ad155d (verified) - "Update app.py"
import pandas as pd
import numpy as np
import streamlit as st
import faiss
from sentence_transformers import SentenceTransformer
from symspellpy import SymSpell, Verbosity
# ----------------------
# Data Preparation
# ----------------------
def preprocess_data(file_path):
    """Load the medicine CSV and return a cleaned, condensed DataFrame.

    Every column whose name starts with ``use``, ``substitute`` or
    ``sideEffect`` is merged, row by row, into a single comma-separated
    string column (``uses``, ``substitutes``, ``side_effects``); missing
    cells are skipped. The ``name``, ``uses``, ``Chemical Class`` and
    ``Therapeutic Class`` columns are then lower-cased and stripped of
    every character that is neither a word character nor whitespace.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the dataset.

    Returns:
        A DataFrame with the columns ``id``, ``name``, ``uses``,
        ``substitutes``, ``side_effects``, ``Habit Forming``,
        ``Therapeutic Class`` and ``Action Class``.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    df = pd.read_csv(file_path)

    def combine_columns(prefix):
        # Resolve the matching columns once per prefix instead of once per row.
        matching = [col for col in df.columns if col.startswith(prefix)]
        # An object array keeps each cell's own type, so str() renders the
        # value exactly as it was parsed; it also yields one (possibly empty)
        # row per record even when no column matches the prefix.
        rows = df[matching].to_numpy(dtype=object)
        return [
            ', '.join(str(value) for value in row if pd.notna(value))
            for row in rows
        ]

    df['uses'] = combine_columns('use')
    df['substitutes'] = combine_columns('substitute')
    df['side_effects'] = combine_columns('sideEffect')

    # Normalise free text: lower-case and drop punctuation. Note this also
    # removes the commas that were just inserted into 'uses'.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        # Raw string: '\w' and '\s' are regex classes, not string escapes.
        df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']]
# ----------------------
# Embedding & FAISS Setup
# ----------------------
def setup_faiss(df):
    """Embed the ``uses`` text of every row and index the vectors with FAISS.

    Args:
        df: DataFrame with a ``uses`` column of strings.

    Returns:
        A ``(model, index)`` tuple: the ``all-MiniLM-L6-v2``
        SentenceTransformer used for encoding and a ``faiss.IndexFlatL2``
        holding one vector per row, added in row order.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    use_texts = df['uses'].tolist()
    vectors = encoder.encode(use_texts, show_progress_bar=True)

    # The index width is taken from the encoder output itself.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return encoder, flat_index
# ----------------------
# Spelling Correction
# ----------------------
def setup_spell_checker():
    """Create a SymSpell instance loaded with the English frequency dictionary.

    The dictionary is looked up first relative to the working directory
    (the original behaviour) and then inside the installed ``symspellpy``
    package, which ships the same file.

    Returns:
        A ``SymSpell`` object (max dictionary edit distance 2, prefix
        length 7). If no dictionary could be loaded the object is still
        returned, empty, and a warning is logged; lookups on it simply
        yield no suggestions.
    """
    import logging
    import os

    import symspellpy

    dictionary_name = 'frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    candidate_paths = (
        dictionary_name,
        os.path.join(os.path.dirname(symspellpy.__file__), dictionary_name),
    )
    for path in candidate_paths:
        # load_dictionary() signals a missing file by returning False rather
        # than raising, so the result must be checked explicitly.
        if sym_spell.load_dictionary(path, term_index=0, count_index=1):
            return sym_spell

    logging.getLogger(__name__).warning(
        "Spell-check dictionary %r not found; spelling correction is disabled.",
        dictionary_name,
    )
    return sym_spell
# ----------------------
# Severity Analysis
# ----------------------
# Weight contributed by each recognised side effect to a medicine's severity
# score. Effects that are not listed contribute 0.
SEVERITY_RANK = {
    'vomiting': 3, 'nausea': 3, 'diarrhea': 3,
    'dizziness': 2, 'headache': 2, 'palpitations': 2,
    'rash': 1, 'itching': 1, 'fatigue': 1,
}


def severity_score(side_effects):
    """Return the summed ``SEVERITY_RANK`` weight of a side-effect list.

    Args:
        side_effects: Comma-separated side-effect names. Matching ignores
            case and surrounding whitespace; unknown or empty entries add 0.
            Any non-string value (``None``, ``NaN``) is treated as "no side
            effects" instead of raising.

    Returns:
        int: Total weight, 0 when nothing is recognised.
    """
    if not isinstance(side_effects, str):
        return 0
    return sum(
        SEVERITY_RANK.get(effect.strip().lower(), 0)
        for effect in side_effects.split(',')
    )
# ----------------------
# Drug Comparison
# ----------------------
def compare_drugs(df, drug1, drug2):
    """Build a side-by-side comparison table for two drugs.

    Names are matched case-insensitively against ``df['name']``; when a name
    occurs more than once the first matching row is used.

    Args:
        df: DataFrame with ``name``, ``uses``, ``substitutes``,
            ``side_effects`` and ``Therapeutic Class`` columns.
        drug1: Name of the first drug.
        drug2: Name of the second drug.

    Returns:
        A DataFrame with an ``Attribute`` column plus one column per drug,
        or an empty DataFrame when either name is not a string or is not
        found in ``df``.
    """
    # A selectbox over an empty list yields None; treat that as "not found".
    if not isinstance(drug1, str) or not isinstance(drug2, str):
        return pd.DataFrame()

    lowered_names = df['name'].str.lower()
    first_matches = df[lowered_names == drug1.lower()]
    second_matches = df[lowered_names == drug2.lower()]
    if first_matches.empty or second_matches.empty:
        return pd.DataFrame()

    d1 = first_matches.iloc[0]
    d2 = second_matches.iloc[0]
    fields = ['uses', 'substitutes', 'side_effects', 'Therapeutic Class']

    # Identical labels would collide as dict keys and silently drop a column.
    label1 = drug1
    label2 = drug2 if drug2 != drug1 else f"{drug2} (2)"

    return pd.DataFrame({
        'Attribute': ['Uses', 'Substitutes', 'Side Effects', 'Therapeutic Class'],
        label1: [d1[field] for field in fields],
        label2: [d2[field] for field in fields],
    })
# ----------------------
# Streamlit App
# ----------------------
# Number of recommendations shown per query.
_TOP_K = 5
# Neighbours fetched when a therapeutic-class filter is active, so that the
# filter has more than _TOP_K candidates to choose from.
_FILTER_CANDIDATES = 50


@st.cache_resource
def _load_resources():
    """Load the dataset, encoder, FAISS index and spell checker once.

    Streamlit re-executes the script on every widget interaction; caching
    avoids re-reading the CSV and re-encoding the whole dataset each time.
    The returned objects are shared between reruns and must not be mutated.
    """
    df = preprocess_data('medicine_dataset.csv')
    model, faiss_index = setup_faiss(df)
    sym_spell = setup_spell_checker()
    return df, model, faiss_index, sym_spell


def _correct_query(sym_spell, query):
    """Spell-correct a query word by word, keeping words with no suggestion."""
    corrected_tokens = []
    for token in query.lower().split():
        # lookup() matches a single term, so each word is corrected on its
        # own rather than passing the whole phrase.
        suggestions = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_tokens.append(suggestions[0].term if suggestions else token)
    return ' '.join(corrected_tokens)


def main():
    """Render the Streamlit UI: semantic medicine search and drug comparison.

    The search section spell-corrects the query, retrieves the nearest
    medicines from the FAISS index, optionally restricts them to one
    therapeutic class and lists them ordered by ascending severity score.
    The comparison section shows two selected drugs side by side.
    """
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # Load data and models (cached across reruns).
    df, model, faiss_index, sym_spell = _load_resources()

    # User input section
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )

    # Process query and show results
    if query:
        # Spelling correction: only prompt when something actually changed.
        normalized = ' '.join(query.lower().split())
        corrected = _correct_query(sym_spell, query)
        if corrected and corrected != normalized:
            query = corrected
            st.info(f"Did you mean: '{query}'?")

        # Semantic search
        filtering = therapeutic_class != 'All'
        k = min(len(df), _FILTER_CANDIDATES if filtering else _TOP_K)
        query_embedding = model.encode([query])
        _, neighbor_ids = faiss_index.search(query_embedding, k=k)

        # FAISS pads missing neighbours with -1, which iloc would otherwise
        # interpret as "last row".
        hits = neighbor_ids[0]
        hits = hits[hits >= 0]

        # Process results (copy: the cached frame must stay untouched).
        results = df.iloc[hits].copy()
        if filtering:
            results = results[results['Therapeutic Class'] == therapeutic_class]
        results = results.head(_TOP_K)

        # Add severity analysis
        results['severity'] = results['side_effects'].apply(severity_score)
        results = results.sort_values('severity', ascending=True)

        # Display results
        st.subheader("Recommended Medicines")
        if results.empty:
            st.info("No matching medicines found for this query and filter.")
        for _, row in results.iterrows():
            with st.expander(f"💊 {row['name']} (Severity: {row['severity']})"):
                cols = st.columns(3)
                cols[0].write(f"**Uses:** {row['uses']}")
                cols[1].write(f"**Substitutes:** {row['substitutes']}")
                cols[2].write(f"**Side Effects:** {row['side_effects']}")
                cols2 = st.columns(2)
                cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
                cols2[1].write(f"Habit Forming: {row['Habit Forming']}")

    # Drug comparison section
    st.subheader("🔍 Drug Comparison Tool")
    # Missing names cannot be compared, so keep them out of the pickers.
    drug_list = df['name'].dropna().unique().tolist()
    if not drug_list:
        st.warning("One or both selected drugs not found in database")
        return

    col1, col2 = st.columns(2)
    with col1:
        drug1 = st.selectbox("Select first drug:", drug_list, index=0)
    with col2:
        drug2 = st.selectbox("Select second drug:", drug_list,
                             index=1 if len(drug_list) > 1 else 0)

    comparison_df = compare_drugs(df, drug1, drug2)
    if not comparison_df.empty:
        st.table(comparison_df.style.set_properties(**{
            'white-space': 'pre-wrap',
            'text-align': 'left'
        }))
    else:
        st.warning("One or both selected drugs not found in database")


if __name__ == "__main__":
    main()