# NOTE: the next three lines are page-capture residue, not program code.
# Spaces:
# Sleeping
# Sleeping
import pandas as pd | |
import numpy as np | |
import streamlit as st | |
import faiss | |
from sentence_transformers import SentenceTransformer | |
from symspellpy import SymSpell, Verbosity | |
# ---------------------- | |
# Data Preparation | |
# ---------------------- | |
def preprocess_data(file_path):
    """Load the medicine CSV and derive the columns the app works with.

    Every group of multi-value columns (column names starting with ``use``,
    ``substitute`` and ``sideEffect``) is collapsed into one comma-separated
    string per row, skipping missing cells.  The text columns are then
    lower-cased and stripped of every character that is neither a word
    character nor whitespace.

    Args:
        file_path: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A DataFrame with the columns ``id``, ``name``, ``uses``,
        ``substitutes``, ``side_effects``, ``Habit Forming``,
        ``Therapeutic Class`` and ``Action Class``.

    Raises:
        KeyError: If one of the columns accessed by name is missing.
    """
    df = pd.read_csv(file_path)

    def combine_columns(prefix):
        # Resolve the matching column names once per prefix rather than
        # re-scanning every column name for every row.
        matching = [col for col in df.columns if col.startswith(prefix)]
        return df.apply(
            lambda row: ', '.join(
                str(row[col]) for col in matching if pd.notna(row[col])
            ),
            axis=1,
        )

    df['uses'] = combine_columns('use')
    df['substitutes'] = combine_columns('substitute')
    df['side_effects'] = combine_columns('sideEffect')

    # Normalise text.  Note that this also removes the ", " separators that
    # were just inserted into 'uses'.  'Chemical Class' is cleaned here even
    # though it is not part of the returned frame.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        # Raw string: '\w' and '\s' are regex classes, not string escapes.
        df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']]
# ---------------------- | |
# Embedding & FAISS Setup | |
# ---------------------- | |
def setup_faiss(df):
    """Embed the ``uses`` text of every row and index the vectors with FAISS.

    Args:
        df: DataFrame with a ``uses`` column of strings.

    Returns:
        A ``(model, index)`` tuple: the ``all-MiniLM-L6-v2`` sentence
        transformer and a flat L2 FAISS index.  Vectors are added in row
        order, so position ``i`` in the index corresponds to ``df.iloc[i]``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    use_texts = df['uses'].tolist()
    vectors = encoder.encode(use_texts, show_progress_bar=True)

    # The index width is taken from the encoder output rather than hard-coded.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)

    return encoder, flat_index
# ---------------------- | |
# Spelling Correction | |
# ---------------------- | |
def setup_spell_checker():
    """Create a SymSpell instance loaded with the English frequency dictionary.

    The dictionary is read from ``frequency_dictionary_en_82_765.txt``,
    resolved relative to the current working directory.

    Returns:
        The configured ``SymSpell`` object.  If the dictionary file could not
        be loaded, a ``RuntimeWarning`` is emitted and the returned checker
        has an empty dictionary, so it will offer no corrections.
    """
    import warnings

    dictionary_path = 'frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    # load_dictionary reports failure through its return value instead of
    # raising; surface it so a missing file does not silently disable
    # spelling correction.
    loaded = sym_spell.load_dictionary(dictionary_path,
                                       term_index=0, count_index=1)
    if not loaded:
        warnings.warn(
            f"Spelling dictionary '{dictionary_path}' could not be loaded; "
            "spelling correction is disabled.",
            RuntimeWarning,
            stacklevel=2,
        )
    return sym_spell
# ---------------------- | |
# Severity Analysis | |
# ---------------------- | |
# Rank assigned to each recognised side effect (keys are lower-case).
# severity_score() sums these; side effects not listed here count as 0.
# NOTE(review): presumably a higher rank means a more severe effect — confirm.
SEVERITY_RANK = {
    'vomiting': 3, 'nausea': 3, 'diarrhea': 3,
    'dizziness': 2, 'headache': 2, 'palpitations': 2,
    'rash': 1, 'itching': 1, 'fatigue': 1,
}


def severity_score(side_effects):
    """Return the summed SEVERITY_RANK of a comma-separated side-effect list.

    Matching is case-insensitive and ignores surrounding whitespace.  Empty
    items and side effects missing from SEVERITY_RANK contribute 0.

    Args:
        side_effects: Comma-separated side-effect names.  Anything that is
            not a string (for example ``None`` or a NaN cell) scores 0
            instead of raising.

    Returns:
        The integer total of the ranks of all recognised side effects.
    """
    if not isinstance(side_effects, str):
        return 0
    effects = (part.strip().lower() for part in side_effects.split(','))
    return sum(SEVERITY_RANK.get(effect, 0) for effect in effects)
# ---------------------- | |
# Drug Comparison | |
# ---------------------- | |
def compare_drugs(df, drug1, drug2):
    """Build a side-by-side attribute table for two drugs.

    Each drug is looked up by case-insensitive match on the ``name`` column;
    the first matching row is used.

    Args:
        df: DataFrame with ``name``, ``uses``, ``substitutes``,
            ``side_effects`` and ``Therapeutic Class`` columns.
        drug1: Name of the first drug; also used as its column label.
        drug2: Name of the second drug; also used as its column label.

    Returns:
        A DataFrame with an ``Attribute`` column plus one column per drug,
        or an empty DataFrame when either drug has no matching row.
    """
    attribute_labels = ['Uses', 'Substitutes', 'Side Effects', 'Therapeutic Class']
    source_fields = ['uses', 'substitutes', 'side_effects', 'Therapeutic Class']

    lowered_names = df['name'].str.lower()

    # Resolve both drugs before reading any attribute, bailing out as soon
    # as one of them is unknown.
    selected = (drug1, drug2)
    records = []
    for drug in selected:
        hits = df[lowered_names == drug.lower()]
        if hits.empty:
            return pd.DataFrame()
        records.append(hits.iloc[0])

    table = {'Attribute': attribute_labels}
    for drug, record in zip(selected, records):
        table[drug] = [record[field] for field in source_fields]
    return pd.DataFrame(table)
# ---------------------- | |
# Streamlit App | |
# ---------------------- | |
def _correct_query(sym_spell, query):
    """Return the spell-corrected form of a free-text query.

    ``lookup_compound`` is used because the query usually contains several
    words; ``lookup`` treats its whole argument as a single dictionary term.
    Returns *query* unchanged when SymSpell offers no suggestion.
    """
    suggestions = sym_spell.lookup_compound(query, max_edit_distance=2)
    return suggestions[0].term if suggestions else query


def _search_medicines(df, model, faiss_index, query, k=5):
    """Return a copy of up to *k* rows of *df* nearest to *query*."""
    query_embedding = model.encode([query])
    _, neighbor_ids = faiss_index.search(query_embedding, k)
    # FAISS pads the id list with -1 when the index holds fewer than k
    # vectors; passing -1 to iloc would silently select the last row.
    row_positions = [int(i) for i in neighbor_ids[0] if i >= 0]
    return df.iloc[row_positions].copy()


def main():
    """Render the Streamlit page: semantic medicine search and drug comparison.

    The search section spell-corrects the query, retrieves the nearest
    medicines from the FAISS index, optionally filters them by therapeutic
    class and lists them ordered by ascending severity score.  The comparison
    section shows two selected drugs side by side.
    """
    # The title's leading character was mojibake of the DNA emoji; restored.
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # NOTE(review): Streamlit re-runs this function on every interaction, so
    # the CSV, the embedding model and the index are rebuilt each time.
    # Consider wrapping these loaders with st.cache_data / st.cache_resource.
    df = preprocess_data('medicine_dataset.csv')
    model, faiss_index = setup_faiss(df)
    sym_spell = setup_spell_checker()

    # User input section
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist()),
    )

    if query:
        # Only announce a correction when it actually changes the query.
        corrected = _correct_query(sym_spell, query)
        if corrected.lower() != query.strip().lower():
            query = corrected
            st.info(f"Did you mean: '{query}'?")

        results = _search_medicines(df, model, faiss_index, query)
        # NOTE(review): the class filter is applied after the top-5 search,
        # so it can leave fewer than five (or zero) results.
        if therapeutic_class != 'All':
            results = results[results['Therapeutic Class'] == therapeutic_class]

        # assign() returns a new frame, which avoids writing a column into a
        # filtered slice (SettingWithCopyWarning).
        results = results.assign(
            severity=results['side_effects'].apply(severity_score)
        )
        # A stable sort keeps the FAISS relevance order among equal scores.
        results = results.sort_values('severity', ascending=True, kind='mergesort')

        st.subheader("Recommended Medicines")
        if results.empty:
            st.info("No matching medicines found.")
        # NOTE(review): the leading "π" in the labels below looks like a
        # mis-decoded emoji; the original character cannot be recovered here.
        for _, row in results.iterrows():
            with st.expander(f"π {row['name']} (Severity: {row['severity']})"):
                cols = st.columns(3)
                cols[0].write(f"**Uses:** {row['uses']}")
                cols[1].write(f"**Substitutes:** {row['substitutes']}")
                cols[2].write(f"**Side Effects:** {row['side_effects']}")
                cols2 = st.columns(2)
                cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
                cols2[1].write(f"Habit Forming: {row['Habit Forming']}")

    # Drug comparison section
    st.subheader("π Drug Comparison Tool")
    col1, col2 = st.columns(2)
    # Drop missing names: compare_drugs calls .lower() on the selection.
    drug_list = df['name'].dropna().unique().tolist()
    with col1:
        drug1 = st.selectbox("Select first drug:", drug_list, index=0)
    with col2:
        drug2 = st.selectbox("Select second drug:", drug_list,
                             index=1 if len(drug_list) > 1 else 0)

    # With an empty drug list the select boxes yield None.
    if drug1 is None or drug2 is None:
        comparison_df = pd.DataFrame()
    else:
        comparison_df = compare_drugs(df, drug1, drug2)

    if not comparison_df.empty:
        st.table(comparison_df.style.set_properties(**{
            'white-space': 'pre-wrap',
            'text-align': 'left',
        }))
    else:
        st.warning("One or both selected drugs not found in database")
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()