# MedSearch NLP: Medicine Recommender System (Streamlit app)
import faiss
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from symspellpy import SymSpell, Verbosity
# ---------------------- | |
# Data Preparation | |
# ---------------------- | |
def preprocess_data(file_path):
    """Load the medicine dataset and shape it for semantic search.

    Collapses the wide multi-value column groups (``use0..useN``,
    ``substitute0..N``, ``sideEffect0..N``) into single comma-joined
    string columns, then lower-cases and strips punctuation from the
    free-text columns used for matching.

    Parameters
    ----------
    file_path : str or file-like
        Anything ``pandas.read_csv`` accepts. Must contain 'id', 'name',
        'Chemical Class', 'Therapeutic Class', 'Habit Forming',
        'Action Class' and the prefixed value-group columns.

    Returns
    -------
    pandas.DataFrame
        Columns: id, name, uses, substitutes, side_effects,
        Habit Forming, Therapeutic Class, Action Class.
    """
    df = pd.read_csv(file_path)

    # Resolve each prefix group's column list ONCE, up front. The original
    # rescanned df.columns for every row (O(rows * cols)); resolving before
    # any derived column is added also guarantees the new 'uses' column can
    # never be swept up by its own 'use' prefix.
    prefix_groups = {
        'uses': 'use',
        'substitutes': 'substitute',
        'side_effects': 'sideEffect',
    }
    group_cols = {
        new: [c for c in df.columns if c.startswith(prefix)]
        for new, prefix in prefix_groups.items()
    }

    def _combine(row, cols):
        # Join this row's non-null group values into one display string.
        return ', '.join(str(row[c]) for c in cols if pd.notna(row[c]))

    for new_col, cols in group_cols.items():
        df[new_col] = df.apply(lambda r, cs=cols: _combine(r, cs), axis=1)

    # Normalise free-text columns: lower-case, drop punctuation. Raw string
    # avoids the invalid-escape deprecation for '\w'/'\s' in the pattern.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']]
# ---------------------- | |
# Embedding & FAISS Setup | |
# ---------------------- | |
def setup_faiss(df):
    """Embed each row's 'uses' text and build an exact L2 nearest-neighbour index.

    Returns the sentence-transformer encoder together with a flat FAISS
    index whose vector order matches df's row order (so FAISS result ids
    are positional indices into df).
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(df['uses'].tolist(), show_progress_bar=True)

    # IndexFlatL2 performs brute-force L2 search over the raw vectors.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return encoder, index
# ---------------------- | |
# Spelling Correction | |
# ---------------------- | |
def setup_spell_checker():
    """Build a SymSpell checker backed by the bundled English frequency list.

    Expects 'frequency_dictionary_en_82_765.txt' (term, count per line)
    in the working directory.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = 'frequency_dictionary_en_82_765.txt'
    checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
    return checker
# ---------------------- | |
# Streamlit App | |
# ---------------------- | |
def main():
    """Streamlit entry point: load data/models once, then serve user queries."""
    # Emoji restored from mojibake in the scraped original.
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # Streamlit reruns the whole script on every widget interaction; without
    # caching, the CSV reload, corpus re-embedding and dictionary parse all
    # repeated on every keystroke. cache_resource keeps one shared copy.
    @st.cache_resource(show_spinner="Loading data and models...")
    def _load_resources():
        df = preprocess_data('medicine_dataset.csv')
        model, faiss_index = setup_faiss(df)
        sym_spell = setup_spell_checker()
        return df, model, faiss_index, sym_spell

    df, model, faiss_index, sym_spell = _load_resources()

    # User input
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )

    if not query:
        return

    # lookup_compound corrects multi-word phrases; plain lookup() treats the
    # whole query as a single dictionary term and misses phrase queries.
    # Only announce a correction when it actually differs from the input
    # (suggestion terms come back lower-cased).
    suggestions = sym_spell.lookup_compound(query, max_edit_distance=2)
    if suggestions and suggestions[0].term != query.lower():
        query = suggestions[0].term
        st.info(f"Did you mean: '{query}'?")

    # Semantic search over the precomputed 'uses' embeddings.
    query_embedding = model.encode([query])
    D, I = faiss_index.search(query_embedding, k=5)

    # FAISS ids are positional row indices into df (see setup order).
    results = df.iloc[I[0]].copy()
    if therapeutic_class != 'All':
        results = results[results['Therapeutic Class'] == therapeutic_class]

    st.subheader("Recommended Medicines")
    if results.empty:
        # The 5 nearest neighbours may all fall outside the selected class.
        st.warning("No matches for this query and filter combination.")
        return

    for _, row in results.iterrows():
        with st.expander(f"💊 {row['name']}"):
            cols = st.columns(3)
            cols[0].write(f"**Uses:** {row['uses']}")
            cols[1].write(f"**Substitutes:** {row['substitutes']}")
            cols[2].write(f"**Side Effects:** {row['side_effects']}")
            cols2 = st.columns(2)
            cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
            cols2[1].write(f"Habit Forming: {row['Habit Forming']}")


if __name__ == "__main__":
    main()