# MedSearch NLP — single-file Streamlit medicine recommender.
# (Header reconstructed: original lines were Hugging Face Spaces page residue.)
import pandas as pd
import numpy as np
import streamlit as st
import faiss
from sentence_transformers import SentenceTransformer
from symspellpy import SymSpell, Verbosity
# ----------------------
# Data Preparation
# ----------------------
def preprocess_data(file_path):
# Load dataset
df = pd.read_csv(file_path)
# Combine multi-value columns
def combine_columns(row, prefix):
values = [str(row[col]) for col in df.columns if col.startswith(prefix) and pd.notna(row[col])]
return ', '.join(values)
df['uses'] = df.apply(lambda x: combine_columns(x, 'use'), axis=1)
df['substitutes'] = df.apply(lambda x: combine_columns(x, 'substitute'), axis=1)
df['side_effects'] = df.apply(lambda x: combine_columns(x, 'sideEffect'), axis=1)
# Clean text
text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
for col in text_columns:
df[col] = df[col].str.lower().str.replace('[^\w\s]', '', regex=True)
return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
'Habit Forming', 'Therapeutic Class', 'Action Class']]
# ----------------------
# Embedding & FAISS Setup
# ----------------------
def setup_faiss(df):
    """Embed every medicine's 'uses' text and build an exact-L2 FAISS index.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``preprocess_data`` — must contain a ``uses`` column.

    Returns
    -------
    (SentenceTransformer, faiss.IndexFlatL2)
        The model (reused later to encode queries) and the populated index,
        whose row order matches ``df``'s positional order.

    NOTE(review): Streamlit re-executes the script on every interaction, so
    this re-downloads/re-encodes each rerun; wrap with @st.cache_resource
    at the call site — confirm against the app's Streamlit version.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df['uses'].tolist(), show_progress_bar=True)
    # FAISS requires a C-contiguous float32 matrix; encode() output type can
    # vary with its arguments/model config, so coerce defensively.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return model, index
# ----------------------
# Spelling Correction
# ----------------------
def setup_spell_checker():
    """Build a SymSpell checker from the bundled English frequency dictionary.

    Returns
    -------
    SymSpell
        Checker with max edit distance 2, ready for lookup calls.

    Raises
    ------
    FileNotFoundError
        If the dictionary file is missing or unreadable. (load_dictionary
        returns False instead of raising, which previously left every
        lookup silently empty.)
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    loaded = sym_spell.load_dictionary('frequency_dictionary_en_82_765.txt',
                                       term_index=0, count_index=1)
    if not loaded:
        raise FileNotFoundError(
            "Could not load 'frequency_dictionary_en_82_765.txt'")
    return sym_spell
# ----------------------
# Streamlit App
# ----------------------
def main():
    """Streamlit entry point: collect a query, spell-correct it, run a
    semantic search over medicine uses, and render the top matches."""
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # Load data and models.
    # NOTE(review): these rebuild on every Streamlit rerun; consider
    # st.cache_data / st.cache_resource wrappers for interactive latency.
    df = preprocess_data('medicine_dataset.csv')
    model, faiss_index = setup_faiss(df)
    sym_spell = setup_spell_checker()

    # User input.
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )

    if query:
        # Spelling correction. lookup_compound handles multi-word phrases;
        # plain lookup() treated the whole query as one dictionary term.
        suggestions = sym_spell.lookup_compound(query, max_edit_distance=2)
        # Only announce a correction when something actually changed —
        # an exact-match query previously triggered the banner too.
        if suggestions and suggestions[0].term != query.lower():
            query = suggestions[0].term
            st.info(f"Did you mean: '{query}'?")

        # Semantic search (float32 to match the FAISS index dtype).
        query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
        D, I = faiss_index.search(query_embedding, k=5)

        # Filter the 5 nearest neighbours by the selected class.
        results = df.iloc[I[0]].copy()
        if therapeutic_class != 'All':
            results = results[results['Therapeutic Class'] == therapeutic_class]

        st.subheader("Recommended Medicines")
        if results.empty:
            # The class filter can eliminate all retrieved neighbours.
            st.warning("No matches for that query and class combination.")
        for _, row in results.iterrows():
            with st.expander(f"💊 {row['name']}"):
                cols = st.columns(3)
                cols[0].write(f"**Uses:** {row['uses']}")
                cols[1].write(f"**Substitutes:** {row['substitutes']}")
                cols[2].write(f"**Side Effects:** {row['side_effects']}")
                cols2 = st.columns(2)
                cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
                cols2[1].write(f"Habit Forming: {row['Habit Forming']}")


if __name__ == "__main__":
    main()