# NOTE(review): the following lines are web-page scrape residue (Hugging Face
# Spaces header, file size, git commit hashes, line-number gutter) — not
# Python code. Commented out so the module parses; safe to delete.
# Spaces: Sleeping / Sleeping
# File size: 5,636 Bytes
# 1f6b8ca 6ad155d (repeated commit-hash gutter)
# 1 2 3 ... 156 (line-number gutter)
import pandas as pd
import numpy as np
import streamlit as st
import faiss
from sentence_transformers import SentenceTransformer
from symspellpy import SymSpell, Verbosity
# ----------------------
# Data Preparation
# ----------------------
def preprocess_data(file_path):
    """Load the medicine dataset and normalize it for search.

    Combines the multi-valued ``use*``, ``substitute*`` and ``sideEffect*``
    columns into single comma-separated text columns, then lowercases and
    strips punctuation from the main text columns.

    Parameters
    ----------
    file_path : str
        Path to the medicine dataset CSV.

    Returns
    -------
    pandas.DataFrame
        Columns: id, name, uses, substitutes, side_effects,
        Habit Forming, Therapeutic Class, Action Class.
    """
    df = pd.read_csv(file_path)

    def combine_columns(row, prefix):
        # Join all non-null values from the columns sharing this prefix
        # (e.g. use0, use1, ...) into one comma-separated string.
        values = [str(row[col]) for col in df.columns
                  if col.startswith(prefix) and pd.notna(row[col])]
        return ', '.join(values)

    df['uses'] = df.apply(lambda x: combine_columns(x, 'use'), axis=1)
    df['substitutes'] = df.apply(lambda x: combine_columns(x, 'substitute'), axis=1)
    df['side_effects'] = df.apply(lambda x: combine_columns(x, 'sideEffect'), axis=1)

    # Clean text. fillna('') first: NaN would otherwise survive the .str
    # ops and break downstream embedding/display code. Raw string avoids
    # the invalid-escape-sequence warning for '\w'.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        df[col] = (df[col].fillna('').astype(str)
                   .str.lower()
                   .str.replace(r'[^\w\s]', '', regex=True))

    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']]
# ----------------------
# Embedding & FAISS Setup
# ----------------------
def setup_faiss(df):
    """Build a sentence-embedding model and a FAISS L2 index over 'uses'.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed dataset with a 'uses' text column.

    Returns
    -------
    tuple
        (SentenceTransformer model, faiss.IndexFlatL2 index) where row i
        of the index corresponds to df row i.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df['uses'].tolist(), show_progress_bar=True)
    # FAISS requires a C-contiguous float32 matrix; encode() usually
    # returns float32 already, but coerce defensively.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return model, index
# ----------------------
# Spelling Correction
# ----------------------
def setup_spell_checker():
    """Create a SymSpell checker loaded with the English frequency dictionary.

    Returns
    -------
    SymSpell
        Checker configured for max edit distance 2.

    Raises
    ------
    FileNotFoundError
        If the dictionary file cannot be loaded.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    loaded = sym_spell.load_dictionary('frequency_dictionary_en_82_765.txt',
                                       term_index=0, count_index=1)
    if not loaded:
        # load_dictionary returns False (no exception) when the file is
        # missing; fail loudly instead of silently correcting nothing.
        raise FileNotFoundError(
            "Could not load 'frequency_dictionary_en_82_765.txt'")
    return sym_spell
# ----------------------
# Severity Analysis
# ----------------------
# Rough severity weights for known side effects; unlisted effects score 0.
SEVERITY_RANK = {
    'vomiting': 3, 'nausea': 3, 'diarrhea': 3,
    'dizziness': 2, 'headache': 2, 'palpitations': 2,
    'rash': 1, 'itching': 1, 'fatigue': 1
}


def severity_score(side_effects):
    """Sum the severity ranks of a comma-separated side-effect string.

    Whitespace around each effect is ignored, matching is
    case-insensitive, and effects absent from SEVERITY_RANK add 0.
    """
    total = 0
    for raw_effect in side_effects.split(','):
        cleaned = raw_effect.strip()
        if cleaned:
            total += SEVERITY_RANK.get(cleaned.lower(), 0)
    return total
# ----------------------
# Drug Comparison
# ----------------------
def compare_drugs(df, drug1, drug2):
    """Build a side-by-side comparison table for two drugs.

    Name matching is case-insensitive. Returns an empty DataFrame when
    either drug is not present in `df`.
    """
    lowered_names = df['name'].str.lower()
    try:
        row_a = df[lowered_names == drug1.lower()].iloc[0]
        row_b = df[lowered_names == drug2.lower()].iloc[0]
    except IndexError:
        # One of the filters matched no rows.
        return pd.DataFrame()

    labels = ['Uses', 'Substitutes', 'Side Effects', 'Therapeutic Class']
    fields = ['uses', 'substitutes', 'side_effects', 'Therapeutic Class']
    return pd.DataFrame({
        'Attribute': labels,
        drug1: [row_a[field] for field in fields],
        drug2: [row_b[field] for field in fields],
    })
# ----------------------
# Streamlit App
# ----------------------
def main():
    """Streamlit entry point: medicine search and comparison UI.

    Loads the dataset, builds the embedding/FAISS index and spell
    checker, then renders a free-text symptom search (with spelling
    correction and an optional therapeutic-class filter), per-result
    detail cards, and a two-drug comparison table.
    """
    st.title("𧬠MedSearch NLP: Medicine Recommender System")
    # Load data and models.
    # NOTE(review): Streamlit re-runs this script on every interaction, so
    # the dataset, embeddings and index are rebuilt each time — consider
    # caching (st.cache_data / st.cache_resource). Confirm intent.
    df = preprocess_data('medicine_dataset.csv')
    model, faiss_index = setup_faiss(df)
    sym_spell = setup_spell_checker()
    # User input section
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )
    # Process query and show results
    if query:
        # Spelling correction: take the closest suggested term.
        # NOTE(review): lookup() treats the whole query as a single term;
        # multi-word queries would need lookup_compound — confirm intent.
        suggestions = sym_spell.lookup(query, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions:
            query = suggestions[0].term
            st.info(f"Did you mean: '{query}'?")
        # Semantic search: embed the (corrected) query and take the 5
        # nearest 'uses' vectors; I holds row positions into df.
        query_embedding = model.encode([query])
        D, I = faiss_index.search(query_embedding, k=5)
        # Process results (filter applied after retrieval, so fewer than
        # 5 rows may remain).
        results = df.iloc[I[0]].copy()
        if therapeutic_class != 'All':
            results = results[results['Therapeutic Class'] == therapeutic_class]
        # Add severity analysis; mildest side-effect profile shown first.
        results['severity'] = results['side_effects'].apply(severity_score)
        results = results.sort_values('severity', ascending=True)
        # Display results, one expandable card per medicine.
        st.subheader("Recommended Medicines")
        for _, row in results.iterrows():
            with st.expander(f"π {row['name']} (Severity: {row['severity']})"):
                cols = st.columns(3)
                cols[0].write(f"**Uses:** {row['uses']}")
                cols[1].write(f"**Substitutes:** {row['substitutes']}")
                cols[2].write(f"**Side Effects:** {row['side_effects']}")
                cols2 = st.columns(2)
                cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
                cols2[1].write(f"Habit Forming: {row['Habit Forming']}")
    # Drug comparison section (always rendered, independent of the query).
    st.subheader("π Drug Comparison Tool")
    col1, col2 = st.columns(2)
    drug_list = df['name'].unique().tolist()
    with col1:
        drug1 = st.selectbox("Select first drug:", drug_list, index=0)
    with col2:
        drug2 = st.selectbox("Select second drug:", drug_list, index=1 if len(drug_list) > 1 else 0)
    comparison_df = compare_drugs(df, drug1, drug2)
    if not comparison_df.empty:
        # pre-wrap keeps the comma-separated lists readable in the table.
        st.table(comparison_df.style.set_properties(**{
            'white-space': 'pre-wrap',
            'text-align': 'left'
        }))
    else:
        st.warning("One or both selected drugs not found in database")
if __name__ == "__main__":
    main()