# Source: Hugging Face Spaces - shukdevdatta123 / MedSearch (Space status: Sleeping)
# File size: 3,646 Bytes
# Revision: 1f6b8ca

import pandas as pd
import numpy as np
import streamlit as st
import faiss
from sentence_transformers import SentenceTransformer
from symspellpy import SymSpell, Verbosity

# ----------------------
# Data Preparation
# ----------------------
def _combine_prefixed_columns(df, prefix):
    """Join the non-null values of every column whose name starts with *prefix*.

    Returns a Series aligned with ``df.index`` holding one ``', '``-separated
    string per row (an empty string when the row has no values).
    """
    # Resolve the matching columns once instead of rescanning them per row.
    matching = [col for col in df.columns if col.startswith(prefix)]
    rows = df[matching].to_numpy(dtype=object)
    combined = [
        ', '.join(str(value) for value in row if pd.notna(value))
        for row in rows
    ]
    # dtype=object keeps the ``.str`` accessor usable even for an empty frame.
    return pd.Series(combined, index=df.index, dtype=object)


def preprocess_data(file_path):
    """Load the medicine CSV and return the cleaned columns used by the app.

    The numbered ``use*``, ``substitute*`` and ``sideEffect*`` columns are
    collapsed into the single text columns ``uses``, ``substitutes`` and
    ``side_effects``. The ``name``, ``uses``, ``Chemical Class`` and
    ``Therapeutic Class`` columns are lower-cased and stripped of punctuation.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the dataset.

    Returns:
        A DataFrame with the columns ``id``, ``name``, ``uses``,
        ``substitutes``, ``side_effects``, ``Habit Forming``,
        ``Therapeutic Class`` and ``Action Class``.

    Raises:
        KeyError: If one of the required columns is missing from the file.
    """
    df = pd.read_csv(file_path)

    # Compute every combined column before assigning any of them, so a derived
    # column can never be picked up by a later prefix match.
    derived = {
        'uses': _combine_prefixed_columns(df, 'use'),
        'substitutes': _combine_prefixed_columns(df, 'substitute'),
        'side_effects': _combine_prefixed_columns(df, 'sideEffect'),
    }
    for column, values in derived.items():
        df[column] = values

    # Normalise free text: lower-case, then drop everything that is neither a
    # word character nor whitespace. The pattern is a raw string so that
    # ``\w`` / ``\s`` are not treated as (invalid) string escapes.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']]

# ----------------------
# Embedding & FAISS Setup
# ----------------------
def setup_faiss(df):
    """Embed the ``uses`` text of every row and index it for L2 search.

    Args:
        df: DataFrame with a ``uses`` column; row order defines the ids
            stored in the index (row ``i`` gets id ``i``).

    Returns:
        A ``(model, index)`` tuple: the ``SentenceTransformer`` used for
        encoding and a ``faiss.IndexFlatL2`` holding one vector per row.

    Raises:
        ValueError: If ``df`` has no rows, since the embedding dimension
            cannot be derived from an empty batch.
    """
    # Missing text would reach the encoder as a float NaN; encode it as an
    # empty string instead.
    texts = df['uses'].fillna('').astype(str).tolist()
    if not texts:
        raise ValueError("Cannot build a FAISS index from an empty dataset")

    model = SentenceTransformer('all-MiniLM-L6-v2')
    # FAISS only accepts C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(
        model.encode(texts, show_progress_bar=True), dtype=np.float32
    )

    # Create FAISS index
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return model, index

# ----------------------
# Spelling Correction
# ----------------------
def setup_spell_checker():
    """Create a SymSpell instance loaded with the English frequency dictionary.

    The dictionary is looked up first in the current working directory and,
    if it cannot be loaded from there, in the installed ``symspellpy``
    package. ``load_dictionary`` reports a missing file through its return
    value rather than by raising, so that value is checked explicitly.

    Returns:
        The configured ``SymSpell`` object. If no dictionary could be loaded
        a warning is logged and the (empty) checker is still returned, so
        the app keeps working without spelling correction.
    """
    import importlib.resources
    import logging

    dictionary_name = 'frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    candidates = [dictionary_name]
    try:
        candidates.append(
            str(importlib.resources.files('symspellpy') / dictionary_name)
        )
    except (AttributeError, ModuleNotFoundError):
        # No ``importlib.resources.files`` (Python < 3.9) or package data not
        # resolvable: only the working-directory copy can be tried.
        pass

    for path in candidates:
        if sym_spell.load_dictionary(path, term_index=0, count_index=1):
            return sym_spell

    logging.getLogger(__name__).warning(
        "Spelling dictionary %r could not be loaded; "
        "spelling correction is disabled.",
        dictionary_name,
    )
    return sym_spell

# ----------------------
# Streamlit App
# ----------------------
@st.cache_resource
def _load_resources(data_path):
    """Build the dataset, embedding model, FAISS index and spell checker once.

    Streamlit re-executes the script on every interaction; caching keeps the
    embedding of the whole dataset from being recomputed each time.
    """
    df = preprocess_data(data_path)
    model, faiss_index = setup_faiss(df)
    sym_spell = setup_spell_checker()
    return df, model, faiss_index, sym_spell


def _correct_query(sym_spell, query):
    """Return the spell-corrected form of a (possibly multi-word) query."""
    suggestions = sym_spell.lookup_compound(query, max_edit_distance=2)
    return suggestions[0].term if suggestions else query


def _search(df, model, faiss_index, query, therapeutic_class, top_k=5):
    """Return up to *top_k* rows of *df* nearest to *query*, honouring the filter."""
    total = faiss_index.ntotal
    if total == 0:
        return df.iloc[0:0].copy()

    filtered = therapeutic_class != 'All'
    # With a class filter the nearest rows may all belong to other classes,
    # so rank every row and filter afterwards instead of filtering a top-k.
    k = total if filtered else min(top_k, total)

    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
    _, neighbours = faiss_index.search(query_embedding, k)

    # FAISS pads missing neighbours with -1, which iloc would read as "last row".
    hits = [int(i) for i in neighbours[0] if i >= 0]
    results = df.iloc[hits]
    if filtered:
        results = results[results['Therapeutic Class'] == therapeutic_class]
    return results.head(top_k).copy()


def main():
    """Render the Streamlit UI: take a query, search, and list matching medicines."""
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # Load data and models (cached across reruns)
    df, model, faiss_index, sym_spell = _load_resources('medicine_dataset.csv')

    # User input
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )

    query = query.strip() if query else ''
    if not query:
        return

    # Spelling correction: only announce it when something actually changed.
    corrected = _correct_query(sym_spell, query)
    if corrected and corrected != ' '.join(query.lower().split()):
        query = corrected
        st.info(f"Did you mean: '{query}'?")

    # Semantic search + optional class filter
    results = _search(df, model, faiss_index, query, therapeutic_class)

    # Display results
    st.subheader("Recommended Medicines")
    if results.empty:
        st.warning("No matching medicines found. Try a different description or filter.")
        return

    for _, row in results.iterrows():
        with st.expander(f"💊 {row['name']}"):
            cols = st.columns(3)
            cols[0].write(f"**Uses:** {row['uses']}")
            cols[1].write(f"**Substitutes:** {row['substitutes']}")
            cols[2].write(f"**Side Effects:** {row['side_effects']}")

            cols2 = st.columns(2)
            cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
            cols2[1].write(f"Habit Forming: {row['Habit Forming']}")

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()