Spaces:

ceejaytheanalyst
/

Insurance_code_mapping

Sleeping

File size: 3,449 Bytes

0974b80
 
0c98cfe
e1b1664
0974b80
 
 
 
0c98cfe
 
0974b80
 
 
 
272a9aa
 
 
0c98cfe
 
 
272a9aa
 
b513d7d
272a9aa
0c98cfe
0974b80
 
 
b513d7d
 
0974b80
7fca223
e2a08b5
 
 
7fca223
0974b80
0c98cfe
0974b80
 
 
0c98cfe
 
0974b80
0c98cfe
0974b80
 
 
7107507
 
 
0974b80
0c98cfe
0974b80
 
0c98cfe
 
 
0974b80
 
0c98cfe
 
 
 
7fb1853
0c98cfe
 
 
 
 
 
 
 
0974b80

import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util
from spellchecker import SpellChecker
import pickle

# Streamlit re-executes this script from the top on every widget interaction,
# so the expensive resources below are created through st.cache_resource and
# reused across reruns. On Streamlit releases without cache_resource the
# decorator degrades to a no-op, i.e. everything is simply loaded on each run.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_model():
    """Return the pre-trained SentenceTransformer used to embed user input."""
    return SentenceTransformer('neuml/pubmedbert-base-embeddings')


@_cache_resource
def _load_stored_data():
    """Return the unpickled contents of ``embeddings_1.pkl``.

    The rest of this file reads the keys ``"embeddings"``, ``"SBS_code"`` and
    ``"Description"`` from the returned mapping.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only ship an embeddings_1.pkl that was produced by a trusted process.
    with open("embeddings_1.pkl", "rb") as fIn:
        return pickle.load(fIn)


@_cache_resource
def _load_spell_checker():
    """Return the SpellChecker instance used to validate user input."""
    return SpellChecker()


# Module-level names used by the functions below.
model = _load_model()
stored_data = _load_stored_data()
stored_embeddings = stored_data["embeddings"]
spell = _load_spell_checker()

# Define a function to check for misspelled words
def check_misspelled_words(user_input):
    """Return the alphabetic tokens of *user_input* that look misspelled.

    The text is split on whitespace. Any token containing a non-letter
    character (digits, punctuation, hyphens, ...) is skipped without being
    spell-checked. A remaining token is reported, in its original casing,
    when the module-level ``spell`` checker's correction for its lower-cased
    form is not that same lower-cased form.

    Args:
        user_input: Raw text entered by the user.

    Returns:
        A list of the suspicious tokens in input order (empty if none).
    """
    flagged = []
    for token in user_input.split():
        # isalpha() already excludes purely numeric tokens, so no separate
        # digit test is required.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if spell.correction(lowered) != lowered:
            flagged.append(token)
    return flagged

# Define the function for mapping code
def mapping_code(user_input):
    """Return the stored entries most similar to *user_input*.

    The input is lower-cased and embedded with the module-level ``model``.
    That embedding is compared by cosine similarity against every item of
    ``stored_embeddings``, and the scores are paired positionally with
    ``stored_data["SBS_code"]`` and ``stored_data["Description"]``.

    Args:
        user_input: Free-text description; it must contain at least two
            whitespace-separated words.

    Returns:
        A list of at most five dicts with the keys ``"Code"``,
        ``"Description"`` and ``"Similarity Score"`` (a float), ordered from
        the highest to the lowest similarity.

    Raises:
        ValueError: If *user_input* contains fewer than two words.
    """
    # Reject empty or single-word input: it is too short to match reliably.
    if len(user_input.split()) <= 1:
        raise ValueError("Input sentence should be more than 1 word long.Please provide the Full description")

    emb1 = model.encode(user_input.lower())

    # Build the score list explicitly; previously it was appended to without
    # ever being created, so every valid request failed with NameError.
    # float() collapses each single-pair cos_sim result to a plain number.
    # NOTE(review): assumes each item of stored_embeddings is one vector;
    # a single batched cos_sim call may be faster - confirm the stored type.
    similarities = [float(util.cos_sim(sentence, emb1)) for sentence in stored_embeddings]

    # Pair every score with its code and description, best match first.
    # sorted() is stable, so ties keep their stored order.
    ranked = sorted(
        zip(stored_data["SBS_code"], stored_data["Description"], similarities),
        key=lambda row: row[2],
        reverse=True,
    )

    # Slicing copes with fewer than five stored entries.
    return [
        {"Code": code, "Description": description, "Similarity Score": score}
        for code, description, score in ranked[:5]
    ]

# Streamlit frontend interface
def main():
    """Render the Streamlit page and handle a press of the "Map" button.

    The page shows a title, a disclaimer and a text box. When the button is
    pressed, empty input is rejected, then the input is spell-checked; if any
    words are flagged they are shown to the user, otherwise the ranked
    matches from ``mapping_code`` are listed. A ``ValueError`` raised while
    checking or mapping is displayed as an error message.
    """
    st.title("CPT Description Mapping")
    st.markdown("**Note:** Similarity scores are not absolute and should be further confirmed manually for accuracy.")

    user_input = st.text_input(
        "Enter CPT description:",
        placeholder="Please enter a full description for better search results.",
    )

    # Nothing to do until the user asks for a mapping.
    if not st.button("Map"):
        return

    # Blank or whitespace-only input is rejected up front.
    if not user_input.strip():
        st.error("Input box cannot be empty.")
        return

    st.write("Please wait for a moment .... ")
    try:
        suspect_words = check_misspelled_words(user_input)
        if suspect_words:
            st.write("Please enter a detailed correct full description")
            st.write(f"Kindly check if these words are spelt correctly :{suspect_words}")
            return

        matches = mapping_code(user_input)
        st.write("Top 5 similar sentences:")
        for rank, match in enumerate(matches, start=1):
            code = match["Code"]
            description = match["Description"]
            score = match["Similarity Score"]
            st.write(f"{rank}. Code: {code}, Description: {description}, Similarity Score: {score:.4f}")
    except ValueError as err:
        st.error(str(err))

# Build the page only when this file is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()