#new
import streamlit as st
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the NLP model (a BERT-style sequence classifier) used for text-based fraud prediction
nlp_model_name = "tajuarAkash/Health_Insurance_Fraud_detection_using_NLP"
nlp_tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
nlp_model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)

# Load the Random Forest model for ML-based prediction
rf_model_path = hf_hub_download(repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest", filename="random_forest_model.joblib")
rf_model = joblib.load(rf_model_path)
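
# Optional pattern (a sketch, not wired into the app above): Streamlit reruns this
# script on every widget interaction, so wrapping the loaders in st.cache_resource
# (available in Streamlit >= 1.18) keeps the downloaded models in memory across
# reruns instead of reloading them each time. The repo IDs are the same as above;
# the function name is illustrative.
@st.cache_resource
def load_models():
    tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
    model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)
    rf_path = hf_hub_download(
        repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest",
        filename="random_forest_model.joblib",
    )
    return tokenizer, model, joblib.load(rf_path)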

# Preprocessing for the user inputs
def preprocess_input(input_data, method="ml"):
    if method == "ml":
        # For the Random Forest path, apply the same kinds of transformations used in
        # training: date-to-ordinal conversion, categorical encoding, and scaling.
        # st.date_input returns a single date, so call .toordinal() on the Timestamp
        # directly (the .dt accessor only exists on Series).
        input_data['ClaimDate'] = pd.to_datetime(input_data['ClaimDate']).toordinal()

        # Wrap each feature value in a list to create a valid DataFrame
        input_df = pd.DataFrame({
            'ClaimDate': [input_data['ClaimDate']],  # Now converted to ordinal value
            'ClaimAmount': [input_data['ClaimAmount']],
            'PatientAge': [input_data['PatientAge']],
            'PatientIncome': [input_data['PatientIncome']],
            'PatientGender': [input_data['PatientGender']],
            'ProviderSpecialty': [input_data['ProviderSpecialty']],
            'ClaimStatus': [input_data['ClaimStatus']],
            'PatientMaritalStatus': [input_data['PatientMaritalStatus']],
            'PatientEmploymentStatus': [input_data['PatientEmploymentStatus']],
            'ProviderLocation': [input_data['ProviderLocation']],
            'ClaimType': [input_data['ClaimType']],
            'ClaimSubmissionMethod': [input_data['ClaimSubmissionMethod']],
        })

        # Encode categoricals. Gender and claim status use fixed mappings; the remaining
        # free-text fields are label-encoded here only as a stand-in, since the encoders
        # fitted during training are not available in this app.
        input_df['PatientGender'] = input_df['PatientGender'].apply(lambda x: 1 if x == 'Male' else 0)
        claim_status_mapping = {"Denied": 0, "Pending": 1, "Approved": 2}
        input_df['ClaimStatus'] = input_df['ClaimStatus'].map(claim_status_mapping)
        for col in input_df.select_dtypes(include='object').columns:
            input_df[col] = input_df[col].astype('category').cat.codes

        # Placeholder scaling: ideally the scaler fitted during training would be
        # saved with the model and reloaded here instead of refitting on one row.
        scaler = StandardScaler()
        input_scaled = scaler.fit_transform(input_df)

        return input_scaled

    elif method == "nlp":
        # For NLP-based prediction, concatenate features into a single paragraph
        claim_date = input_data['ClaimDate']
        claim_amount = input_data['ClaimAmount']
        patient_age = input_data['PatientAge']
        patient_gender = input_data['PatientGender']
        provider_specialty = input_data['ProviderSpecialty']
        claim_status = input_data['ClaimStatus']
        patient_income = input_data['PatientIncome']
        patient_marital_status = input_data['PatientMaritalStatus']
        patient_employment_status = input_data['PatientEmploymentStatus']
        provider_location = input_data['ProviderLocation']
        claim_type = input_data['ClaimType']
        claim_submission_method = input_data['ClaimSubmissionMethod']

        # Compose the claim details into a single paragraph for the text classifier.
        # ClaimLegitimacy is the target label, so it is not included in the input text.
        input_text = f"The claim date is {claim_date}, with a claim amount of {claim_amount}. " \
                     f"The patient is {patient_age} years old, and their gender is {patient_gender}. " \
                     f"The provider specialty is {provider_specialty}. The claim status is {claim_status}. " \
                     f"The patient's income is {patient_income}, marital status is {patient_marital_status}, " \
                     f"and employment status is {patient_employment_status}. The provider location is {provider_location}. " \
                     f"The claim type is {claim_type}, and the claim submission method is {claim_submission_method}."
        
        # Tokenize the input text for NLP
        inputs = nlp_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        return inputs
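
# Note on return types: preprocess_input(..., method="ml") yields a 2-D NumPy array
# with a single row of numeric features for rf_model.predict, while method="nlp"
# yields the tokenizer's encoding (input_ids / attention_mask tensors) that can be
# passed to the classifier as nlp_model(**inputs).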


# Title and description for the app
st.title("Insurance Claim Fraud Detection")
st.write("""
This app predicts whether an insurance claim is fraudulent or legitimate based on user input.
You can choose between **ML-based prediction** or **NLP-based prediction**.
""")

# Buttons to select prediction method
prediction_method = st.radio("Choose Prediction Method", ("ML Prediction", "NLP Prediction"))

# Input fields for the user (these should match your model features)
claim_date = st.date_input("Enter the claim date")
claim_amount = st.number_input("Enter the claim amount", min_value=0)
patient_age = st.number_input("Enter the patient's age", min_value=0)
patient_income = st.number_input("Enter the patient's income", min_value=0)
patient_gender = st.selectbox("Select patient's gender", ["Male", "Female"])
provider_specialty = st.text_input("Enter the provider specialty")
claim_status = st.selectbox("Claim status", ["Denied", "Pending", "Approved"])
patient_marital_status = st.text_input("Enter the marital status")
patient_employment_status = st.text_input("Enter the employment status")
provider_location = st.text_input("Enter the provider location")
claim_type = st.text_input("Enter the claim type")
claim_submission_method = st.text_input("Enter the claim submission method")
# ClaimLegitimacy is excluded from input (it’s the target that we want to predict)
# claim_legitimacy = st.selectbox("Claim legitimacy", ["Fraud", "Legitimate"])

# Create a button to trigger prediction
if st.button('Predict'):
    input_data = {
        "ClaimDate": claim_date,
        "ClaimAmount": claim_amount,
        "PatientAge": patient_age,
        "PatientIncome": patient_income,
        "PatientGender": patient_gender,
        "ProviderSpecialty": provider_specialty,
        "ClaimStatus": claim_status,
        "PatientMaritalStatus": patient_marital_status,
        "PatientEmploymentStatus": patient_employment_status,
        "ProviderLocation": provider_location,
        "ClaimType": claim_type,
        "ClaimSubmissionMethod": claim_submission_method,
        # "ClaimLegitimacy": claim_legitimacy,  # Removed since it's the target we want to predict
    }

    # Preprocess the input data based on the selected method
    if prediction_method == "ML Prediction":
        input_scaled = preprocess_input(input_data, method="ml")
        
        # Get the prediction from the ML model (Random Forest)
        prediction = rf_model.predict(input_scaled)
        
        if prediction[0] == 1:
            st.write("This claim is predicted to be **fraudulent** (ML model).")
        else:
            st.write("This claim is predicted to be **legitimate** (ML model).")
    
    elif prediction_method == "NLP Prediction":
        inputs = preprocess_input(input_data, method="nlp")

        # Get the prediction from the NLP model (BERT)
        with torch.no_grad():
            logits = nlp_model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=-1).item()

        if predicted_class == 1:
            st.write("This claim is predicted to be **fraudulent** (NLP model).")
        else:
            st.write("This claim is predicted to be **legitimate** (NLP model).")




