#new
import streamlit as st
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load the NLP model (a BERT-style sequence classifier from the Hugging Face Hub) for fraud prediction
nlp_model_name = "tajuarAkash/Health_Insurance_Fraud_detection_using_NLP"
nlp_tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
nlp_model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)
# Load the Random Forest model for ML-based prediction
rf_model_path = hf_hub_download(repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest", filename="random_forest_model.joblib")
rf_model = joblib.load(rf_model_path)
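# Note (suggestion, not in the original app): Streamlit reruns this whole script on every
# user interaction, so the tokenizer and both models above are reloaded each time. Wrapping
# the loads in cached helpers avoids that; st.cache_resource is Streamlit's standard API
# for caching heavy objects, e.g.:
#
# @st.cache_resource
# def load_nlp_model():
#     tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
#     model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)
#     return tokenizer, model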
# Preprocessing for the user inputs
def preprocess_input(input_data, method="ml"):
    if method == "ml":
        # For the Random Forest model, apply the same transformations used in training
        # (encoding and scaling).
        # ClaimDate comes from st.date_input as a single date, so convert it to an
        # ordinal (days since day 1 of year 1) via a pandas Timestamp.
        input_data['ClaimDate'] = pd.to_datetime(input_data['ClaimDate']).toordinal()

        # Wrap each feature value in a list to build a one-row DataFrame
        input_df = pd.DataFrame({
            'ClaimDate': [input_data['ClaimDate']],  # now an ordinal value
            'ClaimAmount': [input_data['ClaimAmount']],
            'PatientAge': [input_data['PatientAge']],
            'PatientIncome': [input_data['PatientIncome']],
            'PatientGender': [input_data['PatientGender']],
            'ProviderSpecialty': [input_data['ProviderSpecialty']],
            'ClaimStatus': [input_data['ClaimStatus']],
            'PatientMaritalStatus': [input_data['PatientMaritalStatus']],
            'PatientEmploymentStatus': [input_data['PatientEmploymentStatus']],
            'ProviderLocation': [input_data['ProviderLocation']],
            'ClaimType': [input_data['ClaimType']],
            'ClaimSubmissionMethod': [input_data['ClaimSubmissionMethod']],
        })

        # Encode categorical columns (these mappings must match the ones used in training)
        input_df['PatientGender'] = input_df['PatientGender'].apply(lambda x: 1 if x == 'Male' else 0)
        claim_status_mapping = {"Denied": 0, "Pending": 1, "Approved": 2}
        input_df['ClaimStatus'] = input_df['ClaimStatus'].map(claim_status_mapping)

        # NOTE: the remaining free-text columns (ProviderSpecialty, PatientMaritalStatus,
        # PatientEmploymentStatus, ProviderLocation, ClaimType, ClaimSubmissionMethod) still
        # need the encoders used during training, and fitting a fresh StandardScaler on a
        # single row zeroes every feature; ideally the scaler fitted on the training data
        # should be loaded and reused here instead.
        scaler = StandardScaler()
        input_scaled = scaler.fit_transform(input_df)
        return input_scaled
    elif method == "nlp":
        # For NLP-based prediction, concatenate the features into a single paragraph
        claim_date = input_data['ClaimDate']
        claim_amount = input_data['ClaimAmount']
        patient_age = input_data['PatientAge']
        patient_gender = input_data['PatientGender']
        provider_specialty = input_data['ProviderSpecialty']
        claim_status = input_data['ClaimStatus']
        patient_income = input_data['PatientIncome']
        patient_marital_status = input_data['PatientMaritalStatus']
        patient_employment_status = input_data['PatientEmploymentStatus']
        provider_location = input_data['ProviderLocation']
        claim_type = input_data['ClaimType']
        claim_submission_method = input_data['ClaimSubmissionMethod']

        # Build a sentence (paragraph) from the input data. ClaimLegitimacy is the label
        # we are trying to predict, so it is deliberately not part of the text.
        input_text = (
            f"The claim date is {claim_date}, with a claim amount of {claim_amount}. "
            f"The patient is {patient_age} years old, and their gender is {patient_gender}. "
            f"The provider specialty is {provider_specialty}. The claim status is {claim_status}. "
            f"The patient's income is {patient_income}, marital status is {patient_marital_status}, "
            f"and employment status is {patient_employment_status}. The provider location is {provider_location}. "
            f"The claim type is {claim_type}, and the claim submission method is {claim_submission_method}."
        )

        # Tokenize the input text for the NLP model
        inputs = nlp_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        return inputs
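# For reference: with method="nlp" the tokenizer returns a dict of tensors (typically
# input_ids and attention_mask) shaped [1, seq_len], and the classifier's logits come out
# shaped [1, num_labels]. If class probabilities were wanted in the UI, a softmax over the
# logits would give them (hypothetical addition, not in the original app):
#
# probs = torch.softmax(logits, dim=-1)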
# Title and description for the app
st.title("Insurance Claim Fraud Detection")
st.write("""
This app predicts whether an insurance claim is fraudulent or legitimate based on user input.
You can choose between **ML-based prediction** or **NLP-based prediction**.
""")
# Buttons to select prediction method
prediction_method = st.radio("Choose Prediction Method", ("ML Prediction", "NLP Prediction"))
# Input fields for the user (these should match your model features)
claim_date = st.date_input("Enter the claim date")
claim_amount = st.number_input("Enter the claim amount", min_value=0)
patient_age = st.number_input("Enter the patient's age", min_value=0)
patient_income = st.number_input("Enter the patient's income", min_value=0)
patient_gender = st.selectbox("Select patient's gender", ["Male", "Female"])
provider_specialty = st.text_input("Enter the provider specialty")
claim_status = st.selectbox("Claim status", ["Denied", "Pending", "Approved"])
patient_marital_status = st.text_input("Enter the marital status")
patient_employment_status = st.text_input("Enter the employment status")
provider_location = st.text_input("Enter the provider location")
claim_type = st.text_input("Enter the claim type")
claim_submission_method = st.text_input("Enter the claim submission method")
# ClaimLegitimacy is excluded from input (it’s the target that we want to predict)
# claim_legitimacy = st.selectbox("Claim legitimacy", ["Fraud", "Legitimate"])
# Create a button to trigger prediction
if st.button('Predict'):
    input_data = {
        "ClaimDate": claim_date,
        "ClaimAmount": claim_amount,
        "PatientAge": patient_age,
        "PatientIncome": patient_income,
        "PatientGender": patient_gender,
        "ProviderSpecialty": provider_specialty,
        "ClaimStatus": claim_status,
        "PatientMaritalStatus": patient_marital_status,
        "PatientEmploymentStatus": patient_employment_status,
        "ProviderLocation": provider_location,
        "ClaimType": claim_type,
        "ClaimSubmissionMethod": claim_submission_method,
        # ClaimLegitimacy is excluded: it's the target we want to predict.
    }

    # Preprocess the input data based on the selected method
    if prediction_method == "ML Prediction":
        input_scaled = preprocess_input(input_data, method="ml")
        # Get the prediction from the ML model (Random Forest); predict returns an array,
        # so take the first (and only) element.
        prediction = rf_model.predict(input_scaled)
        if prediction[0] == 1:
            st.write("This claim is predicted to be **fraudulent** (ML model).")
        else:
            st.write("This claim is predicted to be **legitimate** (ML model).")
    elif prediction_method == "NLP Prediction":
        inputs = preprocess_input(input_data, method="nlp")
        # Get the prediction from the NLP model (BERT-style classifier)
        with torch.no_grad():
            logits = nlp_model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=-1).item()
        if predicted_class == 1:
            st.write("This claim is predicted to be **fraudulent** (NLP model).")
        else:
            st.write("This claim is predicted to be **legitimate** (NLP model).")