import streamlit as st
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the NLP model (Hugging Face Transformers, BERT or similar) for text-based fraud prediction
nlp_model_name = "tajuarAkash/Health_Insurance_Fraud_detection_using_NLP"  # replace with your Hugging Face model path
nlp_tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
nlp_model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)

# Load the Random Forest model for the ML-based prediction
rf_model_path = hf_hub_download(
    repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest",
    filename="random_forest_model.joblib",
)
rf_model = joblib.load(rf_model_path)
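# Optional sketch (not wired in): Streamlit reruns this whole script on every user
# interaction, so the models above are reloaded each time. Wrapping the loaders in a
# function decorated with st.cache_resource keeps them in memory across reruns, e.g.:
#
#   @st.cache_resource
#   def load_models():
#       tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
#       model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)
#       path = hf_hub_download(
#           repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest",
#           filename="random_forest_model.joblib",
#       )
#       return tokenizer, model, joblib.load(path)
#
#   nlp_tokenizer, nlp_model, rf_model = load_models()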
# Preprocessing for the user inputs
def preprocess_input(input_data, method="ml"):
    if method == "ml":
        # For the Random Forest prediction, apply the same transformations
        # (encoding and scaling) that were used during training.

        # Convert ClaimDate to an ordinal value (number of days since day 1 of year 1)
        claim_date_ordinal = pd.to_datetime(input_data['ClaimDate']).toordinal()

        # Wrap each feature value in a list to create a single-row DataFrame
        input_df = pd.DataFrame({
            'ClaimDate': [claim_date_ordinal],  # now converted to an ordinal value
            'ClaimAmount': [input_data['ClaimAmount']],
            'PatientAge': [input_data['PatientAge']],
            'PatientIncome': [input_data['PatientIncome']],
            'PatientGender': [input_data['PatientGender']],
            'ProviderSpecialty': [input_data['ProviderSpecialty']],
            'ClaimStatus': [input_data['ClaimStatus']],
            'PatientMaritalStatus': [input_data['PatientMaritalStatus']],
            'PatientEmploymentStatus': [input_data['PatientEmploymentStatus']],
            'ProviderLocation': [input_data['ProviderLocation']],
            'ClaimType': [input_data['ClaimType']],
            'ClaimSubmissionMethod': [input_data['ClaimSubmissionMethod']],
        })

        # Encode the categorical columns (use the same encoders as in training)
        input_df['PatientGender'] = input_df['PatientGender'].apply(lambda x: 1 if x == 'Male' else 0)
        claim_status_mapping = {"Denied": 0, "Pending": 1, "Approved": 2}
        input_df['ClaimStatus'] = input_df['ClaimStatus'].map(claim_status_mapping)

        # NOTE: the remaining free-text columns (ProviderSpecialty, PatientMaritalStatus,
        # PatientEmploymentStatus, ProviderLocation, ClaimType, ClaimSubmissionMethod)
        # must also be encoded with the encoders used during training; StandardScaler
        # cannot operate on string values.

        # Scale the data. Fitting a new StandardScaler on a single row is only a
        # placeholder: in practice, reuse the scaler fitted on the training set.
        scaler = StandardScaler()
        input_scaled = scaler.fit_transform(input_df)
        return input_scaled

    elif method == "nlp":
        # For the NLP-based prediction, concatenate the features into a single paragraph
        claim_date = input_data['ClaimDate']
        claim_amount = input_data['ClaimAmount']
        patient_age = input_data['PatientAge']
        patient_gender = input_data['PatientGender']
        provider_specialty = input_data['ProviderSpecialty']
        claim_status = input_data['ClaimStatus']
        patient_income = input_data['PatientIncome']
        patient_marital_status = input_data['PatientMaritalStatus']
        patient_employment_status = input_data['PatientEmploymentStatus']
        provider_location = input_data['ProviderLocation']
        claim_type = input_data['ClaimType']
        claim_submission_method = input_data['ClaimSubmissionMethod']

        # Create a sentence (paragraph) describing the claim. ClaimLegitimacy is the
        # target we want to predict, so it is not part of the input text.
        input_text = f"The claim date is {claim_date}, with a claim amount of {claim_amount}. " \
                     f"The patient is {patient_age} years old, and their gender is {patient_gender}. " \
                     f"The provider specialty is {provider_specialty}. The claim status is {claim_status}. " \
                     f"The patient's income is {patient_income}, marital status is {patient_marital_status}, " \
                     f"and employment status is {patient_employment_status}. The provider location is {provider_location}. " \
                     f"The claim type is {claim_type}, and the claim submission method is {claim_submission_method}."

        # Tokenize the input text for the NLP model
        inputs = nlp_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        return inputs
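# Sketch (assumption): instead of refitting a StandardScaler on one row at inference
# time, the training pipeline could persist its fitted scaler and encoders with joblib
# and this app could load and reuse them. The file names below ("scaler.joblib",
# "encoders.joblib") are hypothetical and must match whatever the training script saved.
#
#   # at training time:
#   # joblib.dump(scaler, "scaler.joblib")
#   # joblib.dump(encoders, "encoders.joblib")
#
#   # in this app:
#   # scaler = joblib.load(hf_hub_download(repo_id="...", filename="scaler.joblib"))
#   # input_scaled = scaler.transform(input_df)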
""") # Buttons to select prediction method prediction_method = st.radio("Choose Prediction Method", ("ML Prediction", "NLP Prediction")) # Input fields for the user (these should match your model features) claim_date = st.date_input("Enter the claim date") claim_amount = st.number_input("Enter the claim amount", min_value=0) patient_age = st.number_input("Enter the patient's age", min_value=0) patient_income = st.number_input("Enter the patient's income", min_value=0) patient_gender = st.selectbox("Select patient's gender", ["Male", "Female"]) provider_specialty = st.text_input("Enter the provider specialty") claim_status = st.selectbox("Claim status", ["Denied", "Pending", "Approved"]) patient_marital_status = st.text_input("Enter the marital status") patient_employment_status = st.text_input("Enter the employment status") provider_location = st.text_input("Enter the provider location") claim_type = st.text_input("Enter the claim type") claim_submission_method = st.text_input("Enter the claim submission method") # ClaimLegitimacy is excluded from input (it’s the target that we want to predict) # claim_legitimacy = st.selectbox("Claim legitimacy", ["Fraud", "Legitimate"]) # Create a button to trigger prediction if st.button('Predict'): input_data = { "ClaimDate": claim_date, "ClaimAmount": claim_amount, "PatientAge": patient_age, "PatientIncome": patient_income, "PatientGender": patient_gender, "ProviderSpecialty": provider_specialty, "ClaimStatus": claim_status, "PatientMaritalStatus": patient_marital_status, "PatientEmploymentStatus": patient_employment_status, "ProviderLocation": provider_location, "ClaimType": claim_type, "ClaimSubmissionMethod": claim_submission_method, # "ClaimLegitimacy": claim_legitimacy, # Removed since it's the target we want to predict } # Preprocess the input data based on the selected method if prediction_method == "ML Prediction": input_scaled = preprocess_input(input_data, method="ml") # Get the prediction from the ML model (Random Forest) prediction = rf_model.predict(input_scaled) if prediction == 1: st.write("This claim is predicted to be **fraudulent** (ML model).") else: st.write("This claim is predicted to be **legitimate** (ML model).") elif prediction_method == "NLP Prediction": inputs = preprocess_input(input_data, method="nlp") # Get the prediction from the NLP model (BERT) with torch.no_grad(): logits = nlp_model(**inputs).logits predicted_class = torch.argmax(logits, dim=-1).item() if predicted_class == 1: st.write("This claim is predicted to be **fraudulent** (NLP model).") else: st.write("This claim is predicted to be **legitimate** (NLP model).") # Title and description for the app st.title("Insurance Claim Fraud Detection") st.write(""" This app predicts whether an insurance claim is fraudulent or legitimate based on user input. You can choose between **ML-based prediction** or **NLP-based prediction**. 
""") # Buttons to select prediction method prediction_method = st.radio("Choose Prediction Method", ("ML Prediction", "NLP Prediction")) # Input fields for the user (these should match your model features) claim_date = st.date_input("Enter the claim date") claim_amount = st.number_input("Enter the claim amount", min_value=0) patient_age = st.number_input("Enter the patient's age", min_value=0) patient_income = st.number_input("Enter the patient's income", min_value=0) patient_gender = st.selectbox("Select patient's gender", ["Male", "Female"]) provider_specialty = st.text_input("Enter the provider specialty") claim_status = st.selectbox("Claim status", ["Denied", "Pending", "Approved"]) patient_marital_status = st.text_input("Enter the marital status") patient_employment_status = st.text_input("Enter the employment status") provider_location = st.text_input("Enter the provider location") claim_type = st.text_input("Enter the claim type") claim_submission_method = st.text_input("Enter the claim submission method") claim_legitimacy = st.selectbox("Claim legitimacy", ["Fraud", "Legitimate"]) # Create a button to trigger prediction if st.button('Predict'): input_data = { "ClaimDate": claim_date, "ClaimAmount": claim_amount, "PatientAge": patient_age, "PatientIncome": patient_income, "PatientGender": patient_gender, "ProviderSpecialty": provider_specialty, "ClaimStatus": claim_status, "PatientMaritalStatus": patient_marital_status, "PatientEmploymentStatus": patient_employment_status, "ProviderLocation": provider_location, "ClaimType": claim_type, "ClaimSubmissionMethod": claim_submission_method, "ClaimLegitimacy": claim_legitimacy, } # Preprocess the input data based on the selected method if prediction_method == "ML Prediction": input_scaled = preprocess_input(input_data, method="ml") # Get the prediction from the ML model (Random Forest) prediction = rf_model.predict(input_scaled) if prediction == 1: st.write("This claim is predicted to be **fraudulent** (ML model).") else: st.write("This claim is predicted to be **legitimate** (ML model).") elif prediction_method == "NLP Prediction": inputs = preprocess_input(input_data, method="nlp") # Get the prediction from the NLP model (BERT) with torch.no_grad(): logits = nlp_model(**inputs).logits predicted_class = torch.argmax(logits, dim=-1).item() if predicted_class == 1: st.write("This claim is predicted to be **fraudulent** (NLP model).") else: st.write("This claim is predicted to be **legitimate** (NLP model).")