#new
import streamlit as st
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the NLP model (a BERT-style sequence classifier) used for text-based fraud prediction
nlp_model_name = "tajuarAkash/Health_Insurance_Fraud_detection_using_NLP"
nlp_tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
nlp_model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)

# Load the Random Forest model for ML-based prediction
rf_model_path = hf_hub_download(repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest", filename="random_forest_model.joblib")
rf_model = joblib.load(rf_model_path)
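
# Optional pattern (a sketch, not wired into the app above): Streamlit reruns this
# script on every widget interaction, so wrapping the loaders in st.cache_resource
# (available in Streamlit >= 1.18) keeps the downloaded models in memory across
# reruns instead of reloading them each time. The repo IDs are the same as above;
# the function name is illustrative.
@st.cache_resource
def load_models():
    tokenizer = AutoTokenizer.from_pretrained(nlp_model_name)
    model = AutoModelForSequenceClassification.from_pretrained(nlp_model_name)
    rf_path = hf_hub_download(
        repo_id="tajuarAkash/Health_Insurance_Fraud_detection_using_Random_forest",
        filename="random_forest_model.joblib",
    )
    return tokenizer, model, joblib.load(rf_path)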

# Preprocessing for the user inputs
def preprocess_input(input_data, method="ml"):
    if method == "ml":
        # For the Random Forest path, apply the same kinds of transformations used in
        # training: date-to-ordinal conversion, categorical encoding, and scaling.
        # st.date_input returns a single date, so call .toordinal() on the Timestamp
        # directly (the .dt accessor only exists on Series).
        input_data['ClaimDate'] = pd.to_datetime(input_data['ClaimDate']).toordinal()

        # Wrap each feature value in a list to create a valid DataFrame
        input_df = pd.DataFrame({
            'ClaimDate': [input_data['ClaimDate']],  # Now converted to ordinal value
            'ClaimAmount': [input_data['ClaimAmount']],
            'PatientAge': [input_data['PatientAge']],
            'PatientIncome': [input_data['PatientIncome']],
            'PatientGender': [input_data['PatientGender']],
            'ProviderSpecialty': [input_data['ProviderSpecialty']],
            'ClaimStatus': [input_data['ClaimStatus']],
            'PatientMaritalStatus': [input_data['PatientMaritalStatus']],
            'PatientEmploymentStatus': [input_data['PatientEmploymentStatus']],
            'ProviderLocation': [input_data['ProviderLocation']],
            'ClaimType': [input_data['ClaimType']],
            'ClaimSubmissionMethod': [input_data['ClaimSubmissionMethod']],
        })

        # Encode categoricals. Gender and claim status use fixed mappings; the remaining
        # free-text fields are label-encoded here only as a stand-in, since the encoders
        # fitted during training are not available in this app.
        input_df['PatientGender'] = input_df['PatientGender'].apply(lambda x: 1 if x == 'Male' else 0)
        claim_status_mapping = {"Denied": 0, "Pending": 1, "Approved": 2}
        input_df['ClaimStatus'] = input_df['ClaimStatus'].map(claim_status_mapping)
        for col in input_df.select_dtypes(include='object').columns:
            input_df[col] = input_df[col].astype('category').cat.codes

        # Placeholder scaling: ideally the scaler fitted during training would be
        # saved with the model and reloaded here instead of refitting on one row.
        scaler = StandardScaler()
        input_scaled = scaler.fit_transform(input_df)

        return input_scaled

    elif method == "nlp":
        # For NLP-based prediction, concatenate features into a single paragraph
        claim_date = input_data['ClaimDate']
        claim_amount = input_data['ClaimAmount']
        patient_age = input_data['PatientAge']
        patient_gender = input_data['PatientGender']
        provider_specialty = input_data['ProviderSpecialty']
        claim_status = input_data['ClaimStatus']
        patient_income = input_data['PatientIncome']
        patient_marital_status = input_data['PatientMaritalStatus']
        patient_employment_status = input_data['PatientEmploymentStatus']
        provider_location = input_data['ProviderLocation']
        claim_type = input_data['ClaimType']
        claim_submission_method = input_data['ClaimSubmissionMethod']

        # Compose the claim details into a single paragraph for the text classifier.
        # ClaimLegitimacy is the target label, so it is not included in the input text.
        input_text = f"The claim date is {claim_date}, with a claim amount of {claim_amount}. " \
                     f"The patient is {patient_age} years old, and their gender is {patient_gender}. " \
                     f"The provider specialty is {provider_specialty}. The claim status is {claim_status}. " \
                     f"The patient's income is {patient_income}, marital status is {patient_marital_status}, " \
                     f"and employment status is {patient_employment_status}. The provider location is {provider_location}. " \
                     f"The claim type is {claim_type}, and the claim submission method is {claim_submission_method}."
        
        # Tokenize the input text for NLP
        inputs = nlp_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        return inputs
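
# Note on return types: preprocess_input(..., method="ml") yields a 2-D NumPy array
# with a single row of numeric features for rf_model.predict, while method="nlp"
# yields the tokenizer's encoding (input_ids / attention_mask tensors) that can be
# passed to the classifier as nlp_model(**inputs).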


# Title and description for the app
st.title("Insurance Claim Fraud Detection")
st.write("""
This app predicts whether an insurance claim is fraudulent or legitimate based on user input.
You can choose between **ML-based prediction** or **NLP-based prediction**.
""")

# Buttons to select prediction method
prediction_method = st.radio("Choose Prediction Method", ("ML Prediction", "NLP Prediction"))

# Input fields for the user (these should match your model features)
claim_date = st.date_input("Enter the claim date")
claim_amount = st.number_input("Enter the claim amount", min_value=0)
patient_age = st.number_input("Enter the patient's age", min_value=0)
patient_income = st.number_input("Enter the patient's income", min_value=0)
patient_gender = st.selectbox("Select patient's gender", ["Male", "Female"])
provider_specialty = st.text_input("Enter the provider specialty")
claim_status = st.selectbox("Claim status", ["Denied", "Pending", "Approved"])
patient_marital_status = st.text_input("Enter the marital status")
patient_employment_status = st.text_input("Enter the employment status")
provider_location = st.text_input("Enter the provider location")
claim_type = st.text_input("Enter the claim type")
claim_submission_method = st.text_input("Enter the claim submission method")
# ClaimLegitimacy is excluded from input (it’s the target that we want to predict)
# claim_legitimacy = st.selectbox("Claim legitimacy", ["Fraud", "Legitimate"])

# Create a button to trigger prediction
if st.button('Predict'):
    input_data = {
        "ClaimDate": claim_date,
        "ClaimAmount": claim_amount,
        "PatientAge": patient_age,
        "PatientIncome": patient_income,
        "PatientGender": patient_gender,
        "ProviderSpecialty": provider_specialty,
        "ClaimStatus": claim_status,
        "PatientMaritalStatus": patient_marital_status,
        "PatientEmploymentStatus": patient_employment_status,
        "ProviderLocation": provider_location,
        "ClaimType": claim_type,
        "ClaimSubmissionMethod": claim_submission_method,
        # "ClaimLegitimacy": claim_legitimacy,  # Removed since it's the target we want to predict
    }

    # Preprocess the input data based on the selected method
    if prediction_method == "ML Prediction":
        input_scaled = preprocess_input(input_data, method="ml")
        
        # Get the prediction from the ML model (Random Forest)
        prediction = rf_model.predict(input_scaled)
        
        if prediction[0] == 1:
            st.write("This claim is predicted to be **fraudulent** (ML model).")
        else:
            st.write("This claim is predicted to be **legitimate** (ML model).")
    
    elif prediction_method == "NLP Prediction":
        inputs = preprocess_input(input_data, method="nlp")

        # Get the prediction from the NLP model (BERT)
        with torch.no_grad():
            logits = nlp_model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=-1).item()

        if predicted_class == 1:
            st.write("This claim is predicted to be **fraudulent** (NLP model).")
        else:
            st.write("This claim is predicted to be **legitimate** (NLP model).")




