{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Credit Card Fraud Detection\n",
    "\n",
    "This notebook trains a Random Forest classifier to flag fraudulent credit card transactions and exports the fitted model together with its feature scaler.\n",
    "\n",
    "The data is loaded from the Hugging Face Hub dataset `Nooha/cc_fraud_detection_dataset` (an earlier version of this notebook described it as a Kaggle dataset)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### This notebook is comprised of the following steps:\n",
    "\n",
    "1. Library Imports\n",
    "2. Configuration\n",
    "3. Data Loading\n",
    "4. Data Preprocessing\n",
    "5. Data Splitting\n",
    "6. Feature Scaling\n",
    "7. Model Training (class imbalance is handled here via `class_weight`)\n",
    "8. Model Evaluation\n",
    "9. Model Export"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Library Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from datasets import load_dataset\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay\n",
    "from sklearn.model_selection import train_test_split\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration\n",
    "\n",
    "Everything a reader might want to tune lives in this one cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_NAME = 'Nooha/cc_fraud_detection_dataset'\n",
    "TARGET = 'is_fraud'\n",
    "\n",
    "TEST_SIZE = 0.2\n",
    "SEED = 42  # shared by the split and the forest so a re-run reproduces the same results\n",
    "\n",
    "MODEL_PATH = 'cc_fraud_model.pkl'\n",
    "SCALER_PATH = 'cc_fraud_scaler.pkl'\n",
    "\n",
    "# Columns that appear to identify a person or an account. They are hidden from\n",
    "# previews so they do not end up in the saved notebook outputs.\n",
    "SENSITIVE_COLUMNS = ['ssn', 'cc_num', 'first', 'last', 'dob', 'acct_num']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(DATASET_NAME)\n",
    "\n",
    "# Convert the 'train' split to a DataFrame through the datasets API rather than\n",
    "# building the frame from the Dataset object with pd.DataFrame(...).\n",
    "df = dataset['train'].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first rows, leaving out the identifying columns\n",
    "df.drop(columns=SENSITIVE_COLUMNS, errors='ignore').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column dtypes and memory usage. info() prints its report itself and returns\n",
    "# None, so it is not wrapped in print().\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Missing values per column\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Only numeric columns are used as features. In the last recorded run these were `cc_num`, `zip`, `city_pop`, `acct_num`, `unix_time` and `amt`.\n",
    "\n",
    "**NOTE (review):** `cc_num`, `acct_num` and `zip` look like identifiers rather than measurements, so the model may be memorising accounts instead of learning fraud patterns. They are kept here so the exported model and scaler keep the same six-column input layout. TODO: confirm whether they should be dropped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The target is read straight from the raw frame; the features are every other\n",
    "# numeric column. Nothing is assigned into a derived frame, so re-running this\n",
    "# cell always gives the same X and y.\n",
    "y = df[TARGET]\n",
    "X = df.select_dtypes(include='number').drop(columns=[TARGET], errors='ignore')\n",
    "\n",
    "assert TARGET not in X.columns, 'target leaked into the feature matrix'\n",
    "assert len(X) == len(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the shape of the dataset\n",
    "print('Shape of Features (X):', X.shape)\n",
    "print('Shape of Target (y):', y.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Splitting\n",
    "\n",
    "The split happens **before** scaling so that the scaler never sees the test rows. The split is stratified on the target because fraud is rare (1,898 fraud rows out of 529,339 test rows in the last recorded run), which keeps the fraud rate the same in both partitions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_raw, X_test_raw, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=TEST_SIZE,\n",
    "    random_state=SEED,\n",
    "    stratify=y,\n",
    ")\n",
    "\n",
    "assert len(X_train_raw) + len(X_test_raw) == len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the shape of the splits\n",
    "print('Shape of X_train:', X_train_raw.shape)\n",
    "print('Shape of X_test:', X_test_raw.shape)\n",
    "print('Shape of y_train:', y_train.shape)\n",
    "print('Shape of y_test:', y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the scaler on the training rows only, then apply the same transform to\n",
    "# the test rows. Fitting on the full dataset would leak test-set statistics\n",
    "# into training and into the exported scaler.\n",
    "scaler = StandardScaler()\n",
    "X_train = scaler.fit_transform(X_train_raw)\n",
    "X_test = scaler.transform(X_test_raw)\n",
    "\n",
    "# First five scaled training rows\n",
    "X_train[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the Random Forest model\n",
    "model = RandomForestClassifier(\n",
    "    n_estimators=100,\n",
    "    max_depth=10,\n",
    "    random_state=SEED,\n",
    "    class_weight='balanced',  # Handle class imbalance\n",
    "    n_jobs=-1,  # train the trees on all available cores\n",
    ")\n",
    "\n",
    "# Train the model\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Display training completion message\n",
    "print('Model training completed!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Evaluation\n",
    "\n",
    "Accuracy alone is misleading on data this imbalanced, so read it together with the per-class precision and recall below.\n",
    "\n",
    "In the last recorded run of the earlier version of this notebook, accuracy was 0.9615 while the fraud class had precision 0.07, recall 0.82 and f1-score 0.13. Those numbers were produced with the scaler fit on the full dataset and an unstratified split; re-run the notebook to refresh them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make predictions\n",
    "y_pred = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display accuracy\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f'Accuracy: {accuracy:.4f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display classification report\n",
    "print('Classification Report:')\n",
    "print(classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display confusion matrix\n",
    "cm = confusion_matrix(y_test, y_pred)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Not Fraud', 'Fraud'])\n",
    "disp.plot(cmap=plt.cm.Blues, ax=ax)\n",
    "ax.set_title('Confusion Matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Export\n",
    "\n",
    "The model expects inputs transformed by the saved scaler, so the two files must be loaded and used together. Both are pickle-based: only load them from a trusted location."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the model\n",
    "joblib.dump(model, MODEL_PATH)\n",
    "\n",
    "# Save the scaler\n",
    "joblib.dump(scaler, SCALER_PATH)\n",
    "\n",
    "print('Model and scaler saved successfully!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}