Add cc_features and synthetic_data modules
- sml/cc_features.py +146 -0
- sml/synthetic_data.py +419 -0
sml/cc_features.py
ADDED
@@ -0,0 +1,146 @@
import pandas as pd
import numpy as np

from datetime import datetime
from math import radians

def card_owner_age(trans_df: pd.DataFrame, profiles_df: pd.DataFrame) -> pd.DataFrame:
    """Add an `age_at_transaction` column computed from the card owner's birthdate.

    Used only in feature pipelines (not online inference).
    Unit test with DataFrames and sample data.
    """
    age_df = trans_df.merge(profiles_df, on="cc_num", how="left")
    trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y")
    return trans_df

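# A minimal sketch of the unit test suggested in the docstring above (the sample
# values below are hypothetical, not part of the original module). It assumes a
# pandas/numpy version that still supports dividing by np.timedelta64(1, "Y"),
# as `card_owner_age` itself does.
def _test_card_owner_age_sketch():
    trans = pd.DataFrame({"cc_num": [1111], "datetime": [pd.Timestamp("2022-01-01")]})
    profiles = pd.DataFrame({"cc_num": [1111], "birthdate": [pd.Timestamp("1990-01-01")]})
    out = card_owner_age(trans, profiles)
    assert 31.9 < out.loc[0, "age_at_transaction"] < 32.1
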
def expiry_days(trans_df: pd.DataFrame, credit_cards_df: pd.DataFrame) -> pd.DataFrame:
    """Add a `days_until_card_expires` column from the card's expiry date.

    Used only in feature pipelines (not online inference).
    Unit test with DataFrames and sample data.
    """
    card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left")
    card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y")
    trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D")
    return trans_df

def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float) -> float:
    """Compute the Haversine central angle between consecutive (long, lat) coordinates.

    Series inputs are assumed to already be in radians (see `activity_level`);
    scalar inputs are converted here. The return value is the central angle in
    radians; multiply by the Earth's radius to get a distance.
    """
    if not isinstance(long, pd.Series):
        long = radians(long)
    if not isinstance(lat, pd.Series):
        lat = radians(lat)
    if not isinstance(prev_long, pd.Series):
        prev_long = radians(prev_long)
    if not isinstance(prev_lat, pd.Series):
        prev_lat = radians(prev_lat)

    long_diff = prev_long - long
    lat_diff = prev_lat - lat

    a = np.sin(lat_diff/2.0)**2
    b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2
    c = 2*np.arcsin(np.sqrt(a + b))

    return c

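# Hedged usage sketch (not in the original commit): the central angle for two
# scalar coordinate pairs, scaled by an approximate Earth radius of 6371 km.
# Stockholm -> Copenhagen is roughly 520 km.
def _example_haversine_km():
    angle = haversine_distance(18.07, 59.33, 12.57, 55.68)
    return angle * 6371.0
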
def time_delta(prev_datetime, current_datetime):
    """Compute the time difference between consecutive transactions (as a Timedelta)."""
    return prev_datetime - current_datetime

def time_delta_to_days(time_delta: pd.Timedelta) -> float:
    """Convert a time delta to a fractional number of days."""
    return time_delta.total_seconds() / 86400

def date_to_timestamp(date_obj: datetime) -> int:
    """Convert a datetime to a millisecond epoch timestamp."""
    return int(date_obj.timestamp() * 1000)

def timestamp_to_date(timestamp: int) -> datetime:
    """Convert a millisecond epoch timestamp back to a datetime (second precision)."""
    return datetime.fromtimestamp(timestamp // 1000)

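# Hedged sanity check (not in the original commit): the millisecond round trip
# truncates sub-second precision but preserves the wall-clock second.
def _example_timestamp_round_trip():
    now = datetime(2022, 1, 1, 12, 30, 15)
    assert timestamp_to_date(date_to_timestamp(now)) == now
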
def activity_level(trans_df: pd.DataFrame, lag: int) -> pd.DataFrame:
    """Add lagged location-delta and time-delta features for each credit card."""

    # Convert coordinates into radians:
    trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians)

    trans_df.sort_values(["datetime", "cc_num"], inplace=True)

    # When we call `haversine_distance`, we want to pass as params the long/lat of the
    # current row and the long/lat of the most recent prior purchase. By grouping the DF
    # by cc_num, apart from the first transaction (which will be NaN and is filled with 0
    # at the end), we can access the lagged row using pandas' `shift` operation, which
    # gives you the row `lag` positions away (long/lat).
    trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\
        .apply(lambda x: haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\
        .reset_index(level=0, drop=True)\
        .fillna(0)

    # Use the same `shift` operation in pandas to get the lagged row for a given cc_num.
    trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\
        .apply(lambda x: time_delta(x["datetime"].shift(-lag), x["datetime"]))\
        .reset_index(level=0, drop=True)
    # .fillna(0) # handle the first datetime, which has no previous row when you call `shift`

    # Convert time_delta from seconds to days.
    trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(time_delta_to_days)
    trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0)
    trans_df = trans_df[["tid", "datetime", "cc_num", "category", "amount", "city", "country",
                         "age_at_transaction", "days_until_card_expires",
                         f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]]
    # Convert datetime to timestamp because of a problem with UTC. Hopsworks assumes you
    # use UTC, but if your Python environment does not, the datetime will be wrong. With
    # timestamps, we avoid the UTC problems when performing PIT joins.
    trans_df.datetime = trans_df.datetime.map(date_to_timestamp)
    return trans_df

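# Hedged toy example (not in the original commit) of the groupby + shift pattern
# used above: within each group, shift(-1) pairs every row with the next row,
# and the unmatched row becomes NaN.
def _example_groupby_shift():
    df = pd.DataFrame({"cc_num": [1, 1, 2], "amount": [10.0, 20.0, 30.0]})
    shifted = df.groupby("cc_num")["amount"].shift(-1)
    assert shifted.tolist()[0] == 20.0 and pd.isna(shifted.tolist()[1])
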
def aggregate_activity_by_hour(trans_df: pd.DataFrame, window_len) -> pd.DataFrame:
    """Compute per-card rolling aggregates: volume mean/std, frequency, location delta."""

    cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime")

    # Moving average of transaction volume.
    df_mavg = pd.DataFrame(cc_group.mean())
    df_mavg.columns = ["trans_volume_mavg", "datetime"]
    df_mavg = df_mavg.reset_index(level=["cc_num"])
    df_mavg = df_mavg.drop(columns=["cc_num", "datetime"])
    df_mavg = df_mavg.sort_index()

    # Moving standard deviation of transaction volume.
    df_std = pd.DataFrame(cc_group.std())
    df_std.columns = ["trans_volume_mstd", "datetime"]
    df_std = df_std.reset_index(level=["cc_num"])
    df_std = df_std.drop(columns=["cc_num", "datetime"])
    df_std = df_std.fillna(0)
    df_std = df_std.sort_index()
    window_aggs_df = df_std.merge(df_mavg, left_index=True, right_index=True)

    # Moving count of transactions (transaction frequency).
    df_count = pd.DataFrame(cc_group.count())
    df_count.columns = ["trans_freq", "datetime"]
    df_count = df_count.reset_index(level=["cc_num"])
    df_count = df_count.drop(columns=["cc_num", "datetime"])
    df_count = df_count.sort_index()
    window_aggs_df = window_aggs_df.merge(df_count, left_index=True, right_index=True)

    # Moving average of location difference between consecutive transactions.
    cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean()
    df_loc_delta_mavg = pd.DataFrame(cc_group)
    df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"]
    df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"])
    df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"])
    df_loc_delta_mavg = df_loc_delta_mavg.sort_index()
    window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg, left_index=True, right_index=True)

    window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(), left_index=True, right_index=True)

    return window_aggs_df

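# Hedged usage sketch (not in the original commit). `window_len` is passed
# straight to DataFrame.rolling, so it can be a time-offset string such as "4h"
# when `datetime` is datetime-like, or a fixed row count otherwise. Since
# `activity_level` converts `datetime` to epoch milliseconds at the end, a
# row-count window is the safe choice on its output:
#
#     trans_df = activity_level(trans_df, lag=1)
#     window_aggs_df = aggregate_activity_by_hour(trans_df, window_len=24)
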
sml/synthetic_data.py
ADDED
@@ -0,0 +1,419 @@
#!/usr/bin/env python
# coding: utf-8
# %%
from collections import defaultdict
from faker import Faker
import pandas as pd
import numpy as np
import datetime
import hashlib
import random
import math
import bisect
from typing import Optional, Dict, List

# Seed for reproducibility
faker = Faker()
faker.seed_locale('en_US', 0)


def set_random_seed(seed: int):
    """Seed all random number generators used by this module."""
    random.seed(seed)
    np.random.seed(seed)
    faker.seed_instance(seed)

set_random_seed(12345)


TOTAL_UNIQUE_USERS = 1000
TOTAL_UNIQUE_TRANSACTIONS = 54000
CASH_WITHRAWAL_CARDS_TOTAL = 2000
TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200
ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10]
NORMAL_ATM_RADIUS = 0.01
START_DATE = '2022-01-01 00:00:00'
END_DATE = '2022-03-01 00:00:00'
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

AMOUNT_DISTRIBUTION_PERCENTAGES = {
    0.05: (0.01, 1.01),
    0.075: (1, 11.01),
    0.525: (10, 100.01),
    0.25: (100, 1000.01),
    0.099: (1000, 10000.01),
    0.001: (10000, 30000.01)
}

CATEGORY_PERC_PRICE = {
    "Grocery": (0.5, 0.01, 100),
    "Restaurant/Cafeteria": (0.2, 1, 100),
    "Health/Beauty": (0.1, 10, 500.01),
    "Domestic Transport": (0.1, 10, 100.01),
    "Clothing": (0.05, 10, 2000.01),
    "Electronics": (0.02, 100, 10000.01),
    "Sports/Outdoors": (0.015, 10, 100.01),
    "Holliday/Travel": (0.014, 10, 100.01),
    "Jewelery": (0.001, 10, 100.01)
}

FRAUD_RATIO = 0.0025  # percentage of transactions that are fraudulent
NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS)
ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10]

# A list of (percentage, age span) pairs rather than a dict: several spans share
# the same percentage, and duplicate dict keys would silently overwrite each other.
SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = [
    (0.055, (17, 24)),
    (0.0015, (24, 34)),
    (0.0015, (34, 44)),
    (0.02, (44, 54)),
    (0.022, (54, 64)),
    (0.1, (64, 74)),
    (0.40, (74, 84)),
    (0.40, (84, 100)),
]


def generate_unique_credit_card_numbers(n: int) -> pd.Series:
    """Generate up to `n` unique credit card numbers (a set drops any Faker duplicates)."""
    cc_ids = set()
    for _ in range(n):
        cc_id = faker.credit_card_number(card_type='visa')
        cc_ids.add(cc_id)
    return pd.Series(list(cc_ids))

# Write a pytest:
#   assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS
#   assert len(credit_card_numbers[0]) == 16  # validate that generated numbers are 16-digit

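# A minimal sketch of the pytest suggested above (not in the original commit).
# Note that the set-based implementation can return fewer than `n` numbers if
# Faker ever repeats itself, so the length check is really an upper bound.
def _test_credit_card_numbers_sketch():
    numbers = generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS)
    assert len(numbers) <= TOTAL_UNIQUE_USERS
    assert all(len(num) == 16 for num in numbers)
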
def generate_list_credit_card_numbers() -> list:
    """Generate credit card dicts with number, provider, and expiry date."""
    credit_cards = []
    credit_card_numbers = generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS)
    delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT)
    # Allow expiry dates starting roughly two years before the simulation start.
    delta_time_object = delta_time_object + datetime.timedelta(days=-728)
    for cc_num in credit_card_numbers:
        credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")})
    return credit_cards

def generate_df_with_profiles(credit_cards: list) -> pd.DataFrame:
    """Generate one owner profile per credit card, re-drawing until the age is 18-100."""
    profiles = []
    for credit_card in credit_cards:
        address = faker.local_latlng(country_code='US')
        age = 0
        profile = None
        while age < 18 or age > 100:
            profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate'])
            dday = profile['birthdate']
            delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day)
            age = int(delta.days / 365)
        profile['City'] = address[2]
        profile['Country'] = address[3]
        profile['cc_num'] = credit_card['cc_num']
        credit_card['age'] = age
        profiles.append(profile)

    # Cast the columns to the correct pandas dtype
    profiles_df = pd.DataFrame.from_records(profiles)
    profiles_df['birthdate'] = pd.to_datetime(profiles_df['birthdate'])
    profiles_df['cc_num'] = pd.to_numeric(profiles_df['cc_num'])

    return profiles_df

# pytest: assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS
def generate_timestamps(n: int) -> list:
    """Return a sorted list of `n` timestamps between START_DATE and END_DATE."""
    start = datetime.datetime.strptime(START_DATE, DATE_FORMAT)
    end = datetime.datetime.strptime(END_DATE, DATE_FORMAT)
    timestamps = list()
    for _ in range(n):
        timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT)
        timestamps.append(timestamp)
    timestamps = sorted(timestamps)
    return timestamps

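# A minimal sketch of the pytest suggested above (not in the original commit):
# the list has the requested length and is sorted, which the lexicographic
# DATE_FORMAT guarantees matches chronological order.
def _test_generate_timestamps_sketch():
    timestamps = generate_timestamps(10)
    assert len(timestamps) == 10
    assert timestamps == sorted(timestamps)
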
def get_random_transaction_amount(start: float, end: float) -> float:
    """Draw a transaction amount uniformly from [start, end), rounded to cents."""
    amt = round(np.random.uniform(start, end), 2)
    return amt

def generate_amounts() -> list:
    """Generate transaction amounts following AMOUNT_DISTRIBUTION_PERCENTAGES."""
    amounts = []
    for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items():
        n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage)
        start, end = span
        for _ in range(n):
            amounts.append(get_random_transaction_amount(start, end + 1))
    return amounts

def generate_categories(amounts) -> list:
    """Assign a category to each transaction, sampling amounts within the category's price span."""
    # bisect requires a sorted list; the amounts are generated span by span and
    # are only randomly ordered within each span.
    amounts = sorted(amounts)
    categories = []
    for category, category_perc_price in CATEGORY_PERC_PRICE.items():
        percentage, min_price, max_price = category_perc_price
        n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage)
        for _ in range(n):
            min_price_i = bisect.bisect_left(amounts, min_price)
            max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i)
            categories.append({"category": category, "amount": random.choice(amounts[min_price_i:max_price_i])})

    random.shuffle(categories)
    return categories

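# Hedged toy example (not in the original commit) of the bisect windowing used
# above: bisect_left/bisect_right bracket the slice of a sorted list that falls
# inside a price span.
def _example_bisect_window():
    amounts = [0.5, 2.0, 15.0, 80.0, 250.0]
    lo = bisect.bisect_left(amounts, 10)    # -> 2
    hi = bisect.bisect_right(amounts, 100)  # -> 4
    assert amounts[lo:hi] == [15.0, 80.0]
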
def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str:
    """Derive a deterministic transaction id by hashing timestamp, card number, and amount."""
    hashable = f'{timestamp}{credit_card_number}{transaction_amount}'
    hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest()
    return hexdigest

def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list:
    """Generate one (non-fraudulent) transaction dict per timestamp/category pair."""
    transactions = []
    for timestamp, category in zip(timestamps, categories):
        credit_card_number = random.choice(credit_card_numbers)
        point_of_tr = faker.local_latlng(country_code='US')
        transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount'])
        transactions.append({
            'tid': transaction_id,
            'datetime': timestamp,
            'cc_num': credit_card_number,
            'category': category['category'],
            'amount': category['amount'],
            'latitude': point_of_tr[0],
            'longitude': point_of_tr[1],
            'city': point_of_tr[2],
            'country': point_of_tr[3],
            'fraud_label': 0
        })
    return transactions

def generate_cash_amounts() -> list:
    """Generate ATM withdrawal amounts following AMOUNT_DISTRIBUTION_PERCENTAGES."""
    cash_amounts = []
    for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items():
        n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage)
        start, end = span
        for _ in range(n):
            cash_amounts.append(get_random_transaction_amount(start, end + 1))
    return cash_amounts

def generate_chains():
    """Pick random transactions and chain fraudulent follow-ups onto them.

    Returns a dict mapping a chain-head transaction index to the list of
    transaction indices that make up the rest of its attack chain.
    """
    visited = set()
    chains = defaultdict(list)

    def size(chains: dict) -> int:
        # Each chain contributes its members plus the chain head itself.
        counts = {key: len(values) + 1 for (key, values) in chains.items()}
        return sum(counts.values())

    def generate_attack_chain(i: int):
        chain_length = random.choice(ATTACK_CHAIN_LENGTHS)
        for j in range(1, chain_length):
            if i + j not in visited:
                if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS:
                    break
                chains[i].append(i + j)
                visited.add(i + j)

    while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS:
        i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS))
        if i not in visited:
            generate_attack_chain(i)
            visited.add(i)
    return chains

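# Hedged illustration (not in the original commit) of the structure returned by
# generate_chains(): chain heads map to the consecutive transaction indices that
# will be rewritten as fraudulent; the indices below are made up, e.g.
#
#     {1207: [1208, 1209, 1210], 4031: [4032, 4033, 4034, 4035, 4036]}
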
def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int,
                            delta: int, radius: Optional[float] = None, country_code='US') -> List[Dict]:
    """Generate a sequence of ATM withdrawals `delta` hours apart around one location."""
    atms = []
    if length < 1:
        raise ValueError('length must be > 0')

    start = datetime.datetime.strptime(START_DATE, DATE_FORMAT)
    end = datetime.datetime.strptime(END_DATE, DATE_FORMAT)
    timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None)
    point_of_tr = faker.local_latlng(country_code=country_code)
    latitude = point_of_tr[0]
    longitude = point_of_tr[1]
    city = point_of_tr[2]
    for _ in range(length):
        current = timestamp + datetime.timedelta(hours=delta)
        if radius is not None:
            # Jitter the coordinates within `radius` degrees of the previous point.
            latitude = faker.coordinate(latitude, radius)
            longitude = faker.coordinate(longitude, radius)
        amount = random.sample(cash_amounts, 1)[0]
        transaction_id = generate_transaction_id(timestamp, credit_card_number, amount)
        atms.append({'tid': transaction_id,
                     'datetime': current.strftime(DATE_FORMAT),
                     'cc_num': credit_card_number,
                     'category': 'Cash Withdrawal',
                     'amount': amount,
                     'latitude': latitude,
                     'longitude': longitude,
                     'city': city,
                     'country': 'US',
                     'fraud_label': 0
                     })
        timestamp = current
    return atms

def generate_susceptible_cards(credit_cards: list) -> list:
    """Select cards susceptible to fraud, following the age distribution above."""
    susceptible_cards = []
    visited_cards = []
    for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE:
        n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage)  # TODO: here total expected fraud
        start, end = span
        for _ in range(n):
            for card in credit_cards:
                if card['age'] > start and card['age'] < end:
                    if card['cc_num'] not in visited_cards:
                        current = card
                        visited_cards.append(card['cc_num'])
                        break
            else:
                current = None
            if current is not None:
                susceptible_cards.append(current)
    return susceptible_cards

def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list:
    """Generate sequences of normal ATM withdrawals for a sample of susceptible cards."""
    normal_atm_withdrawals = []
    atm_transactions = len(cash_amounts)
    # Sample as many cards as possible without exceeding CASH_WITHRAWAL_CARDS_TOTAL.
    cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL // (CASH_WITHRAWAL_CARDS_TOTAL // len(susceptible_cards) + 1))
    atm_count = 0
    while atm_count < atm_transactions:
        for card in cash_withdrawal_cards:
            for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH:
                # Interval in hours between normal cash withdrawals.
                delta = random.randint(6, 168)
                atm_tr = generate_atm_withdrawal(credit_card_number=card['cc_num'], cash_amounts=cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius=NORMAL_ATM_RADIUS)
                normal_atm_withdrawals.append(atm_tr)
                atm_count += ATM_WITHRAWAL_SEQ
    return normal_atm_withdrawals


def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list:
    """Generate `chain_length` timestamps spaced 30-120 seconds after `timestamp`."""
    timestamps = []
    timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT)
    for _ in range(chain_length):
        # Interval in seconds between fraudulent attacks.
        delta = random.randint(30, 120)
        current = timestamp + datetime.timedelta(seconds=delta)
        timestamps.append(current.strftime(DATE_FORMAT))
        timestamp = current
    return timestamps

def generate_amounts_for_fraud_attacks(chain_length: int) -> list:
    """Generate `chain_length` amounts following AMOUNT_DISTRIBUTION_PERCENTAGES."""
    amounts = []
    for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items():
        n = math.ceil(chain_length * percentage)
        start, end = span
        for _ in range(n):
            amounts.append(get_random_transaction_amount(start, end + 1))
    return amounts[:chain_length]


def update_transactions(transactions: list, chains: dict) -> None:
    """Mark chain heads as fraudulent and rewrite their chained transactions in place."""
    for key, chain in chains.items():
        transaction = transactions[key]
        timestamp = transaction['datetime']
        cc_num = transaction['cc_num']
        amount = transaction['amount']
        transaction['fraud_label'] = 1
        inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain))
        inject_amounts = generate_amounts_for_fraud_attacks(len(chain))
        random.shuffle(inject_amounts)
        for i, idx in enumerate(chain):
            original_transaction = transactions[idx]
            inject_timestamp = inject_timestamps[i]
            original_transaction['datetime'] = inject_timestamp
            original_transaction['fraud_label'] = 1
            original_transaction['cc_num'] = cc_num
            original_transaction['amount'] = inject_amounts[i]
            # Pick the first category whose price span contains the injected amount.
            original_transaction['category'] = [
                category
                for category, (perc, min_price, max_price) in CATEGORY_PERC_PRICE.items()
                if int(inject_amounts[i]) in range(int(min_price), int(max_price))
            ][0]
            original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount)
            transactions[idx] = original_transaction

def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list:
    """Pick which ATM withdrawal sequences will receive a fraudulent follow-up."""
    return random.sample(range(len(normal_atm_withdrawals)),
                         int(FRAUD_RATIO * len(normal_atm_withdrawals)))

def update_normal_atm_withdrawals(fraudulent_atm_tr_indxs: list, normal_atm_withdrawals: list,
                                  cash_amounts: list):
    """Append a foreign, fraudulent withdrawal shortly after selected normal sequences."""
    for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs:
        # Interval in hours between the last normal withdrawal and the fraudulent one.
        delta = random.randint(1, 5)
        atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx]
        pre_fraudulent_atm_tr = atm_withdrawal[0]
        fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number=pre_fraudulent_atm_tr['cc_num'],
                                                    cash_amounts=cash_amounts, length=1, delta=delta, radius=None)[0]
        # Place the fraudulent withdrawal outside the US.
        fraudulent_atm_location = faker.location_on_land()
        while fraudulent_atm_location[3] == 'US':
            fraudulent_atm_location = faker.location_on_land()
        fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'],
                                                                    DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT)
        fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0]
        fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1]
        fraudulent_atm_tr['city'] = fraudulent_atm_location[2]
        fraudulent_atm_tr['country'] = fraudulent_atm_location[3]
        fraudulent_atm_tr['fraud_label'] = 1
        atm_withdrawal.append(fraudulent_atm_tr)
        normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal


def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame:
    """Flatten the ATM withdrawal sequences into the transaction list and build a DataFrame."""
    for atm_withdrawal in normal_atm_withdrawals:
        for withdrawal in atm_withdrawal:
            transactions.append(withdrawal)
    return pd.DataFrame.from_records(transactions)


def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame:
    """Build the credit cards DataFrame with numeric card numbers."""
    df = pd.DataFrame.from_records(credit_cards)
    # Cast the columns to the correct pandas dtype
    df['cc_num'] = pd.to_numeric(df['cc_num'])
    return df

def create_profiles_as_df(credit_cards: list) -> pd.DataFrame:
    """Build the card-owner profiles DataFrame."""
    profiles_df = generate_df_with_profiles(credit_cards)
    return profiles_df

def create_transactions_as_df(credit_cards: list) -> pd.DataFrame:
    """Generate the full transactions DataFrame, including injected fraud."""
    timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS)
    amounts = generate_amounts()
    categories = generate_categories(amounts)
    cc_df = create_credit_cards_as_df(credit_cards)
    transactions = generate_transactions(cc_df['cc_num'], timestamps, categories)
    cash_amounts = generate_cash_amounts()
    chains = generate_chains()
    susceptible_cards = generate_susceptible_cards(credit_cards)
    normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards)
    update_transactions(transactions, chains)

    fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals)
    update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts)

    transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals)

    # Cast the columns to the correct pandas dtype
    transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num'])
    transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude'])
    transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude'])
    transactions_df['datetime'] = pd.to_datetime(transactions_df['datetime'])

    return transactions_df

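# Hedged end-to-end usage sketch (not in the original commit); the output file
# name is illustrative only. Note the ordering: create_profiles_as_df fills in
# each card's age, which generate_susceptible_cards (called inside
# create_transactions_as_df) depends on.
#
#     credit_cards = generate_list_credit_card_numbers()
#     profiles_df = create_profiles_as_df(credit_cards)   # also fills in card ages
#     trans_df = create_transactions_as_df(credit_cards)  # relies on those ages
#     trans_df.to_csv("transactions.csv", index=False)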