feranzie committed on
Commit
ce3666c
·
1 Parent(s): 35881d2
Files changed (2)
  1. sml/cc_features.py +146 -0
  2. sml/synthetic_data.py +419 -0
sml/cc_features.py ADDED
@@ -0,0 +1,146 @@
+ import pandas as pd
+ import numpy as np
+
+ from datetime import datetime, timedelta
+
+ # +
+ def card_owner_age(trans_df: pd.DataFrame, profiles_df: pd.DataFrame) -> pd.DataFrame:
+     """Compute the age of the card owner at transaction time.
+     Used only in feature pipelines (not online inference).
+     Unit test with DataFrames and sample data.
+     """
+     age_df = trans_df.merge(profiles_df, on="cc_num", how="left")
+     trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y")
+     return trans_df
+
+ def expiry_days(trans_df: pd.DataFrame, credit_cards_df: pd.DataFrame) -> pd.DataFrame:
+     """Compute the number of days until the card expires at transaction time.
+     Used only in feature pipelines (not online inference).
+     Unit test with DataFrames and sample data.
+     """
+     card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left")
+     card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y")
+     trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D")
+     return trans_df
+
+
+ # -
+
+ def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float) -> float:
+     """Compute the Haversine distance between consecutive coordinates in (long, lat).
+
+     Accepts scalars or pd.Series in degrees and returns the central angle;
+     multiply by the Earth's radius to get a physical distance.
+     """
+     # np.radians converts degrees to radians for scalars and pd.Series alike.
+     long = np.radians(long)
+     lat = np.radians(lat)
+     prev_long = np.radians(prev_long)
+     prev_lat = np.radians(prev_lat)
+
+     long_diff = prev_long - long
+     lat_diff = prev_lat - lat
+
+     a = np.sin(lat_diff/2.0)**2
+     b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2
+     c = 2*np.arcsin(np.sqrt(a + b))
+
+     return c
+
+
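+ # A hypothetical sanity check (not part of the committed module): the function
+ # returns the central angle in radians, so multiply by the Earth's mean radius
+ # (~6371 km) to get kilometres; New York -> Los Angeles comes out near 3.9e3 km.
+ # nyc_to_la_km = haversine_distance(-73.99, 40.73, -118.24, 34.05) * 6371
+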
+ def time_delta(prev_datetime: pd.Series, current_datetime: pd.Series) -> pd.Series:
+     """Compute the time elapsed between consecutive transactions."""
+     return current_datetime - prev_datetime
+
+ def time_delta_to_days(time_delta: timedelta) -> float:
+     """Convert a timedelta to a (fractional) number of days."""
+     return time_delta.total_seconds() / 86400
+
+ def date_to_timestamp(date_obj: datetime) -> int:
+     """Convert a datetime to a Unix timestamp in milliseconds."""
+     return int(date_obj.timestamp() * 1000)
+
+ def timestamp_to_date(timestamp: int) -> datetime:
+     """Convert a millisecond Unix timestamp back to a datetime."""
+     return datetime.fromtimestamp(timestamp // 1000)
+
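+ # Hypothetical example: the two timestamp helpers round-trip at second
+ # resolution (the millisecond remainder is dropped by the integer division),
+ # and the exact integer depends on the local timezone.
+ # ts = date_to_timestamp(datetime(2022, 1, 1, 12, 0, 0))
+ # timestamp_to_date(ts)  # -> datetime(2022, 1, 1, 12, 0, 0)
+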
+ def activity_level(trans_df: pd.DataFrame, lag: int) -> pd.DataFrame:
+     """Add lagged location-delta and time-delta features per credit card."""
+
+     trans_df.sort_values(["datetime", "cc_num"], inplace=True)
+
+     # When we call `haversine_distance`, we want to pass as params the long/lat of the
+     # current row and the long/lat of the most recent prior purchase. By grouping the DF
+     # by cc_num we can access the previous row with Pandas' `shift` operation; the first
+     # transaction of each card has no previous row, so its NaN is filled with 0 at the
+     # end. (`haversine_distance` converts degrees to radians itself.)
+     trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\
+         .apply(lambda x: haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(lag), x["latitude"].shift(lag)))\
+         .reset_index(level=0, drop=True)\
+         .fillna(0)
+
+     # Use the same `shift` operation in Pandas to get the previous row for a given cc_num.
+     trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\
+         .apply(lambda x: time_delta(x["datetime"].shift(lag), x["datetime"]))\
+         .reset_index(level=0, drop=True)
+
+     # Convert time_delta from a timedelta to days; the first transaction of each card
+     # has no previous row when you call `shift`, so fill its NaN with 0.
+     trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(time_delta_to_days)
+     trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0)
+     trans_df = trans_df[["tid", "datetime", "cc_num", "category", "amount", "city", "country", "age_at_transaction",
+                          "days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]]
+     # Convert datetime to a timestamp because of a UTC problem: Hopsworks assumes UTC,
+     # so if your Python environment is not on UTC the datetime will be wrong. With
+     # timestamps we don't have the UTC problem when performing PIT joins.
+     trans_df.datetime = trans_df.datetime.map(date_to_timestamp)
+     return trans_df
+
+
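+ # Hypothetical pipeline usage, assuming the DataFrames from the synthetic data
+ # generator: age and expiry features must be added before `activity_level`,
+ # since it selects those columns at the end.
+ # trans_df = card_owner_age(trans_df, profiles_df)
+ # trans_df = expiry_days(trans_df, credit_cards_df)
+ # trans_df = activity_level(trans_df, lag=1)
+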
+ def aggregate_activity_by_hour(trans_df: pd.DataFrame, window_len) -> pd.DataFrame:
+     """Compute rolling-window aggregates (volume, frequency, location delta) per credit card."""
+
+     cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime")
+
+     # Moving average of transaction volume.
+     df_mavg = pd.DataFrame(cc_group.mean())
+     df_mavg.columns = ["trans_volume_mavg", "datetime"]
+     df_mavg = df_mavg.reset_index(level=["cc_num"])
+     df_mavg = df_mavg.drop(columns=["cc_num", "datetime"])
+     df_mavg = df_mavg.sort_index()
+
+     # Moving standard deviation of transaction volume (NaN for single-row windows, hence fillna).
+     df_std = pd.DataFrame(cc_group.std())
+     df_std.columns = ["trans_volume_mstd", "datetime"]
+     df_std = df_std.reset_index(level=["cc_num"])
+     df_std = df_std.drop(columns=["cc_num", "datetime"])
+     df_std = df_std.fillna(0)
+     df_std = df_std.sort_index()
+     window_aggs_df = df_std.merge(df_mavg, left_index=True, right_index=True)
+
+     # Moving count of transactions, i.e. transaction frequency.
+     df_count = pd.DataFrame(cc_group.count())
+     df_count.columns = ["trans_freq", "datetime"]
+     df_count = df_count.reset_index(level=["cc_num"])
+     df_count = df_count.drop(columns=["cc_num", "datetime"])
+     df_count = df_count.sort_index()
+     window_aggs_df = window_aggs_df.merge(df_count, left_index=True, right_index=True)
+
+     # Moving average of location difference between consecutive transactions.
+     cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean()
+     df_loc_delta_mavg = pd.DataFrame(cc_group)
+     df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"]
+     df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"])
+     df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"])
+     df_loc_delta_mavg = df_loc_delta_mavg.sort_index()
+     window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg, left_index=True, right_index=True)
+
+     window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(), left_index=True, right_index=True)
+
+     return window_aggs_df
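+
+ # Hypothetical usage: a 4-hour rolling window, assuming `datetime` is still a
+ # datetime64 column (i.e. before `activity_level` converts it to a timestamp).
+ # window_aggs_df = aggregate_activity_by_hour(trans_df, window_len="4h")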
sml/synthetic_data.py ADDED
@@ -0,0 +1,419 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+ # %%
+ from collections import defaultdict
+ from faker import Faker
+ import pandas as pd
+ import numpy as np
+ import datetime
+ import hashlib
+ import random
+ import math
+ import bisect
+ from typing import Dict, List
+
+ # Seed for reproducibility
+ faker = Faker()
+ faker.seed_locale('en_US', 0)
+
+
+ def set_random_seed(seed: int):
+     random.seed(seed)
+     np.random.seed(seed)
+     faker.seed_instance(seed)
+
+ set_random_seed(12345)
+
+
+ TOTAL_UNIQUE_USERS = 1000
+ TOTAL_UNIQUE_TRANSACTIONS = 54000
+ CASH_WITHDRAWAL_CARDS_TOTAL = 2000
+ TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200
+ ATM_WITHDRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10]
+ NORMAL_ATM_RADIUS = 0.01
+ START_DATE = '2022-01-01 00:00:00'
+ END_DATE = '2022-03-01 00:00:00'
+ DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
+
+ # share of transactions: (min amount, max amount)
+ AMOUNT_DISTRIBUTION_PERCENTAGES = {
+     0.05: (0.01, 1.01),
+     0.075: (1, 11.01),
+     0.525: (10, 100.01),
+     0.25: (100, 1000.01),
+     0.099: (1000, 10000.01),
+     0.001: (10000, 30000.01)
+ }
+
+ # category: (share of transactions, min price, max price)
+ CATEGORY_PERC_PRICE = {
+     "Grocery": (0.5, 0.01, 100),
+     "Restaurant/Cafeteria": (0.2, 1, 100),
+     "Health/Beauty": (0.1, 10, 500.01),
+     "Domestic Transport": (0.1, 10, 100.01),
+     "Clothing": (0.05, 10, 2000.01),
+     "Electronics": (0.02, 100, 10000.01),
+     "Sports/Outdoors": (0.015, 10, 100.01),
+     "Holiday/Travel": (0.014, 10, 100.01),
+     "Jewelry": (0.001, 10, 100.01)
+ }
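+
+ # Hypothetical sanity check (e.g. for a pytest): both distributions should
+ # cover 100% of transactions.
+ # assert abs(sum(AMOUNT_DISTRIBUTION_PERCENTAGES) - 1.0) < 1e-9
+ # assert abs(sum(p for p, _, _ in CATEGORY_PERC_PRICE.values()) - 1.0) < 1e-9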
+
+ FRAUD_RATIO = 0.0025  # percentage of transactions that are fraudulent
+ NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS)
+ ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10]
+
+ # (percentage, age span) pairs; a list rather than a dict, because several
+ # spans share the same percentage and duplicate dict keys would silently
+ # overwrite each other.
+ SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = [
+     (0.055, (17, 24)),
+     (0.0015, (24, 34)),
+     (0.0015, (34, 44)),
+     (0.02, (44, 54)),
+     (0.022, (54, 64)),
+     (0.1, (64, 74)),
+     (0.40, (74, 84)),
+     (0.40, (84, 100)),
+ ]
+
+
+ def generate_unique_credit_card_numbers(n: int) -> pd.Series:
+     """Generate a Series of 'n' unique credit card numbers."""
+     cc_ids = set()
+     # Loop until the set holds n numbers, since Faker may produce duplicates.
+     while len(cc_ids) < n:
+         cc_id = faker.credit_card_number(card_type='visa')
+         cc_ids.add(cc_id)
+     return pd.Series(list(cc_ids))
+
+ # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS
+ # assert len(credit_card_numbers[0]) == 16  # validate that the generated number is 16 digits
+
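+ # A minimal pytest sketch for the assertions above (hypothetical, not committed):
+ # def test_generate_unique_credit_card_numbers():
+ #     nums = generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS)
+ #     assert len(nums) == TOTAL_UNIQUE_USERS
+ #     assert all(len(num) == 16 for num in nums)
+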
+ def generate_list_credit_card_numbers() -> list:
+     """Generate credit card dicts with a provider and an expiry date."""
+     credit_cards = []
+     credit_card_numbers = generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS)
+     # Cards may have been issued up to ~2 years before the start date.
+     delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT)
+     delta_time_object = delta_time_object + datetime.timedelta(days=-728)
+     for cc_num in credit_card_numbers:
+         credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")})
+     return credit_cards
+
+ def generate_df_with_profiles(credit_cards: list) -> pd.DataFrame:
+     """Generate one owner profile per credit card, with ages between 18 and 100."""
+     profiles = []
+     for credit_card in credit_cards:
+         address = faker.local_latlng(country_code='US')
+         age = 0
+         profile = None
+         # Resample until the profile's age falls in the allowed range.
+         while age < 18 or age > 100:
+             profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate'])
+             dday = profile['birthdate']
+             delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day)
+             age = int(delta.days / 365)
+         profile['City'] = address[2]
+         profile['Country'] = address[3]
+         profile['cc_num'] = credit_card['cc_num']
+         credit_card['age'] = age
+         profiles.append(profile)
+
+     # Cast the columns to the correct Pandas dtypes
+     profiles_df = pd.DataFrame.from_records(profiles)
+     profiles_df['birthdate'] = pd.to_datetime(profiles_df['birthdate'])
+     profiles_df['cc_num'] = pd.to_numeric(profiles_df['cc_num'])
+
+     return profiles_df
+
+ # pytest - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS
+ def generate_timestamps(n: int) -> list:
+     """Return a sorted list of 'n' random timestamps between START_DATE and END_DATE."""
+     start = datetime.datetime.strptime(START_DATE, DATE_FORMAT)
+     end = datetime.datetime.strptime(END_DATE, DATE_FORMAT)
+     timestamps = list()
+     for _ in range(n):
+         timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT)
+         timestamps.append(timestamp)
+     timestamps = sorted(timestamps)
+     return timestamps
+
+ def get_random_transaction_amount(start: float, end: float) -> float:
+     """Draw a transaction amount uniformly from [start, end), rounded to cents."""
+     amt = round(np.random.uniform(start, end), 2)
+     return amt
+
+ def generate_amounts() -> list:
+     """Generate transaction amounts following AMOUNT_DISTRIBUTION_PERCENTAGES."""
+     amounts = []
+     for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items():
+         n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage)
+         start, end = span
+         for _ in range(n):
+             amounts.append(get_random_transaction_amount(start, end+1))
+     return amounts
+
+ def generate_categories(amounts) -> list:
+     """Assign a category to each transaction amount, respecting the category's price band."""
+     # bisect requires a sorted sequence.
+     amounts = sorted(amounts)
+     categories = []
+     for category, category_perc_price in CATEGORY_PERC_PRICE.items():
+         percentage, min_price, max_price = category_perc_price
+         n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage)
+         for _ in range(n):
+             min_price_i = bisect.bisect_left(amounts, min_price)
+             max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i)
+             categories.append({"category": category, "amount": random.choice(amounts[min_price_i:max_price_i])})
+
+     random.shuffle(categories)
+     return categories
+
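+ # Hypothetical illustration: with amounts sorted, the two bisect calls bracket a
+ # category's price band, e.g. amounts=[5, 20, 50, 700] and "Grocery" (0.01-100)
+ # samples from [5, 20, 50].
+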
+ def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str:
+     """Derive a deterministic transaction id by hashing timestamp, card number and amount."""
+     hashable = f'{timestamp}{credit_card_number}{transaction_amount}'
+     hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest()
+     return hexdigest
+
+ def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list:
+     """Combine timestamps, categories and random cards/locations into transaction dicts."""
+     transactions = []
+     for timestamp, category in zip(timestamps, categories):
+         credit_card_number = random.choice(credit_card_numbers)
+         point_of_tr = faker.local_latlng(country_code='US')
+         transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount'])
+         transactions.append({
+             'tid': transaction_id,
+             'datetime': timestamp,
+             'cc_num': credit_card_number,
+             'category': category['category'],
+             'amount': category['amount'],
+             'latitude': point_of_tr[0],
+             'longitude': point_of_tr[1],
+             'city': point_of_tr[2],
+             'country': point_of_tr[3],
+             'fraud_label': 0
+         })
+     return transactions
+
+ def generate_cash_amounts() -> list:
+     """Generate ATM withdrawal amounts following AMOUNT_DISTRIBUTION_PERCENTAGES."""
+     cash_amounts = []
+     for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items():
+         n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage)
+         start, end = span
+         for _ in range(n):
+             cash_amounts.append(get_random_transaction_amount(start, end+1))
+     return cash_amounts
+
+ def generate_chains():
+     """Pick random seed transactions and chain follow-up indices to them as fraud attacks."""
+     visited = set()
+     chains = defaultdict(list)
+
+     def size(chains: dict) -> int:
+         # Each chain contributes its members plus the seed transaction itself.
+         counts = {key: len(values)+1 for (key, values) in chains.items()}
+         return sum(counts.values())
+
+     def generate_attack_chain(i: int):
+         chain_length = random.choice(ATTACK_CHAIN_LENGTHS)
+         for j in range(1, chain_length):
+             if i+j not in visited:
+                 if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS:
+                     break
+                 chains[i].append(i+j)
+                 visited.add(i+j)
+
+     while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS:
+         i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS))
+         if i not in visited:
+             generate_attack_chain(i)
+             visited.add(i)
+     return chains
+
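+ # Hypothetical illustration: `generate_chains()` returns something like
+ # {17: [18, 19, 20]}, meaning transaction 17 seeds an attack that also
+ # relabels transactions 18-20 as fraud (see `update_transactions` below).
+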
+ def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int,
+                             delta: int, radius: float = None, country_code='US') -> List[Dict]:
+     """Generate a sequence of 'length' ATM withdrawals, 'delta' hours apart, near one location."""
+     atms = []
+     if length <= 0:
+         raise ValueError('length must be > 0')
+
+     start = datetime.datetime.strptime(START_DATE, DATE_FORMAT)
+     end = datetime.datetime.strptime(END_DATE, DATE_FORMAT)
+     timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None)
+     point_of_tr = faker.local_latlng(country_code=country_code)
+     latitude = point_of_tr[0]
+     longitude = point_of_tr[1]
+     city = point_of_tr[2]
+     for _ in range(length):
+         current = timestamp + datetime.timedelta(hours=delta)
+         if radius is not None:
+             # Jitter the next withdrawal within 'radius' degrees of the previous one.
+             latitude = faker.coordinate(latitude, radius)
+             longitude = faker.coordinate(longitude, radius)
+         amount = random.sample(cash_amounts, 1)[0]
+         transaction_id = generate_transaction_id(timestamp, credit_card_number, amount)
+         atms.append({'tid': transaction_id,
+                      'datetime': current.strftime(DATE_FORMAT),
+                      'cc_num': credit_card_number,
+                      'category': 'Cash Withdrawal',
+                      'amount': amount,
+                      'latitude': latitude,
+                      'longitude': longitude,
+                      'city': city,
+                      'country': 'US',
+                      'fraud_label': 0
+                      })
+         timestamp = current
+     return atms
+
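+ # Hypothetical usage: a 3-withdrawal sequence, 24 hours apart, drifting within
+ # ~0.01 degrees of the first ATM.
+ # cash_amounts = generate_cash_amounts()
+ # seq = generate_atm_withdrawal('4111111111111111', cash_amounts, length=3,
+ #                               delta=24, radius=NORMAL_ATM_RADIUS)
+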
+ def generate_susceptible_cards(credit_cards: list) -> list:
+     """Select cards whose owners' ages make them susceptible to fraud."""
+     susceptible_cards = []
+     visited_cards = []
+     for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE:
+         n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage)  # TODO: here total expected fraud
+         start, end = span
+         for _ in range(n):
+             # Find the next unvisited card in this age span, if any.
+             for card in credit_cards:
+                 if start < card['age'] < end and card['cc_num'] not in visited_cards:
+                     current = card
+                     visited_cards.append(card['cc_num'])
+                     break
+             else:
+                 current = None
+             if current is not None:
+                 susceptible_cards.append(current)
+     return susceptible_cards
+
+ def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list:
+     """Generate sequences of legitimate ATM withdrawals on the susceptible cards."""
+     normal_atm_withdrawals = []
+     atm_transactions = len(cash_amounts)
+     # Sample a number of cards that never exceeds len(susceptible_cards).
+     cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHDRAWAL_CARDS_TOTAL//(CASH_WITHDRAWAL_CARDS_TOTAL//len(susceptible_cards)+1))
+     atm_count = 0
+     while atm_count < atm_transactions:
+         for card in cash_withdrawal_cards:
+             for seq_length in ATM_WITHDRAWAL_SEQ_LENGTH:
+                 # interval in hours between normal cash withdrawals
+                 delta = random.randint(6, 168)
+                 atm_tr = generate_atm_withdrawal(credit_card_number=card['cc_num'], cash_amounts=cash_amounts, length=seq_length, delta=delta, radius=NORMAL_ATM_RADIUS)
+                 normal_atm_withdrawals.append(atm_tr)
+                 atm_count += seq_length
+     return normal_atm_withdrawals
+
+
+ def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list:
+     """Generate 'chain_length' timestamps in quick succession after the seed transaction."""
+     timestamps = []
+     timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT)
+     for _ in range(chain_length):
+         # interval in seconds between fraudulent attacks
+         delta = random.randint(30, 120)
+         current = timestamp + datetime.timedelta(seconds=delta)
+         timestamps.append(current.strftime(DATE_FORMAT))
+         timestamp = current
+     return timestamps
+
+ def generate_amounts_for_fraud_attacks(chain_length: int) -> list:
+     """Generate 'chain_length' amounts following AMOUNT_DISTRIBUTION_PERCENTAGES."""
+     amounts = []
+     for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items():
+         n = math.ceil(chain_length * percentage)
+         start, end = span
+         for _ in range(n):
+             amounts.append(get_random_transaction_amount(start, end+1))
+     return amounts[:chain_length]
+
+
+ def update_transactions(transactions: list, chains: dict) -> None:
+     """Rewrite the chained transactions in place so they form fraud attacks."""
+     for key, chain in chains.items():
+         transaction = transactions[key]
+         timestamp = transaction['datetime']
+         cc_num = transaction['cc_num']
+         transaction['fraud_label'] = 1
+         inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain))
+         inject_amounts = generate_amounts_for_fraud_attacks(len(chain))
+         random.shuffle(inject_amounts)
+         for i, idx in enumerate(chain):
+             original_transaction = transactions[idx]
+             inject_timestamp = inject_timestamps[i]
+             original_transaction['datetime'] = inject_timestamp
+             original_transaction['fraud_label'] = 1
+             original_transaction['cc_num'] = cc_num
+             original_transaction['amount'] = inject_amounts[i]
+             # Pick the first category whose price band contains the injected amount,
+             # keeping the original category if none matches.
+             matching = [category for category, (_, min_price, max_price) in CATEGORY_PERC_PRICE.items()
+                         if min_price <= inject_amounts[i] < max_price]
+             if matching:
+                 original_transaction['category'] = matching[0]
+             original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, inject_amounts[i])
+             transactions[idx] = original_transaction
+
+ def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list:
+     """Pick the withdrawal sequences that will receive an injected fraudulent ATM withdrawal."""
+     return random.sample(range(len(normal_atm_withdrawals)),
+                          int(FRAUD_RATIO * len(normal_atm_withdrawals)))
+
+ def update_normal_atm_withdrawals(fraudulent_atm_tr_indxs: list, normal_atm_withdrawals: list,
+                                   cash_amounts: list):
+     """Append a fraudulent withdrawal from outside the US to the selected sequences."""
+     for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs:
+         # interval in hours between the first withdrawal and the fraudulent one
+         delta = random.randint(1, 5)
+         atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx]
+         pre_fraudulent_atm_tr = atm_withdrawal[0]
+         fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number=pre_fraudulent_atm_tr['cc_num'],
+                                                     cash_amounts=cash_amounts, length=1, delta=delta, radius=None)[0]
+         # Resample until the ATM location is outside the US.
+         fraudulent_atm_location = faker.location_on_land()
+         while fraudulent_atm_location[3] == 'US':
+             fraudulent_atm_location = faker.location_on_land()
+         fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'],
+                                          DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT)
+         fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0]
+         fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1]
+         fraudulent_atm_tr['city'] = fraudulent_atm_location[2]
+         fraudulent_atm_tr['country'] = fraudulent_atm_location[3]
+         fraudulent_atm_tr['fraud_label'] = 1
+         atm_withdrawal.append(fraudulent_atm_tr)
+         normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal
+
+
+ def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame:
+     """Flatten the ATM withdrawal sequences into the transaction list and return a DataFrame."""
+     for atm_withdrawal in normal_atm_withdrawals:
+         for withdrawal in atm_withdrawal:
+             transactions.append(withdrawal)
+     return pd.DataFrame.from_records(transactions)
+
+
+ def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame:
+     """Return the credit cards as a DataFrame."""
+     df = pd.DataFrame.from_records(credit_cards)
+     # Cast the columns to the correct Pandas dtypes
+     df['cc_num'] = pd.to_numeric(df['cc_num'])
+     return df
+
+ def create_profiles_as_df(credit_cards: list) -> pd.DataFrame:
+     """Return the owner profiles as a DataFrame (also attaches 'age' to each card)."""
+     profiles_df = generate_df_with_profiles(credit_cards)
+     return profiles_df
+
+ def create_transactions_as_df(credit_cards: list) -> pd.DataFrame:
+     """Generate the full transactions DataFrame, including injected fraud."""
+     timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS)
+     amounts = generate_amounts()
+     categories = generate_categories(amounts)
+     cc_df = create_credit_cards_as_df(credit_cards)
+     transactions = generate_transactions(cc_df['cc_num'], timestamps, categories)
+     cash_amounts = generate_cash_amounts()
+     chains = generate_chains()
+     susceptible_cards = generate_susceptible_cards(credit_cards)
+     normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards)
+     update_transactions(transactions, chains)
+
+     fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals)
+     update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts)
+
+     transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals)
+
+     # Cast the columns to the correct Pandas dtypes
+     transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num'])
+     transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude'])
+     transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude'])
+     transactions_df['datetime'] = pd.to_datetime(transactions_df['datetime'])
+
+     return transactions_df
+
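+ # Hypothetical end-to-end driver (not part of the committed module). Note that
+ # create_profiles_as_df must run before create_transactions_as_df, since
+ # generate_susceptible_cards reads the 'age' it attaches to each card.
+ # credit_cards = generate_list_credit_card_numbers()
+ # profiles_df = create_profiles_as_df(credit_cards)
+ # trans_df = create_transactions_as_df(credit_cards)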