import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from math import radians
# +
def card_owner_age(trans_df: pd.DataFrame, profiles_df: pd.DataFrame) -> pd.DataFrame:
    """Compute the card owner's age at the time of each transaction.

    Used only in feature pipelines (not online inference).
    Unit test with DataFrames and sample data.
    """
    age_df = trans_df.merge(profiles_df, on="cc_num", how="left")
    trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y")
    return trans_df
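
# A minimal sketch of the unit test the docstring above suggests; the sample
# rows and values here are invented for illustration, not real data.
def _example_card_owner_age() -> None:
    trans = pd.DataFrame({"cc_num": [111], "datetime": [pd.Timestamp("2022-01-01")]})
    profiles = pd.DataFrame({"cc_num": [111], "birthdate": [pd.Timestamp("1990-01-01")]})
    out = card_owner_age(trans, profiles)
    # 2022-01-01 minus 1990-01-01 is roughly 32 years.
    assert abs(out["age_at_transaction"].iloc[0] - 32.0) < 0.1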
def expiry_days(trans_df: pd.DataFrame, credit_cards_df: pd.DataFrame) -> pd.DataFrame:
    """Compute the number of days until the card expires at the time of each transaction.

    Used only in feature pipelines (not online inference).
    Unit test with DataFrames and sample data.
    """
    card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left")
    card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y")
    trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D")
    return trans_df
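
# A minimal sketch of the suggested unit test for `expiry_days`; the card
# number and "MM/YY" expiry string below are made up.
def _example_expiry_days() -> None:
    trans = pd.DataFrame({"cc_num": [111], "datetime": [pd.Timestamp("2022-01-01")]})
    cards = pd.DataFrame({"cc_num": [111], "expires": ["05/23"]})
    out = expiry_days(trans, cards)
    # "05/23" parses to 2023-05-01, which is 485 days after 2022-01-01.
    assert out["days_until_card_expires"].iloc[0] == 485.0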
# -
def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float) -> float:
    """Compute the Haversine central angle between consecutive coordinates in (long, lat).

    Series inputs are assumed to already be in radians (see `activity_level`,
    which converts the coordinate columns up front); scalar inputs are
    converted here. Multiply the result by the Earth's radius to get a distance.
    """
    if not isinstance(long, pd.Series):
        long = radians(long)
    if not isinstance(lat, pd.Series):
        lat = radians(lat)
    if not isinstance(prev_long, pd.Series):
        prev_long = radians(prev_long)
    if not isinstance(prev_lat, pd.Series):
        prev_lat = radians(prev_lat)

    long_diff = prev_long - long
    lat_diff = prev_lat - lat

    a = np.sin(lat_diff / 2.0) ** 2
    b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a + b))
    return c
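
# Quick sanity check for `haversine_distance` with scalar inputs: two points
# one degree of longitude apart on the equator subtend a central angle of
# radians(1) ~ 0.01745; multiply by Earth's radius (~6371 km) for kilometres.
def _example_haversine() -> None:
    angle = haversine_distance(long=1.0, lat=0.0, prev_long=0.0, prev_lat=0.0)
    assert abs(angle - radians(1.0)) < 1e-9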
def time_delta(prev_datetime: pd.Series, current_datetime: pd.Series) -> pd.Series:
    """Compute the elapsed time from the previous transaction to the current one."""
    return current_datetime - prev_datetime
def time_delta_to_days(time_delta: timedelta) -> float:
    """Convert a time delta to fractional days."""
    return time_delta.total_seconds() / 86400
def date_to_timestamp(date_obj: datetime) -> int:
    """Convert a datetime to a Unix timestamp in milliseconds."""
    return int(date_obj.timestamp() * 1000)

def timestamp_to_date(timestamp: int) -> datetime:
    """Convert a millisecond Unix timestamp back to a datetime (whole seconds only)."""
    return datetime.fromtimestamp(timestamp // 1000)
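
# Round-trip sketch for the millisecond-timestamp helpers. Note that
# `timestamp_to_date` floors to whole seconds, so sub-second precision
# does not survive the round trip.
def _example_timestamp_round_trip() -> None:
    dt = datetime(2022, 1, 1, 12, 30, 0)
    assert timestamp_to_date(date_to_timestamp(dt)) == dt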
def activity_level(trans_df: pd.DataFrame, lag: int) -> pd.DataFrame:
    """Compute lagged location and time deltas between consecutive transactions per card."""
    # Convert coordinates into radians:
    trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians)
    trans_df.sort_values(["datetime", "cc_num"], inplace=True)

    # When we call `haversine_distance`, we pass the long/lat of the current row and the
    # long/lat of the most recent prior purchase. By grouping the DataFrame by cc_num, we can
    # access the previous row's long/lat with pandas' `shift` operation; the first transaction
    # in each group has no predecessor, so it comes back NaN and is filled with 0 below.
    trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\
        .apply(lambda x: haversine_distance(x["longitude"], x["latitude"],
                                            x["longitude"].shift(lag), x["latitude"].shift(lag)))\
        .reset_index(level=0, drop=True)\
        .fillna(0)

    # Use the same `shift` operation to get the previous transaction time for a given cc_num.
    trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\
        .apply(lambda x: time_delta(x["datetime"].shift(lag), x["datetime"]))\
        .reset_index(level=0, drop=True)

    # Convert time_delta from a Timedelta to fractional days; the first transaction in each
    # group has no previous row after `shift`, so fill its NaN with 0.
    trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(time_delta_to_days)
    trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0)

    trans_df = trans_df[["tid", "datetime", "cc_num", "category", "amount", "city", "country",
                         "age_at_transaction", "days_until_card_expires",
                         f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]]

    # Convert datetime to a timestamp because of a UTC problem: Hopsworks assumes you use UTC,
    # so if your Python environment is not in UTC, the datetime will be wrong. With timestamps,
    # there are no UTC problems when performing point-in-time (PIT) joins.
    trans_df["datetime"] = trans_df["datetime"].map(date_to_timestamp)
    return trans_df
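
# Hedged sketch of the intended call order for the functions above; the
# upstream DataFrames and where they come from are assumptions, not part
# of this module.
def _example_feature_pipeline(trans_df: pd.DataFrame,
                              profiles_df: pd.DataFrame,
                              credit_cards_df: pd.DataFrame) -> pd.DataFrame:
    trans_df = card_owner_age(trans_df, profiles_df)
    trans_df = expiry_days(trans_df, credit_cards_df)
    # `activity_level` selects the columns the two calls above add.
    return activity_level(trans_df, lag=1)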
def aggregate_activity_by_hour(trans_df: pd.DataFrame, window_len: str) -> pd.DataFrame:
    """Compute rolling-window aggregates (volume, frequency, location delta) per credit card.

    `window_len` is a pandas offset alias such as "4h" and requires `datetime`
    to be a datetime-typed column.
    """
    cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime")

    # Moving average of transaction volume.
    df_mavg = pd.DataFrame(cc_group.mean())
    df_mavg.columns = ["trans_volume_mavg", "datetime"]
    df_mavg = df_mavg.reset_index(level=["cc_num"])
    df_mavg = df_mavg.drop(columns=["cc_num", "datetime"])
    df_mavg = df_mavg.sort_index()

    # Moving standard deviation of transaction volume.
    df_std = pd.DataFrame(cc_group.std())
    df_std.columns = ["trans_volume_mstd", "datetime"]
    df_std = df_std.reset_index(level=["cc_num"])
    df_std = df_std.drop(columns=["cc_num", "datetime"])
    df_std = df_std.fillna(0)
    df_std = df_std.sort_index()
    window_aggs_df = df_std.merge(df_mavg, left_index=True, right_index=True)

    # Moving count of transactions (frequency) within the window.
    df_count = pd.DataFrame(cc_group.count())
    df_count.columns = ["trans_freq", "datetime"]
    df_count = df_count.reset_index(level=["cc_num"])
    df_count = df_count.drop(columns=["cc_num", "datetime"])
    df_count = df_count.sort_index()
    window_aggs_df = window_aggs_df.merge(df_count, left_index=True, right_index=True)

    # Moving average of the location difference between consecutive transactions.
    df_loc_delta_mavg = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean()
    df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"]
    df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"])
    df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"])
    df_loc_delta_mavg = df_loc_delta_mavg.sort_index()
    window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg, left_index=True, right_index=True)

    window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(), left_index=True, right_index=True)
    return window_aggs_df
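
# Hedged sketch of computing the window aggregates on top of the
# `activity_level` features. The "4h" window is an illustrative choice;
# it assumes `datetime` is still a datetime column (i.e. this is applied
# before the millisecond-timestamp conversion at the end of `activity_level`).
def _example_window_aggregates(trans_df: pd.DataFrame) -> pd.DataFrame:
    return aggregate_activity_by_hour(trans_df, window_len="4h")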