import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from typing import List
from tqdm import tqdm
import os
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import json
data_start_date = '2018-07-01'

model_cols = [
    'BigNewsDay',
    'Quarter',
    'Perf5Day',
    'Perf5Day_n1',
    'DaysGreen',
    'DaysRed',
    'CurrentHigh30toClose',
    'CurrentLow30toClose',
    'CurrentClose30toClose',
    'CurrentRange30',
    'GapFill30',
    'CurrentGap',
    'RangePct',
    'RangePct_n1',
    'RangePct_n2',
    'OHLC4_VIX',
    'OHLC4_VIX_n1',
    'OHLC4_VIX_n2',
    'OHLC4_Current_Trend',
    'OHLC4_Trend',
    'CurrentVIXTrend',
    'SPX30IntraPerf',
    'VIX30IntraPerf',
    'VVIX30IntraPerf',
    # 'OpenL1',
    # 'OpenL2',
    # 'OpenH1',
    # 'OpenH2',
    'L1TouchPct',
    'L2TouchPct',
    'H1TouchPct',
    'H2TouchPct',
    'L1BreakPct',
    'L2BreakPct',
    'H1BreakPct',
    'H2BreakPct',
    'GreenProbas',
    'H1BreakTouchPct',
    'H2BreakTouchPct',
    'L1BreakTouchPct',
    'L2BreakTouchPct',
    'H1BreakH2TouchPct',
    'L1BreakL2TouchPct',
    'H1TouchGreenPct',
    'L1TouchRedPct'
    # 'GapFillGreenProba'
]
# If the dataset is gated/private, make sure you have run huggingface-cli login

def walk_forward_validation(df, target_column, num_periods):
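    """Run a walk-forward (expanding-window) validation.

    For each TimeSeriesSplit fold, a fresh LGBMClassifier is fit on all rows up
    to the split point and scored on the next `num_periods` rows, so every
    prediction is strictly out-of-sample. Returns the concatenated
    true/predicted frame (with calibrated probabilities) and the model from
    the final fold.
    """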
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(bool)

    # Model
    # model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
    tscv = TimeSeriesSplit(n_splits=len(df) - 1, max_train_size=None, test_size=num_periods)  # each split holds out num_periods rows

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time,
    # splitting the time series data using TimeSeriesSplit
    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
        # Extract the training and testing data for the current split
        X_train = df.drop(target_column, axis=1).iloc[train_index]
        y_train = df[target_column].iloc[train_index]
        X_test = df.drop(target_column, axis=1).iloc[test_index]
        y_test = df[target_column].iloc[test_index]

        y_train = y_train.astype(bool)
        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(X_train, y_train)
        # Make a prediction on the test data
        predictions = model.predict_proba(X_test)[:, -1]

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
        overall_results.append(result_df)

    df_results = pd.concat(overall_results)

    # Calibrate probabilities: bin the predictions seen so far and use each
    # bin's empirical hit rate as the calibrated probability.
    def get_quantiles(df, col_name, q):
        return df.groupby(pd.cut(df[col_name], q))['True'].mean()

    greenprobas = []
    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
        p = None  # default when there is no history or no matching bin yet
        try:
            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
            for q in df_q.index:
                if q.left <= pct <= q.right:
                    p = df_q[q]
        except Exception:
            p = None
        greenprobas.append(p)

    df_results['CalibPredicted'] = greenprobas

    return df_results, model
def seq_predict_proba(df, trained_clf_model):
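    """Return the positive-class probability for each row of `df` from a fitted classifier."""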
    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:, -1]
    return clf_pred_proba
def get_data(periods_30m=1):
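    """Assemble the modeling dataset.

    Pulls FRED release calendars, historical 30-minute SPX/VIX/VVIX bars from
    the boomsss/spx_intra dataset (topped up incrementally via yfinance), and
    daily OHLC history, then derives the engineered features in `model_cols`.
    `periods_30m` is how many 30-minute bars from the open to keep per day.
    Returns (full feature frame, training frame, index of last complete row).
    """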
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]
    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
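        """Fetch all release dates for a FRED release ID from the fred/release/dates XML endpoint."""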
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    econ_dfs = {}
    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]
    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        econ_dfs[et] = df

    release_ids = [
        "10",   # "Consumer Price Index"
        "46",   # "Producer Price Index"
        "50",   # "Employment Situation"
        "53",   # "Gross Domestic Product"
        "103",  # "Discount Rate Meeting Minutes"
        "180",  # "Unemployment Insurance Weekly Claims Report"
        "194",  # "ADP National Employment Report"
        "323"   # "Trimmed Mean PCE Inflation Rate"
    ]
    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}
    for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

    # Create a DF that has all dates with the name of the col as 1.
    # Once merged on the main dataframe, days with econ events will be 1 or NaN; fill NaN with 0.
    # This column serves as the true/false indicator of whether there was economic data released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)

    vix = yf.Ticker('^VIX')
    vvix = yf.Ticker('^VVIX')
    spx = yf.Ticker('^GSPC')

    # Pull in data
    data_files = {"spx": "SPX_full_30min.txt", "vix": "VIX_full_30min.txt", "vvix": "VVIX_full_30min.txt"}
    data = load_dataset("boomsss/spx_intra", data_files=data_files)
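    # Each dataset row is a raw CSV line (assumed "Datetime,Open,High,Low,Close"),
    # so parse the text field manually into typed columns.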
    dfs = []
    for ticker in data.keys():
        rows = [d['text'] for d in data[ticker]]
        rows = [x.split(',') for x in rows]
        fr = pd.DataFrame(columns=[
            'Datetime', 'Open', 'High', 'Low', 'Close'
        ], data=rows)
        fr['Datetime'] = pd.to_datetime(fr['Datetime'])
        fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
        fr = fr.set_index('Datetime')
        fr['Open'] = pd.to_numeric(fr['Open'])
        fr['High'] = pd.to_numeric(fr['High'])
        fr['Low'] = pd.to_numeric(fr['Low'])
        fr['Close'] = pd.to_numeric(fr['Close'])
        dfs.append(fr)

    df_30m = pd.concat(dfs, axis=1)
    df_30m.columns = [
        'Open30',
        'High30',
        'Low30',
        'Close30',
        'Open_VIX30',
        'High_VIX30',
        'Low_VIX30',
        'Close_VIX30',
        'Open_VVIX30',
        'High_VVIX30',
        'Low_VVIX30',
        'Close_VVIX30'
    ]

    # Get incremental date
    last_date = df_30m.index.date[-1]
    last_date = last_date + datetime.timedelta(days=1)

    # Get incremental data for each index
    spx1 = yf.Ticker('^GSPC')
    vix1 = yf.Ticker('^VIX')
    vvix1 = yf.Ticker('^VVIX')
    yfp = spx1.history(start=last_date, interval='30m')
    yf_vix = vix1.history(start=last_date, interval='30m')
    yf_vvix = vvix1.history(start=last_date, interval='30m')

    if len(yfp) > 0:
        # Convert indexes to Eastern time if not already
        for _df in [yfp, yf_vix, yf_vvix]:
            if str(_df.index.tz) != 'America/New_York':
                _df['Datetime'] = pd.to_datetime(_df.index)
                _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
                _df.set_index('Datetime', inplace=True)
        # Concat them
        df_inc = pd.concat([
            yfp[['Open', 'High', 'Low', 'Close']],
            yf_vix[['Open', 'High', 'Low', 'Close']],
            yf_vvix[['Open', 'High', 'Low', 'Close']]
        ], axis=1)
        df_inc.columns = df_30m.columns
        df_inc = df_inc.loc[
            (df_inc.index.time >= datetime.time(9, 30)) & (df_inc.index.time < datetime.time(16, 0))
        ]
        df_30m = pd.concat([df_30m, df_inc])
    else:
        df_30m = df_30m.copy()

    # Keep regular-session bars only, then the first `periods_30m` bars of each day
    df_30m = df_30m.loc[
        (df_30m.index.time >= datetime.time(9, 30)) & (df_30m.index.time < datetime.time(16, 0))
    ]
    df_30m['dt'] = df_30m.index.date
    df_30m = df_30m.groupby('dt').head(periods_30m)
    df_30m = df_30m.set_index('dt', drop=True)
    df_30m.index.name = 'Datetime'

    df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
    df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
    df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1

    # Aggregate the kept 30-minute bars to one row per day
    opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
    highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
    lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
    closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
    spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
    vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
    vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
    df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)

    prices_vix = vix.history(start=data_start_date, interval='1d')
    prices_vvix = vvix.history(start=data_start_date, interval='1d')
    prices_spx = spx.history(start=data_start_date, interval='1d')

    # Normalize each tz-aware daily index to plain dates (tz-naive midnight timestamps)
    for prices in (prices_spx, prices_vix, prices_vvix):
        prices.index = pd.DatetimeIndex([str(x).split()[0] for x in prices.index])

    data = prices_spx.merge(df_intra, left_index=True, right_index=True)
    data = data.merge(prices_vix[['Open', 'High', 'Low', 'Close']], left_index=True, right_index=True, suffixes=['', '_VIX'])
    data = data.merge(prices_vvix[['Open', 'High', 'Low', 'Close']], left_index=True, right_index=True, suffixes=['', '_VVIX'])

    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)  # one-day lag, matching Perf5Day_n1
    data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
    data['VVIX5Day_n1'] = data['VVIX5Day'].shift(1).astype(bool)

    data['Range'] = data[['Open', 'High']].max(axis=1) - data[['Low', 'Open']].min(axis=1)  # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX', 'High_VIX', 'Low_VIX', 'Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open', 'High', 'Low', 'Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)  # two-day lag
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGapHist'] = data['CurrentGap'].copy()
    data['CurrentGap'] = data['CurrentGap'].shift(-1)
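    # Note: 'Current*' columns hold next-session values shifted back one row, so each
    # row carries the features that are observable shortly after the next day's open --
    # the information available when predicting that day's close.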
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    # Intraday features
    data['CurrentOpen30'] = data['Open30'].shift(-1)
    data['CurrentHigh30'] = data['High30'].shift(-1)
    data['CurrentLow30'] = data['Low30'].shift(-1)
    data['CurrentClose30'] = data['Close30'].shift(-1)
    data['CurrentOHLC430'] = data[['CurrentOpen30', 'CurrentHigh30', 'CurrentLow30', 'CurrentClose30']].mean(axis=1)  # OHLC4 = four-price average
    data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
    data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1

    data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
    data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
    data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']

    # First-30-minute moves of the next session, relative to today's close
    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close
                         for high, low, prev_close, gap
                         in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]

    # Regression target -- the next day's OHLC4 return over the prior close
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Target for clf -- whether tomorrow will close above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    # Calculate up: prior day's open-to-high move as a % of the prior close
    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
    # Calculate upSD
    data['upSD'] = data['up'].rolling(30).std(ddof=0)
    # Calculate aveUp
    data['aveUp'] = data['up'].rolling(30).mean()
    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
    data['downSD'] = data['down'].rolling(30).std(ddof=0)
    data['aveDown'] = data['down'].rolling(30).mean()
    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
    data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
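    # The H1/H2 and L1/L2 levels above are expected intraday extremes projected from
    # today's open: H1 adds the 30-day average open-to-high move, H2 adds one standard
    # deviation on top; L1/L2 mirror that on the downside. For example, with
    # aveUp = 0.5, upSD = 0.3 and Open = 4000: H1 = 4020 and H2 = 4032.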
    data = data.assign(
        L1Touch=lambda x: x['Low'] < x['L1'],
        L2Touch=lambda x: x['Low'] < x['L2'],
        H1Touch=lambda x: x['High'] > x['H1'],
        H2Touch=lambda x: x['High'] > x['H2'],
        L1Break=lambda x: x['Close'] < x['L1'],
        L1TouchRed=lambda x: (x['Low'] < x['L1']) & (x['Close'] < x['PrevClose']),
        L2TouchL1Break=lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
        L2Break=lambda x: x['Close'] < x['L2'],
        H1Break=lambda x: x['Close'] > x['H1'],
        H1TouchGreen=lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
        H2TouchH1Break=lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
        H2Break=lambda x: x['Close'] > x['H2'],
        OpenL1=lambda x: np.where(x['Open'] < x['L1'], 1, 0),
        OpenL2=lambda x: np.where(x['Open'] < x['L2'], 1, 0),
        OpenH1=lambda x: np.where(x['Open'] > x['H1'], 1, 0),
        OpenH2=lambda x: np.where(x['Open'] > x['H2'], 1, 0),
        CloseL1=lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
        CloseL2=lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
        CloseH1=lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
        CloseH2=lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
    )
    data['OpenL1'] = data['OpenL1'].shift(-1)
    data['OpenL2'] = data['OpenL2'].shift(-1)
    data['OpenH1'] = data['OpenH1'].shift(-1)
    data['OpenH2'] = data['OpenH2'].shift(-1)
    data['CloseL1'] = data['CloseL1'].shift(-1)
    data['CloseL2'] = data['CloseL2'].shift(-1)
    data['CloseH1'] = data['CloseH1'].shift(-1)
    data['CloseH2'] = data['CloseH2'].shift(-1)

    level_cols = [
        'L1Touch',
        'L2Touch',
        'H1Touch',
        'H2Touch',
        'L1Break',
        'L2Break',
        'H1Break',
        'H2Break'
    ]

    # Rolling 100-day frequency of each touch/break event
    for col in level_cols:
        data[col + 'Pct'] = data[col].rolling(100).mean()
        # data[col+'Pct'] = data[col+'Pct'].shift(-1)

    # Rolling conditional frequencies, e.g. P(close beyond H1 | high touched H1)
    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
    data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
    data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
    data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
    data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
    data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
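    # Empirical green-day probability conditioned on the first-30-minute move:
    # bucket history into deciles of HistClose30toPrevClose and look up the mean
    # GreenDay rate of the bucket the current move falls into, using only rows
    # before i (data.iloc[:i]) to avoid lookahead.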
    def get_quintiles(df, col_name, q):
        return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()

    probas = []
    # Given the current price level
    for i, pct in enumerate(data['CurrentClose30toClose']):
        p = None  # default when there is no history or no matching bucket yet
        try:
            # Split history into deciles of the first-30-minute move
            df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
            for q in df_q.index:
                if q.left <= pct <= q.right:
                    p = df_q[q]
        except Exception:
            p = None
        probas.append(p)

    # gapfills = []
    # for i, pct in enumerate(data['CurrentGap']):
    #     try:
    #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
    #         for q in df_q.index:
    #             if q.left <= pct <= q.right:
    #                 p = df_q[q]
    #     except:
    #         p = None
    #     gapfills.append(p)

    data['GreenProbas'] = probas
    # data['GapFillGreenProba'] = gapfills

    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding DF of the release
        data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
        # Create a column that shifts the value in the merged column up by 1
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
    def cumul_sum(col):
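        """Count consecutive 1s: the streak length so far at each position, reset to 0 on a 0."""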
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

    final_row = data.index[-2]  # last row with a realized target
    exp_row = data.index[-1]    # pending (expected) row

    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
    df_final = df_final.dropna(subset=['Target', 'Target_clf'])
    # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
    return data, df_final, final_row
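
# Example usage (a sketch, not part of the app's entry point; num_periods=1 is an
# assumption meaning one test day per walk-forward split). The row at final_row is
# the last one with a realized target, so the frame's final row is the pending session.
# if __name__ == '__main__':
#     data, df_final, final_row = get_data(periods_30m=1)
#     res, trained_model = walk_forward_validation(df_final, 'Target_clf', num_periods=1)
#     next_proba = seq_predict_proba(data.tail(1), trained_model)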