adding intraday models

- app.py +382 -347
- model_1h.py +399 -0
- model_30m.py +387 -0
- model_day.py +323 -0

app.py CHANGED
@@ -1,326 +1,7 @@
 import streamlit as st
 import pandas as pd
-import pandas_datareader as pdr
-import numpy as np
-import yfinance as yf
-import json
-import requests
-from bs4 import BeautifulSoup
-from typing import List
-import xgboost as xgb
-from tqdm import tqdm
-from sklearn import linear_model
-import joblib
-import os
 from sklearn.metrics import roc_auc_score, precision_score, recall_score
-import datetime
 from pandas.tseries.offsets import BDay
-from datasets import load_dataset
-
-# If the dataset is gated/private, make sure you have run huggingface-cli login
-dataset = load_dataset("boomsss/SPX_full_30min", split="train")
-
-def walk_forward_validation(df, target_column, num_training_rows, num_periods):
-
-    # Create the regression model (XGBRegressor alternative kept commented out)
-    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
-    model = linear_model.LinearRegression()
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column, axis=1).iloc[:i]
-        y_train = df[target_column].iloc[:i]
-        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model.predict(X_test)
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model.save_model('model_lr.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model
-
-def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
-
-    # Run the regression model first to get its output
-    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
-    # joblib.dump(model1, 'model1.bin')
-
-    # Merge the result df back on the df for feeding into the classifier
-    for_merge = res[['Predicted']]
-    for_merge.columns = ['RegrModelOut']
-    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
-    df = df.merge(for_merge, left_index=True, right_index=True)
-    df = df.drop(columns=[target_column_regr])
-    df = df[[
-        'CurrentGap','RegrModelOut',target_column_clf
-    ]]
-
-    df[target_column_clf] = df[target_column_clf].astype(bool)
-    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
-
-    # Create an XGBClassifier model
-    model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
-    # model = linear_model.LogisticRegression(max_iter=1500)
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), 'CLF Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
-        y_train = df[target_column_clf].iloc[:i]
-        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column_clf].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model2.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model2.predict_proba(X_test)[:,-1]
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model1.save_model('model_ensemble.bin')
-    # joblib.dump(model2, 'model2.bin')
-    # Return the true and predicted values, and fitted models
-    return df_results, model1, model2
-
-def seq_predict_proba(df, trained_reg_model, trained_clf_model):
-    regr_pred = trained_reg_model.predict(df)
-    regr_pred = regr_pred > 0
-    new_df = df.copy()
-    new_df['RegrModelOut'] = regr_pred
-    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut']])[:,-1]
-    return clf_pred_proba
-
-def get_data():
-    # f = open('settings.json')
-    # j = json.load(f)
-    # API_KEY_FRED = j["API_KEY_FRED"]
-
-    API_KEY_FRED = os.getenv('API_KEY_FRED')
-
-    def parse_release_dates(release_id: str) -> List[str]:
-        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(release_dates_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        dates = []
-        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
-            dates.append(release_date_tag.text)
-        return dates
-
-    def parse_release_dates_obs(series_id: str) -> List[str]:
-        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(obs_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        observations = []
-        for observation_tag in soup.find_all('observation'):
-            date = observation_tag.get('date')
-            value = observation_tag.get('value')
-            observations.append((date, value))
-        return observations
-
-    econ_dfs = {}
-
-    econ_tickers = [
-        'WALCL',
-        'NFCI',
-        'WRESBAL'
-    ]
-
-    for et in tqdm(econ_tickers, desc='getting econ tickers'):
-        # p = parse_release_dates_obs(et)
-        # df = pd.DataFrame(columns = ['ds',et], data = p)
-        df = pdr.get_data_fred(et)
-        df.index = df.index.rename('ds')
-        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
-        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
-        econ_dfs[et] = df
-
-    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
-    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
-
-    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
-    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
-
-    release_ids = [
-        "10",   # "Consumer Price Index"
-        "46",   # "Producer Price Index"
-        "50",   # "Employment Situation"
-        "53",   # "Gross Domestic Product"
-        "103",  # "Discount Rate Meeting Minutes"
-        "180",  # "Unemployment Insurance Weekly Claims Report"
-        "194",  # "ADP National Employment Report"
-        "323"   # "Trimmed Mean PCE Inflation Rate"
-    ]
-
-    release_names = [
-        "CPI",
-        "PPI",
-        "NFP",
-        "GDP",
-        "FOMC",
-        "UNEMP",
-        "ADP",
-        "PCE"
-    ]
-
-    releases = {}
-
-    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
-        releases[rid] = {}
-        releases[rid]['dates'] = parse_release_dates(rid)
-        releases[rid]['name'] = n
-
-    # Create a DF that has all dates with the name of the col as 1
-    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
-    # This column serves as the true/false indicator of whether there was economic data released that day.
-    for rid in tqdm(release_ids, desc='Making indicators'):
-        releases[rid]['df'] = pd.DataFrame(
-            index=releases[rid]['dates'],
-            data={
-                releases[rid]['name']: 1
-            })
-        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
-        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
-        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
-
-    vix = yf.Ticker('^VIX')
-    spx = yf.Ticker('^GSPC')
-
-    prices_vix = vix.history(start='2018-07-01', interval='1d')
-    prices_spx = spx.history(start='2018-07-01', interval='1d')
-    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
-    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
-    prices_spx.index = prices_spx['index']
-    prices_spx = prices_spx.drop(columns='index')
-
-    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
-    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
-    prices_vix.index = prices_vix['index']
-    prices_vix = prices_vix.drop(columns='index')
-
-    data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-    data.index = pd.DatetimeIndex(data.index)
-
-    # Features
-    data['PrevClose'] = data['Close'].shift(1)
-    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
-    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
-    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
-    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
-    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
-
-    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
-    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
-
-    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
-    data['RangePct'] = data['Range'] / data['Close']
-    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
-    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
-    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
-    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
-    data['RangePct_n1'] = data['RangePct'].shift(1)
-    data['RangePct_n2'] = data['RangePct'].shift(2)
-    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
-    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
-    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
-    data['CurrentGap'] = data['CurrentGap'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['DayOfWeek'] = data['DayOfWeek'].dt.day
-
-    # Regression target -- the next day's OHLC4 relative to today's close
-    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
-    data['Target'] = data['Target'].shift(-1)
-    # data['Target'] = data['RangePct'].shift(-1)
-
-    # Target for clf -- whether tomorrow will close above or below today's close
-    data['Target_clf'] = data['Close'] > data['PrevClose']
-    data['Target_clf'] = data['Target_clf'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['Quarter'] = data['DayOfWeek'].dt.quarter
-    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
-
-    for rid in tqdm(release_ids, desc='Merging econ data'):
-        # Get the name of the release
-        n = releases[rid]['name']
-        # Merge the corresponding DF of the release
-        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
-        # Create a column that shifts the value in the merged column up by 1
-        data[f'{n}_shift'] = data[n].shift(-1)
-        # Fill the rest with zeroes
-        data[n] = data[n].fillna(0)
-        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
-
-    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
-
-    def cumul_sum(col):
-        nums = []
-        s = 0
-        for x in col:
-            if x == 1:
-                s += 1
-            elif x == 0:
-                s = 0
-            nums.append(s)
-        return nums
-
-    consec_green = cumul_sum(data['GreenDay'].values)
-    consec_red = cumul_sum(data['RedDay'].values)
-
-    data['DaysGreen'] = consec_green
-    data['DaysRed'] = consec_red
-
-    final_row = data.index[-2]
-
-    exp_row = data.index[-1]
-
-    df_final = data.loc[:final_row,
-        [
-            'BigNewsDay',
-            'Quarter',
-            'Perf5Day',
-            'Perf5Day_n1',
-            'DaysGreen',
-            'DaysRed',
-            # 'OHLC4_Trend',
-            # 'OHLC4_Trend_n1',
-            # 'OHLC4_Trend_n2',
-            # 'VIX5Day',
-            # 'VIX5Day_n1',
-            'CurrentGap',
-            'RangePct',
-            'RangePct_n1',
-            'RangePct_n2',
-            'OHLC4_VIX',
-            'OHLC4_VIX_n1',
-            'OHLC4_VIX_n2',
-            'Target',
-            'Target_clf'
-        ]]
-    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
-    return data, df_final, final_row
 
 st.set_page_config(
     page_title="Gameday Model for $SPX",
@@ -333,7 +14,8 @@ st.markdown('**PLEASE NOTE:** Model should be run at or after market open. Docum
 if st.button("🧹 Clear All"):
     st.cache_data.clear()
 
-if st.button('🤖 Run it'):
+if st.button('🌞 At Open'):
+    from model_day import *
     with st.spinner('Loading data...'):
         data, df_final, final_row = get_data()
         # st.success("✅ Historical data")
@@ -354,11 +36,6 @@ if st.button('🤖 Run it'):
                                         'Perf5Day_n1',
                                         'DaysGreen',
                                         'DaysRed',
-                                        # 'OHLC4_Trend',
-                                        # 'OHLC4_Trend_n1',
-                                        # 'OHLC4_Trend_n2',
-                                        # 'VIX5Day',
-                                        # 'VIX5Day_n1',
                                         'CurrentGap',
                                         'RangePct',
                                         'RangePct_n1',
@@ -379,11 +56,6 @@ if st.button('🤖 Run it'):
         new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
         new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
         new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
-        # new_pred['OHLC4_Trend'] = new_pred['OHLC4_Trend'].astype(float)
-        # new_pred['OHLC4_Trend_n1'] = new_pred['OHLC4_Trend_n1'].astype(float)
-        # new_pred['OHLC4_Trend_n2'] = new_pred['OHLC4_Trend_n2'].astype(float)
-        # new_pred['VIX5Day'] = new_pred['VIX5Day'].astype(bool)
-        # new_pred['VIX5Day_n1'] = new_pred['VIX5Day_n1'].astype(bool)
         new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
         new_pred['RangePct'] = new_pred['RangePct'].astype(float)
         new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
@@ -396,12 +68,7 @@ if st.button('🤖 Run it'):
     tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
 
     seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
-    # above_pct_green = res1.loc[res1['Predicted'] >= seq_proba, 'True'].mean()
-    # len_above_pct_green = len(res1.loc[res1['Predicted'] >= seq_proba])
-    # below_pct_red = 1 - res1.loc[res1['Predicted'] <= seq_proba, 'True'].mean()
-    # len_below_pct_red = len(res1.loc[res1['Predicted'] <= seq_proba])
 
-    # Calc green and red probas
     green_proba = seq_proba[0]
     red_proba = 1 - green_proba
     do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
@@ -514,8 +181,7 @@ if st.button('🤖 Run it'):
     perf_daily = res1.copy()
     perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
 
-
-    tab1.subheader(f'Pred for {curr_date}')
+    tab1.subheader(f'Pred for {curr_date} as of 6:30AM PST')
     tab1.write(results)
     tab1.write(df_probas)
 
@@ -528,16 +194,385 @@ if st.button('🤖 Run it'):
     tab4.subheader('Performance')
     tab4.write(df_performance)
     tab4.write(perf_daily)
-    tab4.write(dataset)
 
-
-
-
-
-
-
-    #
-
-
+if st.button('⌚ After 30 Mins'):
+    from model_30m import *
+    with st.spinner('Loading data...'):
+        data, df_final, final_row = get_data()
+        # st.success("✅ Historical data")
+
+    with st.spinner("Training models..."):
+        def train_models():
+            res1, xgbr, seq2 = walk_forward_validation_seq(df_final.dropna(), 'Target_clf', 'Target', 100, 1)
+            return res1, xgbr, seq2
+        res1, xgbr, seq2 = train_models()
+        # st.success("✅ Models trained")
+
+    with st.spinner("Getting new prediction..."):
+
+        # Get last row
+        new_pred = data.loc[final_row, ['BigNewsDay',
+                                        'Quarter',
+                                        'Perf5Day',
+                                        'Perf5Day_n1',
+                                        'DaysGreen',
+                                        'DaysRed',
+                                        'CurrentHigh30toClose',
+                                        'CurrentLow30toClose',
+                                        'CurrentClose30toClose',
+                                        'CurrentRange30',
+                                        'GapFill30',
+                                        'CurrentGap',
+                                        'RangePct',
+                                        'RangePct_n1',
+                                        'RangePct_n2',
+                                        'OHLC4_VIX',
+                                        'OHLC4_VIX_n1',
+                                        'OHLC4_VIX_n2']]
+
+        new_pred = pd.DataFrame(new_pred).T
+        # new_pred_show = pd.DataFrame(index=[new_pred.columns], columns=[new_pred.index], data=[[v] for v in new_pred.values])
+        # last_date = datetime.datetime.strptime(data.loc[final_row], '%Y-%m-%d')
+        curr_date = final_row + BDay(1)
+        curr_date = curr_date.strftime('%Y-%m-%d')
+
+        new_pred['BigNewsDay'] = new_pred['BigNewsDay'].astype(float)
+        new_pred['Quarter'] = new_pred['Quarter'].astype(int)
+        new_pred['Perf5Day'] = new_pred['Perf5Day'].astype(bool)
+        new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
+        new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
+        new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
+        new_pred['CurrentHigh30toClose'] = new_pred['CurrentHigh30toClose'].astype(float)
+        new_pred['CurrentLow30toClose'] = new_pred['CurrentLow30toClose'].astype(float)
+        new_pred['CurrentClose30toClose'] = new_pred['CurrentClose30toClose'].astype(float)
+        new_pred['CurrentRange30'] = new_pred['CurrentRange30'].astype(float)
+        new_pred['GapFill30'] = new_pred['GapFill30'].astype(float)
+        new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
+        new_pred['RangePct'] = new_pred['RangePct'].astype(float)
+        new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
+        new_pred['RangePct_n2'] = new_pred['RangePct_n2'].astype(float)
+        new_pred['OHLC4_VIX'] = new_pred['OHLC4_VIX'].astype(float)
+        new_pred['OHLC4_VIX_n1'] = new_pred['OHLC4_VIX_n1'].astype(float)
+        new_pred['OHLC4_VIX_n2'] = new_pred['OHLC4_VIX_n2'].astype(float)
+
+    st.success("✅ All done!")
+    tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
+
+    seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
+
+    green_proba = seq_proba[0]
+    red_proba = 1 - green_proba
+    do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
+    stdev = 0.01
+    score = None
+    num_obs = None
+    cond = None
+    historical_proba = None
+    text_cond = None
+    operator = None
+
+    if do_not_play:
+        text_cond = '🟨'
+        operator = ''
+        score = seq_proba[0]
+        cond = (res1['Predicted'] > 0.4) & (res1['Predicted'] <= 0.6)
+        num_obs = len(res1.loc[cond])
+        historical_proba = res1.loc[cond, 'True'].mean()
+
+    elif green_proba > red_proba:
+        # If the day is predicted to be green, say so
+        text_cond = '🟩'
+        operator = '>='
+        score = green_proba
+        # How many with this score?
+        cond = (res1['Predicted'] >= green_proba)
+        num_obs = len(res1.loc[cond])
+        # How often green?
+        historical_proba = res1.loc[cond, 'True'].mean()
+        # print(cond)
+
+    elif green_proba <= red_proba:
+        # If the day is predicted to be red, say so
+        text_cond = '🟥'
+        operator = '<='
+        score = red_proba
+        # How many with this score?
+        cond = (res1['Predicted'] <= red_proba)
+        num_obs = len(res1.loc[cond])
+        # How often green?
+        historical_proba = 1 - res1.loc[cond, 'True'].mean()
+        # print(cond)
+
+    score_fmt = f'{score:.1%}'
+
+    results = pd.DataFrame(index=[
+        'PrevClose',
+        'Confidence Score',
+        'Success Rate',
+        f'NumObs {operator} {"" if do_not_play else score_fmt}',
+    ], data = [
+        f"{data.loc[final_row,'Close']:.2f}",
+        f'{text_cond} {score:.1%}',
+        f'{historical_proba:.1%}',
+        num_obs,
+    ])
+
+    results.columns = ['Outputs']
+
+    # st.subheader('New Prediction')
+
+    # df_probas = res1.groupby(pd.qcut(res1['Predicted'],5)).agg({'True':[np.mean,len,np.sum]})
+    df_probas = res1.groupby(pd.cut(res1['Predicted'],[-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf])).agg({'True':[np.mean,len,np.sum]})
+    df_probas.columns = ['PctGreen','NumObs','NumGreen']
+
+    roc_auc_score_all = roc_auc_score(res1['True'].astype(int), res1['Predicted'].values)
+    precision_score_all = precision_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
+    recall_score_all = recall_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
+    len_all = len(res1)
+
+    res2_filtered = res1.loc[(res1['Predicted'] > 0.6) | (res1['Predicted'] <= 0.4)]
+
+    roc_auc_score_hi = roc_auc_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'].values)
+    precision_score_hi = precision_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
+    recall_score_hi = recall_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
+    len_hi = len(res2_filtered)
+
+    df_performance = pd.DataFrame(
+        index=[
+            'N',
+            'ROC AUC',
+            'Precision',
+            'Recall'
+        ],
+        columns = [
+            'All',
+            'High Confidence'
+        ],
+        data = [
+            [len_all, len_hi],
+            [roc_auc_score_all, roc_auc_score_hi],
+            [precision_score_all, precision_score_hi],
+            [recall_score_all, recall_score_hi]
+        ]
+    ).round(2)
+
+    def get_acc(t, p):
+        if t == False and p <= 0.4:
+            return '✅'
+        elif t == True and p > 0.6:
+            return '✅'
+        elif t == False and p > 0.6:
+            return '❌'
+        elif t == True and p <= 0.4:
+            return '❌'
+        else:
+            return '🟨'
+
+    perf_daily = res1.copy()
+    perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
+
+    tab1.subheader(f'Pred for {curr_date} as of 7AM PST')
+    tab1.write(results)
+    tab1.write(df_probas)
+
+    tab2.subheader('Latest Data for Pred')
+    tab2.write(new_pred)
+
+    tab3.subheader('Historical Data')
+    tab3.write(df_final)
+
+    tab4.subheader('Performance')
+    tab4.write(df_performance)
+    tab4.write(perf_daily.sort_index(ascending=False))
+
+if st.button('⏳ After 60 Mins'):
+    from model_1h import *
+    with st.spinner('Loading data...'):
+        data, df_final, final_row = get_data()
+        # st.success("✅ Historical data")
+
+    with st.spinner("Training models..."):
+        def train_models():
+            res1, xgbr, seq2 = walk_forward_validation_seq(df_final.dropna(), 'Target_clf', 'Target', 100, 1)
+            return res1, xgbr, seq2
+        res1, xgbr, seq2 = train_models()
+        # st.success("✅ Models trained")
+
+    with st.spinner("Getting new prediction..."):
+
+        # Get last row
+        new_pred = data.loc[final_row, ['BigNewsDay',
+                                        'Quarter',
+                                        'Perf5Day',
+                                        'Perf5Day_n1',
+                                        'DaysGreen',
+                                        'DaysRed',
+                                        'CurrentHigh30toClose',
+                                        'CurrentLow30toClose',
+                                        'CurrentClose30toClose',
+                                        'CurrentRange30',
+                                        'GapFill30',
+                                        'CurrentGap',
+                                        'RangePct',
+                                        'RangePct_n1',
+                                        'RangePct_n2',
+                                        'OHLC4_VIX',
+                                        'OHLC4_VIX_n1',
+                                        'OHLC4_VIX_n2']]
+
+        new_pred = pd.DataFrame(new_pred).T
+        # new_pred_show = pd.DataFrame(index=[new_pred.columns], columns=[new_pred.index], data=[[v] for v in new_pred.values])
+        # last_date = datetime.datetime.strptime(data.loc[final_row], '%Y-%m-%d')
+        curr_date = final_row + BDay(1)
+        curr_date = curr_date.strftime('%Y-%m-%d')
+
+        new_pred['BigNewsDay'] = new_pred['BigNewsDay'].astype(float)
+        new_pred['Quarter'] = new_pred['Quarter'].astype(int)
+        new_pred['Perf5Day'] = new_pred['Perf5Day'].astype(bool)
+        new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
+        new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
+        new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
+        new_pred['CurrentHigh30toClose'] = new_pred['CurrentHigh30toClose'].astype(float)
+        new_pred['CurrentLow30toClose'] = new_pred['CurrentLow30toClose'].astype(float)
+        new_pred['CurrentClose30toClose'] = new_pred['CurrentClose30toClose'].astype(float)
+        new_pred['CurrentRange30'] = new_pred['CurrentRange30'].astype(float)
+        new_pred['GapFill30'] = new_pred['GapFill30'].astype(float)
+        new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
+        new_pred['RangePct'] = new_pred['RangePct'].astype(float)
+        new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
+        new_pred['RangePct_n2'] = new_pred['RangePct_n2'].astype(float)
+        new_pred['OHLC4_VIX'] = new_pred['OHLC4_VIX'].astype(float)
+        new_pred['OHLC4_VIX_n1'] = new_pred['OHLC4_VIX_n1'].astype(float)
+        new_pred['OHLC4_VIX_n2'] = new_pred['OHLC4_VIX_n2'].astype(float)
+
+    st.success("✅ All done!")
+    tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
+
+    seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
+
+    green_proba = seq_proba[0]
+    red_proba = 1 - green_proba
+    do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
+    stdev = 0.01
+    score = None
+    num_obs = None
+    cond = None
+    historical_proba = None
+    text_cond = None
+    operator = None
+
+    if do_not_play:
+        text_cond = '🟨'
+        operator = ''
+        score = seq_proba[0]
+        cond = (res1['Predicted'] > 0.4) & (res1['Predicted'] <= 0.6)
+        num_obs = len(res1.loc[cond])
+        historical_proba = res1.loc[cond, 'True'].mean()
+
+    elif green_proba > red_proba:
+        # If the day is predicted to be green, say so
+        text_cond = '🟩'
+        operator = '>='
+        score = green_proba
+        # How many with this score?
+        cond = (res1['Predicted'] >= green_proba)
+        num_obs = len(res1.loc[cond])
+        # How often green?
+        historical_proba = res1.loc[cond, 'True'].mean()
+        # print(cond)
+
+    elif green_proba <= red_proba:
+        # If the day is predicted to be red, say so
+        text_cond = '🟥'
+        operator = '<='
+        score = red_proba
+        # How many with this score?
+        cond = (res1['Predicted'] <= red_proba)
+        num_obs = len(res1.loc[cond])
+        # How often green?
+        historical_proba = 1 - res1.loc[cond, 'True'].mean()
+        # print(cond)
+
+    score_fmt = f'{score:.1%}'
+
+    results = pd.DataFrame(index=[
+        'PrevClose',
+        'Confidence Score',
+        'Success Rate',
+        f'NumObs {operator} {"" if do_not_play else score_fmt}',
+    ], data = [
+        f"{data.loc[final_row,'Close']:.2f}",
+        f'{text_cond} {score:.1%}',
+        f'{historical_proba:.1%}',
+        num_obs,
+    ])
+
+    results.columns = ['Outputs']
+
+    # st.subheader('New Prediction')
+
+    # df_probas = res1.groupby(pd.qcut(res1['Predicted'],5)).agg({'True':[np.mean,len,np.sum]})
+    df_probas = res1.groupby(pd.cut(res1['Predicted'],[-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf])).agg({'True':[np.mean,len,np.sum]})
+    df_probas.columns = ['PctGreen','NumObs','NumGreen']
+
+    roc_auc_score_all = roc_auc_score(res1['True'].astype(int), res1['Predicted'].values)
+    precision_score_all = precision_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
+    recall_score_all = recall_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
+    len_all = len(res1)
+
+    res2_filtered = res1.loc[(res1['Predicted'] > 0.6) | (res1['Predicted'] <= 0.4)]
+
+    roc_auc_score_hi = roc_auc_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'].values)
+    precision_score_hi = precision_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
+    recall_score_hi = recall_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
+    len_hi = len(res2_filtered)
+
+    df_performance = pd.DataFrame(
+        index=[
+            'N',
+            'ROC AUC',
+            'Precision',
+            'Recall'
+        ],
+        columns = [
+            'All',
+            'High Confidence'
+        ],
+        data = [
+            [len_all, len_hi],
+            [roc_auc_score_all, roc_auc_score_hi],
+            [precision_score_all, precision_score_hi],
+            [recall_score_all, recall_score_hi]
+        ]
+    ).round(2)
+
+    def get_acc(t, p):
+        if t == False and p <= 0.4:
+            return '✅'
+        elif t == True and p > 0.6:
+            return '✅'
+        elif t == False and p > 0.6:
+            return '❌'
+        elif t == True and p <= 0.4:
+            return '❌'
+        else:
+            return '🟨'
+
+    perf_daily = res1.copy()
+    perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
+
+    tab1.subheader(f'Pred for {curr_date} as of 7:30AM PST')
+    tab1.write(results)
+    tab1.write(df_probas)
+
+    tab2.subheader('Latest Data for Pred')
+    tab2.write(new_pred)
+
+    tab3.subheader('Historical Data')
+    tab3.write(df_final)
+
+    tab4.subheader('Performance')
+    tab4.write(df_performance)
+    tab4.write(perf_daily)
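
Each button above drives the same two-stage pipeline: a walk-forward LinearRegression on the continuous `Target`, whose sign is fed in as the `RegrModelOut` feature for a walk-forward `XGBClassifier` on `Target_clf`, with `seq_predict_proba` replaying that chain on the newest feature row. A minimal sketch of that flow on synthetic data, assuming the `walk_forward_validation_seq` and `seq_predict_proba` definitions from the removed app.py code above are in scope (the model_*.py files in this commit carry the same functions); the toy frame and every value in it are illustrative only:

```python
import numpy as np
import pandas as pd

# Toy daily frame using the column names the pipeline expects; values are random.
rng = np.random.default_rng(0)
n = 300
toy = pd.DataFrame({
    'CurrentGap': rng.normal(0, 0.005, n),             # overnight gap feature
    'Target': rng.normal(0, 0.01, n),                  # regression target (next-day OHLC4 move)
    'Target_clf': rng.integers(0, 2, n).astype(bool),  # classification target (green day)
}, index=pd.bdate_range('2022-01-03', periods=n))

# Stage 1 (LinearRegression on 'Target') feeds stage 2 (XGBClassifier on 'Target_clf');
# res holds the out-of-sample classifier probabilities used for the history tables.
res, reg_model, clf_model = walk_forward_validation_seq(toy, 'Target_clf', 'Target', 100, 1)

# Score the newest feature row the way the buttons do after training.
latest = toy.drop(columns=['Target', 'Target_clf']).tail(1)
p_green = seq_predict_proba(latest, reg_model, clf_model)[0]
print(f'P(green) = {p_green:.1%}, do_not_play = {0.4 < p_green <= 0.6}')
```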

model_1h.py ADDED
@@ -0,0 +1,399 @@
+import streamlit as st
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import json
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+import xgboost as xgb
+from tqdm import tqdm
+from sklearn import linear_model
+import joblib
+import os
+from sklearn.metrics import roc_auc_score, precision_score, recall_score
+import datetime
+from pandas.tseries.offsets import BDay
+from datasets import load_dataset
+
+def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+
+    # Create the regression model (XGBRegressor alternative kept commented out)
+    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
+    model = linear_model.LinearRegression()
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column, axis=1).iloc[:i]
+        y_train = df[target_column].iloc[:i]
+        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model.predict(X_test)
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model.save_model('model_lr.bin')
+    # Return the true and predicted values, and fitted model
+    return df_results, model
+
+def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+
+    # Run the regression model first to get its output
+    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+    # joblib.dump(model1, 'model1.bin')
+
+    # Merge the result df back on the df for feeding into the classifier
+    for_merge = res[['Predicted']]
+    for_merge.columns = ['RegrModelOut']
+    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+    df = df.merge(for_merge, left_index=True, right_index=True)
+    df = df.drop(columns=[target_column_regr])
+    df = df[[
+        'CurrentGap','RegrModelOut',
+        'CurrentHigh30toClose',
+        'CurrentLow30toClose',
+        'CurrentClose30toClose',
+        'CurrentRange30',
+        'GapFill30',target_column_clf
+    ]]
+
+    df[target_column_clf] = df[target_column_clf].astype(bool)
+    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+    # Create an XGBClassifier model
+    model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
+    # model = linear_model.LogisticRegression(max_iter=1500)
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), 'CLF Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+        y_train = df[target_column_clf].iloc[:i]
+        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column_clf].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model2.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model2.predict_proba(X_test)[:,-1]
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model1.save_model('model_ensemble.bin')
+    # joblib.dump(model2, 'model2.bin')
+    # Return the true and predicted values, and fitted models
+    return df_results, model1, model2
+
+def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+    regr_pred = trained_reg_model.predict(df)
+    regr_pred = regr_pred > 0
+    new_df = df.copy()
+    new_df['RegrModelOut'] = regr_pred
+    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut',
+                                                             'CurrentHigh30toClose',
+                                                             'CurrentLow30toClose',
+                                                             'CurrentClose30toClose',
+                                                             'CurrentRange30',
+                                                             'GapFill30']])[:,-1]
+    return clf_pred_proba
+
+def get_data():
+    # f = open('settings.json')
+    # j = json.load(f)
+    # API_KEY_FRED = j["API_KEY_FRED"]
+
+    API_KEY_FRED = os.getenv('API_KEY_FRED')
+
+    def parse_release_dates(release_id: str) -> List[str]:
+        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(release_dates_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        dates = []
+        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
+            dates.append(release_date_tag.text)
+        return dates
+
+    def parse_release_dates_obs(series_id: str) -> List[str]:
+        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(obs_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        observations = []
+        for observation_tag in soup.find_all('observation'):
+            date = observation_tag.get('date')
+            value = observation_tag.get('value')
+            observations.append((date, value))
+        return observations
+
+    econ_dfs = {}
+
+    econ_tickers = [
+        'WALCL',
+        'NFCI',
+        'WRESBAL'
+    ]
+
+    for et in tqdm(econ_tickers, desc='getting econ tickers'):
+        # p = parse_release_dates_obs(et)
+        # df = pd.DataFrame(columns = ['ds',et], data = p)
+        df = pdr.get_data_fred(et)
+        df.index = df.index.rename('ds')
+        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
+        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
+        econ_dfs[et] = df
+
+    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
+    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
+
+    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
+    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
+
+    release_ids = [
+        "10",   # "Consumer Price Index"
+        "46",   # "Producer Price Index"
+        "50",   # "Employment Situation"
+        "53",   # "Gross Domestic Product"
+        "103",  # "Discount Rate Meeting Minutes"
+        "180",  # "Unemployment Insurance Weekly Claims Report"
+        "194",  # "ADP National Employment Report"
+        "323"   # "Trimmed Mean PCE Inflation Rate"
+    ]
+
+    release_names = [
+        "CPI",
+        "PPI",
+        "NFP",
+        "GDP",
+        "FOMC",
+        "UNEMP",
+        "ADP",
+        "PCE"
+    ]
+
+    releases = {}
+
+    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
+        releases[rid] = {}
+        releases[rid]['dates'] = parse_release_dates(rid)
+        releases[rid]['name'] = n
+
+    # Create a DF that has all dates with the name of the col as 1
+    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
+    # This column serves as the true/false indicator of whether there was economic data released that day.
+    for rid in tqdm(release_ids, desc='Making indicators'):
+        releases[rid]['df'] = pd.DataFrame(
+            index=releases[rid]['dates'],
+            data={
+                releases[rid]['name']: 1
+            })
+        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
+        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
+        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
+
+    vix = yf.Ticker('^VIX')
+    spx = yf.Ticker('^GSPC')
+
+    # Pull in data
+    data = load_dataset("boomsss/SPX_full_30min", split='train')
+
+    rows = [d['text'] for d in data]
+    rows = [x.split(',') for x in rows]
+
+    fr = pd.DataFrame(columns=[
+        'Datetime','Open','High','Low','Close'
+    ], data = rows)
+
+    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
+    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
+    fr = fr.set_index('Datetime')
+    fr['Open'] = pd.to_numeric(fr['Open'])
+    fr['High'] = pd.to_numeric(fr['High'])
+    fr['Low'] = pd.to_numeric(fr['Low'])
+    fr['Close'] = pd.to_numeric(fr['Close'])
+
+    # Get incremental date
+    last_date = fr.index.date[-1]
+    last_date = last_date + datetime.timedelta(days=1)
+    # Get incremental data
+    spx1 = yf.Ticker('^GSPC')
+    yfp = spx1.history(start=last_date, interval='60m')
+    # Concat current and incremental
+    df_30m = pd.concat([fr, yfp])
+    # Keep the first two 30-minute bars of each session
+    df_30m = df_30m.reset_index()
+    df_30m['Datetime'] = df_30m['Datetime'].dt.date
+    df_30m = df_30m.groupby('Datetime').head(2)
+    df_30m = df_30m.set_index('Datetime',drop=True)
+    # Keep only the OHLC columns
+    df_30m = df_30m[['Open','High','Low','Close']]
+
+    opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
+    closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
+    highs_1h = df_30m.groupby('Datetime')['High'].max()
+    lows_1h = df_30m.groupby('Datetime')['Low'].min()
+
+    df_1h = pd.DataFrame(index=df_30m.index.unique())
+    df_1h['Open'] = opens_1h
+    df_1h['Close'] = closes_1h
+    df_1h['High'] = highs_1h
+    df_1h['Low'] = lows_1h
+
+    df_1h.columns = ['Open30','High30','Low30','Close30']
+
+    prices_vix = vix.history(start='2018-07-01', interval='1d')
+    prices_spx = spx.history(start='2018-07-01', interval='1d')
+    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+    prices_spx.index = prices_spx['index']
+    prices_spx = prices_spx.drop(columns='index')
+    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+    prices_vix.index = prices_vix['index']
+    prices_vix = prices_vix.drop(columns='index')
+    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+    data = prices_spx.merge(df_1h, left_index=True, right_index=True)
+    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+
+    # Features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['DayOfWeek'] = data['DayOfWeek'].dt.day
+
+    # Intraday features
+    data['CurrentHigh30'] = data['High30'].shift(-1)
+    data['CurrentLow30'] = data['Low30'].shift(-1)
+    data['CurrentClose30'] = data['Close30'].shift(-1)
+
+    # First-hour levels relative to the prior close
+    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+
+    # Regression target -- the next day's OHLC4 relative to today's close
+    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+    data['Target'] = data['Target'].shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Target for clf -- whether tomorrow will close above or below today's close
+    data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['Target_clf'] = data['Target_clf'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+    for rid in tqdm(release_ids, desc='Merging econ data'):
+        # Get the name of the release
+        n = releases[rid]['name']
+        # Merge the corresponding DF of the release
+        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
+        # Create a column that shifts the value in the merged column up by 1
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    consec_green = cumul_sum(data['GreenDay'].values)
+    consec_red = cumul_sum(data['RedDay'].values)
+
+    data['DaysGreen'] = consec_green
+    data['DaysRed'] = consec_red
+
+    final_row = data.index[-2]
+
+    exp_row = data.index[-1]
+
+    df_final = data.loc[:final_row,
+        [
+            'BigNewsDay',
+            'Quarter',
+            'Perf5Day',
+            'Perf5Day_n1',
+            'DaysGreen',
+            'DaysRed',
+            'CurrentHigh30toClose',
+            'CurrentLow30toClose',
+            'CurrentClose30toClose',
+            'CurrentRange30',
+            'GapFill30',
+            # 'OHLC4_Trend',
+            # 'OHLC4_Trend_n1',
+            # 'OHLC4_Trend_n2',
+            # 'VIX5Day',
+            # 'VIX5Day_n1',
+            'CurrentGap',
+            'RangePct',
+            'RangePct_n1',
+            'RangePct_n2',
+            'OHLC4_VIX',
+            'OHLC4_VIX_n1',
+            'OHLC4_VIX_n2',
+            'Target',
+            'Target_clf'
+        ]]
+    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row
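
get_data() above turns the 30-minute dataset into one first-hour bar per session: keep the first two 30-minute bars of each date, then take the first bar's open, the second bar's close, and the max/min across both. A self-contained sketch of just that aggregation, with made-up bars:

```python
import pandas as pd

# Two sessions of 30-minute bars; all values are made up.
bars = pd.DataFrame({
    'Datetime': pd.to_datetime([
        '2023-06-01 09:30', '2023-06-01 10:00', '2023-06-01 10:30',
        '2023-06-02 09:30', '2023-06-02 10:00',
    ]),
    'Open':  [4200.0, 4205.0, 4210.0, 4220.0, 4215.0],
    'High':  [4208.0, 4212.0, 4215.0, 4225.0, 4219.0],
    'Low':   [4195.0, 4202.0, 4207.0, 4214.0, 4210.0],
    'Close': [4205.0, 4210.0, 4212.0, 4215.0, 4218.0],
})

bars['Datetime'] = bars['Datetime'].dt.date
first_hour = bars.groupby('Datetime').head(2).set_index('Datetime')

df_1h = pd.DataFrame(index=first_hour.index.unique())
df_1h['Open30']  = first_hour.groupby('Datetime')['Open'].head(1)   # first bar's open
df_1h['Close30'] = first_hour.groupby('Datetime')['Close'].tail(1)  # second bar's close
df_1h['High30']  = first_hour.groupby('Datetime')['High'].max()     # high across both bars
df_1h['Low30']   = first_hour.groupby('Datetime')['Low'].min()      # low across both bars
print(df_1h)
```

The sketch names the columns directly. In the file above, the columns are created in the order Open, Close, High, Low and then renamed with `df_1h.columns = ['Open30','High30','Low30','Close30']`, which appears to label the first-hour close as `High30` and the high as `Low30`; worth double-checking before relying on those features.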

model_30m.py ADDED
@@ -0,0 +1,387 @@
+import streamlit as st
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import json
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+import xgboost as xgb
+from tqdm import tqdm
+from sklearn import linear_model
+import joblib
+import os
+from sklearn.metrics import roc_auc_score, precision_score, recall_score
+import datetime
+from pandas.tseries.offsets import BDay
+from datasets import load_dataset
+
+# If the dataset is gated/private, make sure you have run huggingface-cli login
+def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+
+    # Create a linear regression model (XGBRegressor kept for reference)
+    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
+    model = linear_model.LinearRegression()
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column, axis=1).iloc[:i]
+        y_train = df[target_column].iloc[:i]
+        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model.predict(X_test)
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model.save_model('model_lr.bin')
+    # Return the true and predicted values, and fitted model
+    return df_results, model
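Each iteration trains on an expanding window (all rows before i) and scores the next num_periods rows, so no test row ever precedes its training data. A toy illustration of the split indices this loop generates for a 5-row frame with num_training_rows=3 and num_periods=1:

# Illustrative only: expanding-window splits for a 5-row frame
for i in range(3, 5 - 1 + 1):
    train_idx = list(range(0, i))       # [0, 1, 2], then [0, 1, 2, 3]
    test_idx = list(range(i, i + 1))    # [3], then [4]
    print(train_idx, '->', test_idx)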
+
+def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+
+    # First run the regression model to get its output
+    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+    # joblib.dump(model1, 'model1.bin')
+
+    # Merge the result df back onto the df for feeding into the classifier
+    for_merge = res[['Predicted']]
+    for_merge.columns = ['RegrModelOut']
+    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+    df = df.merge(for_merge, left_index=True, right_index=True)
+    df = df.drop(columns=[target_column_regr])
+    df = df[[
+        'CurrentGap','RegrModelOut','CurrentHigh30toClose',
+        'CurrentLow30toClose',
+        'CurrentClose30toClose',
+        'CurrentRange30',
+        'GapFill30', target_column_clf
+    ]]
+
+    df[target_column_clf] = df[target_column_clf].astype(bool)
+    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+    # Create an XGBClassifier model
+    model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
+    # model = linear_model.LogisticRegression(max_iter=1500)
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+        y_train = df[target_column_clf].iloc[:i]
+        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column_clf].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model2.fit(X_train, y_train)
+
+        # Predict the probability of the positive class on the test data
+        predictions = model2.predict_proba(X_test)[:,-1]
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    return df_results, model1, model2
+
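One pandas caveat in the handoff above: for_merge is derived from a selection on res, so the subsequent assignments can trigger SettingWithCopyWarning. A defensive copy, shown here as an optional variant rather than part of the file, makes the intent explicit:

for_merge = res[['Predicted']].copy()  # explicit copy silences SettingWithCopyWarning
for_merge.columns = ['RegrModelOut']
for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0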
+
+def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+    regr_pred = trained_reg_model.predict(df)
+    regr_pred = regr_pred > 0
+    new_df = df.copy()
+    new_df['RegrModelOut'] = regr_pred
+    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut','CurrentHigh30toClose',
+                                                             'CurrentLow30toClose',
+                                                             'CurrentClose30toClose',
+                                                             'CurrentRange30',
+                                                             'GapFill30']])[:,-1]
+    return clf_pred_proba
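At inference time the same two-stage handoff is replayed on a single feature row via seq_predict_proba. A hedged sketch of the call, assuming data, df_final, and final_row come from get_data and model1/model2 from walk_forward_validation_seq; the row selection is illustrative:

feature_cols = [c for c in df_final.columns if c not in ('Target', 'Target_clf')]
live_X = data.loc[[final_row], feature_cols]
prob_green = seq_predict_proba(live_X, model1, model2)
print(f"P(next close > prior close) = {prob_green[0]:.2%}")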
+
+def get_data():
+    # f = open('settings.json')
+    # j = json.load(f)
+    # API_KEY_FRED = j["API_KEY_FRED"]
+
+    API_KEY_FRED = os.getenv('API_KEY_FRED')
+
+    def parse_release_dates(release_id: str) -> List[str]:
+        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(release_dates_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        dates = []
+        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
+            dates.append(release_date_tag.text)
+        return dates
+
+    def parse_release_dates_obs(series_id: str) -> List[str]:
+        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(obs_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        observations = []
+        for observation_tag in soup.find_all('observation'):
+            date = observation_tag.get('date')
+            value = observation_tag.get('value')
+            observations.append((date, value))
+        return observations
+
+    econ_dfs = {}
+
+    econ_tickers = [
+        'WALCL',
+        'NFCI',
+        'WRESBAL'
+    ]
+
+    for et in tqdm(econ_tickers, desc='getting econ tickers'):
+        # p = parse_release_dates_obs(et)
+        # df = pd.DataFrame(columns = ['ds',et], data = p)
+        df = pdr.get_data_fred(et)
+        df.index = df.index.rename('ds')
+        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
+        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
+        econ_dfs[et] = df
+
+    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
+    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
+
+    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
+    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
+
+    release_ids = [
+        "10",   # "Consumer Price Index"
+        "46",   # "Producer Price Index"
+        "50",   # "Employment Situation"
+        "53",   # "Gross Domestic Product"
+        "103",  # "Discount Rate Meeting Minutes"
+        "180",  # "Unemployment Insurance Weekly Claims Report"
+        "194",  # "ADP National Employment Report"
+        "323"   # "Trimmed Mean PCE Inflation Rate"
+    ]
+
+    release_names = [
+        "CPI",
+        "PPI",
+        "NFP",
+        "GDP",
+        "FOMC",
+        "UNEMP",
+        "ADP",
+        "PCE"
+    ]
+
+    releases = {}
+
+    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
+        releases[rid] = {}
+        releases[rid]['dates'] = parse_release_dates(rid)
+        releases[rid]['name'] = n
+
+    # Create a DF per release that holds all its dates, with the release name as a column of 1s.
+    # Once merged onto the main dataframe, days with econ events will be 1 or NaN; fill NA with 0.
+    # This column serves as the true/false indicator of whether there was economic data released that day.
+    for rid in tqdm(release_ids, desc='Making indicators'):
+        releases[rid]['df'] = pd.DataFrame(
+            index=releases[rid]['dates'],
+            data={
+                releases[rid]['name']: 1
+            })
+        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
+        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
+        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
+
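For context, the FRED release/dates endpoint returns XML along the lines below, which is why the parser simply collects release_date tags. The snippet is a hand-written illustration, not captured API output, and assumes the XML parser bs4 delegates to (lxml) is installed:

from bs4 import BeautifulSoup

sample = '''<release_dates>
  <release_date release_id="10">2023-01-12</release_date>
  <release_date release_id="10">2023-02-14</release_date>
</release_dates>'''
soup = BeautifulSoup(sample, 'xml')
print([t.text for t in soup.find_all('release_date', {'release_id': '10'})])
# ['2023-01-12', '2023-02-14']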
+    vix = yf.Ticker('^VIX')
+    spx = yf.Ticker('^GSPC')
+
+    # Pull in the full 30-minute history from the HF dataset
+    data = load_dataset("boomsss/SPX_full_30min", split='train')
+
+    rows = [d['text'] for d in data]
+    rows = [x.split(',') for x in rows]
+
+    fr = pd.DataFrame(columns=[
+        'Datetime','Open','High','Low','Close'
+    ], data = rows)
+
+    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
+    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
+    fr = fr.set_index('Datetime')
+    fr['Open'] = pd.to_numeric(fr['Open'])
+    fr['High'] = pd.to_numeric(fr['High'])
+    fr['Low'] = pd.to_numeric(fr['Low'])
+    fr['Close'] = pd.to_numeric(fr['Close'])
+
+    # Get the first date not covered by the dataset
+    last_date = fr.index.date[-1]
+    last_date = last_date + datetime.timedelta(days=1)
+    # Get incremental data from yfinance
+    spx1 = yf.Ticker('^GSPC')
+    yfp = spx1.history(start=last_date, interval='30m')
+    # Concat current and incremental
+    df_30m = pd.concat([fr, yfp])
+    # Keep only the first 30-minute bar of each day
+    df_30m = df_30m.reset_index()
+    df_30m['Datetime'] = df_30m['Datetime'].dt.date
+    df_30m = df_30m.groupby('Datetime').head(1)
+    df_30m = df_30m.set_index('Datetime',drop=True)
+    # Keep OHLC and rename the columns
+    df_30m = df_30m[['Open','High','Low','Close']]
+    df_30m.columns = ['Open30','High30','Low30','Close30']
+
+    prices_vix = vix.history(start='2018-07-01', interval='1d')
+    prices_spx = spx.history(start='2018-07-01', interval='1d')
+    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+    prices_spx.index = prices_spx['index']
+    prices_spx = prices_spx.drop(columns='index')
+    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+    prices_vix.index = prices_vix['index']
+    prices_vix = prices_vix.drop(columns='index')
+    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+    data = prices_spx.merge(df_30m, left_index=True, right_index=True)
+    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+
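The groupby(...).head(1) step does the heavy lifting: once each timestamp is truncated to its date, the first row per group is the 9:30-10:00 ET bar. A toy check of the idiom on invented bars:

import pandas as pd

bars = pd.DataFrame({
    'Datetime': pd.to_datetime(['2023-05-01 09:30', '2023-05-01 10:00',
                                '2023-05-02 09:30', '2023-05-02 10:00']),
    'Close': [4100.0, 4105.0, 4110.0, 4095.0],
})
bars['Datetime'] = bars['Datetime'].dt.date
print(bars.groupby('Datetime').head(1))  # one row per day: the 09:30 bar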
+    # Features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['DayOfWeek'] = data['DayOfWeek'].dt.day
+
+    # Intraday features -- shifted up one row so each row carries the next day's first-30-minute bar
+    data['CurrentHigh30'] = data['High30'].shift(-1)
+    data['CurrentLow30'] = data['Low30'].shift(-1)
+    data['CurrentClose30'] = data['Close30'].shift(-1)
+
+    # First-30-minute high/low/close relative to the prior close
+    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+
+    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+
+    # Target -- the next day's OHLC4 as a % change from the prior close
+    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+    data['Target'] = data['Target'].shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Target for clf -- whether tomorrow will close above or below today's close
+    data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['Target_clf'] = data['Target_clf'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
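All of the Current* columns are shift(-1) views, so row t carries what will be observable by the end of day t+1's first half hour, which is the information available when the still-forming row is scored. A small alignment sketch with invented values:

import pandas as pd

df = pd.DataFrame({'Close': [100.0, 101.0, 99.0],
                   'High30': [100.5, 101.8, 99.9]},
                  index=pd.to_datetime(['2023-05-01', '2023-05-02', '2023-05-03']))
# The row for May 1 now holds May 2's first-30-minute high
df['CurrentHigh30'] = df['High30'].shift(-1)
print(df)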
+    for rid in tqdm(release_ids, desc='Merging econ data'):
+        # Get the name of the release
+        n = releases[rid]['name']
+        # Merge the corresponding DF of the release
+        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
+        # Create a column that shifts the value in the merged column up by 1
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        # Running count of consecutive 1s, reset to 0 whenever a 0 appears
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    consec_green = cumul_sum(data['GreenDay'].values)
+    consec_red = cumul_sum(data['RedDay'].values)
+
+    data['DaysGreen'] = consec_green
+    data['DaysRed'] = consec_red
+
+    # Last fully-known row; the final row is the still-forming (expected) one
+    final_row = data.index[-2]
+    exp_row = data.index[-1]
+
+    df_final = data.loc[:final_row, [
+        'BigNewsDay',
+        'Quarter',
+        'Perf5Day',
+        'Perf5Day_n1',
+        'DaysGreen',
+        'DaysRed',
+        'CurrentHigh30toClose',
+        'CurrentLow30toClose',
+        'CurrentClose30toClose',
+        'CurrentRange30',
+        'GapFill30',
+        # 'OHLC4_Trend',
+        # 'OHLC4_Trend_n1',
+        # 'OHLC4_Trend_n2',
+        # 'VIX5Day',
+        # 'VIX5Day_n1',
+        'CurrentGap',
+        'RangePct',
+        'RangePct_n1',
+        'RangePct_n2',
+        'OHLC4_VIX',
+        'OHLC4_VIX_n1',
+        'OHLC4_VIX_n2',
+        'Target',
+        'Target_clf'
+    ]]
+    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row
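Because each release's _shift column is already 0/1, the row-wise max collapses them into a single any-major-release-tomorrow flag. A compact check with invented flags:

import pandas as pd

flags = pd.DataFrame({'CPI_shift': [0, 1, 0], 'FOMC_shift': [0, 0, 1]})
flags['BigNewsDay'] = flags[[c for c in flags.columns if '_shift' in c]].max(axis=1)
print(flags['BigNewsDay'].tolist())  # [0, 1, 1]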
model_day.py
ADDED
@@ -0,0 +1,323 @@
+import streamlit as st
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import json
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+import xgboost as xgb
+from tqdm import tqdm
+from sklearn import linear_model
+import joblib
+import os
+from sklearn.metrics import roc_auc_score, precision_score, recall_score
+import datetime
+from pandas.tseries.offsets import BDay
+from datasets import load_dataset
+
+# If the dataset is gated/private, make sure you have run huggingface-cli login
+dataset = load_dataset("boomsss/SPX_full_30min", split="train")
+
+def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+
+    # Create a linear regression model (XGBRegressor kept for reference)
+    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
+    model = linear_model.LinearRegression()
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column, axis=1).iloc[:i]
+        y_train = df[target_column].iloc[:i]
+        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model.predict(X_test)
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model.save_model('model_lr.bin')
+    # Return the true and predicted values, and fitted model
+    return df_results, model
+
+def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+
+    # First run the regression model to get its output
+    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+    # joblib.dump(model1, 'model1.bin')
+
+    # Merge the result df back onto the df for feeding into the classifier
+    for_merge = res[['Predicted']]
+    for_merge.columns = ['RegrModelOut']
+    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+    df = df.merge(for_merge, left_index=True, right_index=True)
+    df = df.drop(columns=[target_column_regr])
+    df = df[[
+        'CurrentGap','RegrModelOut',target_column_clf
+    ]]
+
+    df[target_column_clf] = df[target_column_clf].astype(bool)
+    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+    # Create an XGBClassifier model
+    model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
+    # model = linear_model.LogisticRegression(max_iter=1500)
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+        y_train = df[target_column_clf].iloc[:i]
+        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column_clf].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model2.fit(X_train, y_train)
+
+        # Predict the probability of the positive class on the test data
+        predictions = model2.predict_proba(X_test)[:,-1]
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model1.save_model('model_ensemble.bin')
+    # joblib.dump(model2, 'model2.bin')
+    # Return the true and predicted values, and fitted models
+    return df_results, model1, model2
+
+def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+    regr_pred = trained_reg_model.predict(df)
+    regr_pred = regr_pred > 0
+    new_df = df.copy()
+    new_df['RegrModelOut'] = regr_pred
+    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut']])[:,-1]
+    return clf_pred_proba
+
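Worth noting when comparing the three files: this daily variant is the leanest, with the classifier stage seeing only CurrentGap and RegrModelOut, while the 30-minute and 1-hour variants extend the same handoff with the intraday Current*30 columns.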
+def get_data():
+    # f = open('settings.json')
+    # j = json.load(f)
+    # API_KEY_FRED = j["API_KEY_FRED"]
+
+    API_KEY_FRED = os.getenv('API_KEY_FRED')
+
+    def parse_release_dates(release_id: str) -> List[str]:
+        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(release_dates_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        dates = []
+        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
+            dates.append(release_date_tag.text)
+        return dates
+
+    def parse_release_dates_obs(series_id: str) -> List[str]:
+        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(obs_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        observations = []
+        for observation_tag in soup.find_all('observation'):
+            date = observation_tag.get('date')
+            value = observation_tag.get('value')
+            observations.append((date, value))
+        return observations
+
+    econ_dfs = {}
+
+    econ_tickers = [
+        'WALCL',
+        'NFCI',
+        'WRESBAL'
+    ]
+
+    for et in tqdm(econ_tickers, desc='getting econ tickers'):
+        # p = parse_release_dates_obs(et)
+        # df = pd.DataFrame(columns = ['ds',et], data = p)
+        df = pdr.get_data_fred(et)
+        df.index = df.index.rename('ds')
+        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
+        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
+        econ_dfs[et] = df
+
+    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
+    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
+
+    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
+    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
+
+    release_ids = [
+        "10",   # "Consumer Price Index"
+        "46",   # "Producer Price Index"
+        "50",   # "Employment Situation"
+        "53",   # "Gross Domestic Product"
+        "103",  # "Discount Rate Meeting Minutes"
+        "180",  # "Unemployment Insurance Weekly Claims Report"
+        "194",  # "ADP National Employment Report"
+        "323"   # "Trimmed Mean PCE Inflation Rate"
+    ]
+
+    release_names = [
+        "CPI",
+        "PPI",
+        "NFP",
+        "GDP",
+        "FOMC",
+        "UNEMP",
+        "ADP",
+        "PCE"
+    ]
+
+    releases = {}
+
+    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
+        releases[rid] = {}
+        releases[rid]['dates'] = parse_release_dates(rid)
+        releases[rid]['name'] = n
+
+    # Create a DF per release that holds all its dates, with the release name as a column of 1s.
+    # Once merged onto the main dataframe, days with econ events will be 1 or NaN; fill NA with 0.
+    # This column serves as the true/false indicator of whether there was economic data released that day.
+    for rid in tqdm(release_ids, desc='Making indicators'):
+        releases[rid]['df'] = pd.DataFrame(
+            index=releases[rid]['dates'],
+            data={
+                releases[rid]['name']: 1
+            })
+        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
+        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
+        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
+
+    vix = yf.Ticker('^VIX')
+    spx = yf.Ticker('^GSPC')
+
+    prices_vix = vix.history(start='2018-07-01', interval='1d')
+    prices_spx = spx.history(start='2018-07-01', interval='1d')
+    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+    prices_spx.index = prices_spx['index']
+    prices_spx = prices_spx.drop(columns='index')
+
+    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+    prices_vix.index = prices_vix['index']
+    prices_vix = prices_vix.drop(columns='index')
+
+    data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+    data.index = pd.DatetimeIndex(data.index)
+
+    # Features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['DayOfWeek'] = data['DayOfWeek'].dt.day
+
+    # Target -- the next day's OHLC4 as a % change from the prior close
+    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+    data['Target'] = data['Target'].shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Target for clf -- whether tomorrow will close above or below today's close
+    data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['Target_clf'] = data['Target_clf'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
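Both targets are shifted up one row, so the features on row t line up with day t+1's outcome. A worked instance of the regression target with invented numbers:

import pandas as pd

s = pd.DataFrame({'PrevClose': [99.0, 100.0], 'OHLC4': [100.0, 101.5]})
s['Target'] = (s['OHLC4'] / s['PrevClose']) - 1   # 0.0101, then 0.015
s['Target'] = s['Target'].shift(-1)               # row 0 now holds row 1's move
print(s)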
+    for rid in tqdm(release_ids, desc='Merging econ data'):
+        # Get the name of the release
+        n = releases[rid]['name']
+        # Merge the corresponding DF of the release
+        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
+        # Create a column that shifts the value in the merged column up by 1
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        # Running count of consecutive 1s, reset to 0 whenever a 0 appears
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    consec_green = cumul_sum(data['GreenDay'].values)
+    consec_red = cumul_sum(data['RedDay'].values)
+
+    data['DaysGreen'] = consec_green
+    data['DaysRed'] = consec_red
+
+    # Last fully-known row; the final row is the still-forming (expected) one
+    final_row = data.index[-2]
+    exp_row = data.index[-1]
+
+    df_final = data.loc[:final_row, [
+        'BigNewsDay',
+        'Quarter',
+        'Perf5Day',
+        'Perf5Day_n1',
+        'DaysGreen',
+        'DaysRed',
+        # 'OHLC4_Trend',
+        # 'OHLC4_Trend_n1',
+        # 'OHLC4_Trend_n2',
+        # 'VIX5Day',
+        # 'VIX5Day_n1',
+        'CurrentGap',
+        'RangePct',
+        'RangePct_n1',
+        'RangePct_n2',
+        'OHLC4_VIX',
+        'OHLC4_VIX_n1',
+        'OHLC4_VIX_n2',
+        'Target',
+        'Target_clf'
+    ]]
+    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row