wnstnb committed on
Commit 2647e65 · 1 parent: 7fe1620

adding intraday models

Files changed (4)
  1. app.py +382 -347
  2. model_1h.py +399 -0
  3. model_30m.py +387 -0
  4. model_day.py +323 -0
app.py CHANGED
@@ -1,326 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import json
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from typing import List
10
- import xgboost as xgb
11
- from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
- import os
15
  from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
- import datetime
17
  from pandas.tseries.offsets import BDay
18
- from datasets import load_dataset
19
-
20
- # If the dataset is gated/private, make sure you have run huggingface-cli login
21
- dataset = load_dataset("boomsss/SPX_full_30min", split="train")
22
-
23
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
24
-
25
- # Create the regression model (the XGBRegressor alternative is commented out below)
26
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
27
- model = linear_model.LinearRegression()
28
-
29
- overall_results = []
30
- # Iterate over the rows in the DataFrame, one step at a time
31
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
32
- # Split the data into training and test sets
33
- X_train = df.drop(target_column, axis=1).iloc[:i]
34
- y_train = df[target_column].iloc[:i]
35
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
36
- y_test = df[target_column].iloc[i:i+num_periods]
37
-
38
- # Fit the model to the training data
39
- model.fit(X_train, y_train)
40
-
41
- # Make a prediction on the test data
42
- predictions = model.predict(X_test)
43
-
44
- # Create a DataFrame to store the true and predicted values
45
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
46
-
47
- overall_results.append(result_df)
48
-
49
- df_results = pd.concat(overall_results)
50
- # model.save_model('model_lr.bin')
51
- # Return the true and predicted values, and fitted model
52
- return df_results, model
53
-
54
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
55
-
56
- # Run the regression model to get its output
57
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
58
- # joblib.dump(model1, 'model1.bin')
59
-
60
- # Merge the result df back on the df for feeding into the classifier
61
- for_merge = res[['Predicted']]
62
- for_merge.columns = ['RegrModelOut']
63
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
64
- df = df.merge(for_merge, left_index=True, right_index=True)
65
- df = df.drop(columns=[target_column_regr])
66
- df = df[[
67
- 'CurrentGap','RegrModelOut',target_column_clf
68
- ]]
69
-
70
- df[target_column_clf] = df[target_column_clf].astype(bool)
71
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
72
-
73
- # Create an XGBClassifier model
74
- model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
75
- # model = linear_model.LogisticRegression(max_iter=1500)
76
-
77
- overall_results = []
78
- # Iterate over the rows in the DataFrame, one step at a time
79
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
80
- # Split the data into training and test sets
81
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
82
- y_train = df[target_column_clf].iloc[:i]
83
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
84
- y_test = df[target_column_clf].iloc[i:i+num_periods]
85
-
86
- # Fit the model to the training data
87
- model2.fit(X_train, y_train)
88
-
89
- # Make a prediction on the test data
90
- predictions = model2.predict_proba(X_test)[:,-1]
91
-
92
- # Create a DataFrame to store the true and predicted values
93
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
94
-
95
- overall_results.append(result_df)
96
-
97
- df_results = pd.concat(overall_results)
98
- # model1.save_model('model_ensemble.bin')
99
- # joblib.dump(model2, 'model2.bin')
100
- # Return the true and predicted values, and fitted model
101
- return df_results, model1, model2
102
-
103
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
104
- regr_pred = trained_reg_model.predict(df)
105
- regr_pred = regr_pred > 0
106
- new_df = df.copy()
107
- new_df['RegrModelOut'] = regr_pred
108
- clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut']])[:,-1]
109
- return clf_pred_proba
110
-
111
- def get_data():
112
- # f = open('settings.json')
113
- # j = json.load(f)
114
- # API_KEY_FRED = j["API_KEY_FRED"]
115
-
116
- API_KEY_FRED = os.getenv('API_KEY_FRED')
117
-
118
- def parse_release_dates(release_id: str) -> List[str]:
119
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
120
- r = requests.get(release_dates_url)
121
- text = r.text
122
- soup = BeautifulSoup(text, 'xml')
123
- dates = []
124
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
125
- dates.append(release_date_tag.text)
126
- return dates
127
-
128
- def parse_release_dates_obs(series_id: str) -> List[str]:
129
- obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
130
- r = requests.get(obs_url)
131
- text = r.text
132
- soup = BeautifulSoup(text, 'xml')
133
- observations = []
134
- for observation_tag in soup.find_all('observation'):
135
- date = observation_tag.get('date')
136
- value = observation_tag.get('value')
137
- observations.append((date, value))
138
- return observations
139
-
140
- econ_dfs = {}
141
-
142
- econ_tickers = [
143
- 'WALCL',
144
- 'NFCI',
145
- 'WRESBAL'
146
- ]
147
-
148
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
149
- # p = parse_release_dates_obs(et)
150
- # df = pd.DataFrame(columns = ['ds',et], data = p)
151
- df = pdr.get_data_fred(et)
152
- df.index = df.index.rename('ds')
153
- # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
154
- # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
155
- econ_dfs[et] = df
156
-
157
- # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
158
- # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
159
-
160
- # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
161
- # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
162
-
163
- release_ids = [
164
- "10", # "Consumer Price Index"
165
- "46", # "Producer Price Index"
166
- "50", # "Employment Situation"
167
- "53", # "Gross Domestic Product"
168
- "103", # "Discount Rate Meeting Minutes"
169
- "180", # "Unemployment Insurance Weekly Claims Report"
170
- "194", # "ADP National Employment Report"
171
- "323" # "Trimmed Mean PCE Inflation Rate"
172
- ]
173
-
174
- release_names = [
175
- "CPI",
176
- "PPI",
177
- "NFP",
178
- "GDP",
179
- "FOMC",
180
- "UNEMP",
181
- "ADP",
182
- "PCE"
183
- ]
184
-
185
- releases = {}
186
-
187
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
188
- releases[rid] = {}
189
- releases[rid]['dates'] = parse_release_dates(rid)
190
- releases[rid]['name'] = n
191
-
192
- # Create a DF indexed by release dates, with the release name as a column of 1s
193
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
194
- # This column serves as the true/false indicator of whether there was economic data released that day.
195
- for rid in tqdm(release_ids, desc='Making indicators'):
196
- releases[rid]['df'] = pd.DataFrame(
197
- index=releases[rid]['dates'],
198
- data={
199
- releases[rid]['name']: 1
200
- })
201
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
202
- # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
203
- # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
204
-
205
- vix = yf.Ticker('^VIX')
206
- spx = yf.Ticker('^GSPC')
207
-
208
- prices_vix = vix.history(start='2018-07-01', interval='1d')
209
- prices_spx = spx.history(start='2018-07-01', interval='1d')
210
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
211
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
212
- prices_spx.index = prices_spx['index']
213
- prices_spx = prices_spx.drop(columns='index')
214
-
215
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
216
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
217
- prices_vix.index = prices_vix['index']
218
- prices_vix = prices_vix.drop(columns='index')
219
-
220
- data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
221
- data.index = pd.DatetimeIndex(data.index)
222
-
223
- # Features
224
- data['PrevClose'] = data['Close'].shift(1)
225
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
226
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
227
- data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
228
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
229
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
230
-
231
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
232
- data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
233
-
234
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
235
- data['RangePct'] = data['Range'] / data['Close']
236
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
237
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
238
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
239
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
240
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
241
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
242
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
243
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
244
- data['RangePct_n1'] = data['RangePct'].shift(1)
245
- data['RangePct_n2'] = data['RangePct'].shift(2)
246
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
247
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
248
- data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
249
- data['CurrentGap'] = data['CurrentGap'].shift(-1)
250
- data['DayOfWeek'] = pd.to_datetime(data.index)
251
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
252
-
253
- # Target -- the next day's OHLC4 return relative to the previous close
254
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
255
- data['Target'] = data['Target'].shift(-1)
256
- # data['Target'] = data['RangePct'].shift(-1)
257
-
258
- # Target for clf -- whether tomorrow will close above or below today's close
259
- data['Target_clf'] = data['Close'] > data['PrevClose']
260
- data['Target_clf'] = data['Target_clf'].shift(-1)
261
- data['DayOfWeek'] = pd.to_datetime(data.index)
262
- data['Quarter'] = data['DayOfWeek'].dt.quarter
263
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
264
-
265
- for rid in tqdm(release_ids, desc='Merging econ data'):
266
- # Get the name of the release
267
- n = releases[rid]['name']
268
- # Merge the corresponding DF of the release
269
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
270
- # Create a column that shifts the value in the merged column up by 1
271
- data[f'{n}_shift'] = data[n].shift(-1)
272
- # Fill the rest with zeroes
273
- data[n] = data[n].fillna(0)
274
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
275
-
276
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
277
-
278
- def cumul_sum(col):
279
- nums = []
280
- s = 0
281
- for x in col:
282
- if x == 1:
283
- s += 1
284
- elif x == 0:
285
- s = 0
286
- nums.append(s)
287
- return nums
288
-
289
- consec_green = cumul_sum(data['GreenDay'].values)
290
- consec_red = cumul_sum(data['RedDay'].values)
291
-
292
- data['DaysGreen'] = consec_green
293
- data['DaysRed'] = consec_red
294
-
295
- final_row = data.index[-2]
296
-
297
- exp_row = data.index[-1]
298
-
299
- df_final = data.loc[:final_row,
300
- [
301
- 'BigNewsDay',
302
- 'Quarter',
303
- 'Perf5Day',
304
- 'Perf5Day_n1',
305
- 'DaysGreen',
306
- 'DaysRed',
307
- # 'OHLC4_Trend',
308
- # 'OHLC4_Trend_n1',
309
- # 'OHLC4_Trend_n2',
310
- # 'VIX5Day',
311
- # 'VIX5Day_n1',
312
- 'CurrentGap',
313
- 'RangePct',
314
- 'RangePct_n1',
315
- 'RangePct_n2',
316
- 'OHLC4_VIX',
317
- 'OHLC4_VIX_n1',
318
- 'OHLC4_VIX_n2',
319
- 'Target',
320
- 'Target_clf'
321
- ]]
322
- df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
323
- return data, df_final, final_row
324
 
325
  st.set_page_config(
326
  page_title="Gameday Model for $SPX",
@@ -333,7 +14,8 @@ st.markdown('**PLEASE NOTE:** Model should be run at or after market open. Docum
333
  if st.button("🧹 Clear All"):
334
  st.cache_data.clear()
335
 
336
- if st.button('🤖 Run it'):
 
337
  with st.spinner('Loading data...'):
338
  data, df_final, final_row = get_data()
339
  # st.success("✅ Historical data")
@@ -354,11 +36,6 @@ if st.button('🤖 Run it'):
354
  'Perf5Day_n1',
355
  'DaysGreen',
356
  'DaysRed',
357
- # 'OHLC4_Trend',
358
- # 'OHLC4_Trend_n1',
359
- # 'OHLC4_Trend_n2',
360
- # 'VIX5Day',
361
- # 'VIX5Day_n1',
362
  'CurrentGap',
363
  'RangePct',
364
  'RangePct_n1',
@@ -379,11 +56,6 @@ if st.button('🤖 Run it'):
379
  new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
380
  new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
381
  new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
382
- # new_pred['OHLC4_Trend'] = new_pred['OHLC4_Trend'].astype(float)
383
- # new_pred['OHLC4_Trend_n1'] = new_pred['OHLC4_Trend_n1'].astype(float)
384
- # new_pred['OHLC4_Trend_n2'] = new_pred['OHLC4_Trend_n2'].astype(float)
385
- # new_pred['VIX5Day'] = new_pred['VIX5Day'].astype(bool)
386
- # new_pred['VIX5Day_n1'] = new_pred['VIX5Day_n1'].astype(bool)
387
  new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
388
  new_pred['RangePct'] = new_pred['RangePct'].astype(float)
389
  new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
@@ -396,12 +68,7 @@ if st.button('🤖 Run it'):
396
  tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
397
 
398
  seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
399
- # above_pct_green = res1.loc[res1['Predicted'] >= seq_proba, 'True'].mean()
400
- # len_above_pct_green = len(res1.loc[res1['Predicted'] >= seq_proba])
401
- # below_pct_red = 1 - res1.loc[res1['Predicted'] <= seq_proba, 'True'].mean()
402
- # len_below_pct_red = len(res1.loc[res1['Predicted'] <= seq_proba])
403
 
404
- # Calc green and red probas
405
  green_proba = seq_proba[0]
406
  red_proba = 1 - green_proba
407
  do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
@@ -514,8 +181,7 @@ if st.button('🤖 Run it'):
514
  perf_daily = res1.copy()
515
  perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
516
 
517
-
518
- tab1.subheader(f'Pred for {curr_date}')
519
  tab1.write(results)
520
  tab1.write(df_probas)
521
 
@@ -528,16 +194,385 @@ if st.button('🤖 Run it'):
528
  tab4.subheader('Performance')
529
  tab4.write(df_performance)
530
  tab4.write(perf_daily)
531
- tab4.write(dataset)
532
 
533
- # The only variable you can play with, since the other ones are historical
534
- # new_pred.loc[:,'CurrentGap'] = -0.01 / 100
535
- # new_pred.loc[:,'BigNewsDay'] = 0
536
 
537
- # st.subheader('Subset')
538
- # st.write(data.iloc[-1])
539
 
540
- # st.subheader('Number of pickups by hour')
541
- # hist_values = np.histogram(
542
- # data[DATE_COLUMN].dt.hour, bins=24, range=(0,24))[0]
543
- # st.bar_chart(hist_values)
1
  import streamlit as st
2
  import pandas as pd
3
  from sklearn.metrics import roc_auc_score, precision_score, recall_score
 
4
  from pandas.tseries.offsets import BDay
5
 
6
  st.set_page_config(
7
  page_title="Gameday Model for $SPX",
 
14
  if st.button("🧹 Clear All"):
15
  st.cache_data.clear()
16
 
17
+ if st.button('🌞 At Open'):
18
+ from model_day import *
19
  with st.spinner('Loading data...'):
20
  data, df_final, final_row = get_data()
21
  # st.success("✅ Historical data")
 
36
  'Perf5Day_n1',
37
  'DaysGreen',
38
  'DaysRed',
39
  'CurrentGap',
40
  'RangePct',
41
  'RangePct_n1',
 
56
  new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
57
  new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
58
  new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
59
  new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
60
  new_pred['RangePct'] = new_pred['RangePct'].astype(float)
61
  new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
 
68
  tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
69
 
70
  seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
71
 
 
72
  green_proba = seq_proba[0]
73
  red_proba = 1 - green_proba
74
  do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
 
181
  perf_daily = res1.copy()
182
  perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
183
 
184
+ tab1.subheader(f'Pred for {curr_date} as of 6:30AM PST')
 
185
  tab1.write(results)
186
  tab1.write(df_probas)
187
 
 
194
  tab4.subheader('Performance')
195
  tab4.write(df_performance)
196
  tab4.write(perf_daily)
 
197
 
198
+ if st.button('⌚ After 30 Mins'):
199
+ from model_30m import *
200
+ with st.spinner('Loading data...'):
201
+ data, df_final, final_row = get_data()
202
+ # st.success("✅ Historical data")
203
+
204
+ with st.spinner("Training models..."):
205
+ def train_models():
206
+ res1, xgbr, seq2 = walk_forward_validation_seq(df_final.dropna(), 'Target_clf', 'Target', 100, 1)
207
+ return res1, xgbr, seq2
208
+ res1, xgbr, seq2 = train_models()
209
+ # st.success("✅ Models trained")
210
+
211
+ with st.spinner("Getting new prediction..."):
212
+
213
+ # Get last row
214
+ new_pred = data.loc[final_row, ['BigNewsDay',
215
+ 'Quarter',
216
+ 'Perf5Day',
217
+ 'Perf5Day_n1',
218
+ 'DaysGreen',
219
+ 'DaysRed',
220
+ 'CurrentHigh30toClose',
221
+ 'CurrentLow30toClose',
222
+ 'CurrentClose30toClose',
223
+ 'CurrentRange30',
224
+ 'GapFill30',
225
+ 'CurrentGap',
226
+ 'RangePct',
227
+ 'RangePct_n1',
228
+ 'RangePct_n2',
229
+ 'OHLC4_VIX',
230
+ 'OHLC4_VIX_n1',
231
+ 'OHLC4_VIX_n2']]
232
+
233
+ new_pred = pd.DataFrame(new_pred).T
234
+ # new_pred_show = pd.DataFrame(index=[new_pred.columns], columns=[new_pred.index], data=[[v] for v in new_pred.values])
235
+ # last_date = datetime.datetime.strptime(data.loc[final_row], '%Y-%m-%d')
236
+ curr_date = final_row + BDay(1)
237
+ curr_date = curr_date.strftime('%Y-%m-%d')
238
+
239
+ new_pred['BigNewsDay'] = new_pred['BigNewsDay'].astype(float)
240
+ new_pred['Quarter'] = new_pred['Quarter'].astype(int)
241
+ new_pred['Perf5Day'] = new_pred['Perf5Day'].astype(bool)
242
+ new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
243
+ new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
244
+ new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
245
+ new_pred['CurrentHigh30toClose'] = new_pred['CurrentHigh30toClose'].astype(float)
246
+ new_pred['CurrentLow30toClose'] = new_pred['CurrentLow30toClose'].astype(float)
247
+ new_pred['CurrentClose30toClose'] = new_pred['CurrentClose30toClose'].astype(float)
248
+ new_pred['CurrentRange30'] = new_pred['CurrentRange30'].astype(float)
249
+ new_pred['GapFill30'] = new_pred['GapFill30'].astype(float)
250
+ new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
251
+ new_pred['RangePct'] = new_pred['RangePct'].astype(float)
252
+ new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
253
+ new_pred['RangePct_n2'] = new_pred['RangePct_n2'].astype(float)
254
+ new_pred['OHLC4_VIX'] = new_pred['OHLC4_VIX'].astype(float)
255
+ new_pred['OHLC4_VIX_n1'] = new_pred['OHLC4_VIX_n1'].astype(float)
256
+ new_pred['OHLC4_VIX_n2'] = new_pred['OHLC4_VIX_n2'].astype(float)
257
+
258
+ st.success("✅ All done!")
259
+ tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
260
+
261
+ seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
262
+
263
+ green_proba = seq_proba[0]
264
+ red_proba = 1 - green_proba
265
+ do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
266
+ stdev = 0.01
267
+ score = None
268
+ num_obs = None
269
+ cond = None
270
+ historical_proba = None
271
+ text_cond = None
272
+ operator = None
273
+
274
+ if do_not_play:
275
+ text_cond = '🟨'
276
+ operator = ''
277
+ score = seq_proba[0]
278
+ cond = (res1['Predicted'] > 0.4) & (res1['Predicted'] <= 0.6)
279
+ num_obs = len(res1.loc[cond])
280
+ historical_proba = res1.loc[cond, 'True'].mean()
281
+
282
+
283
+ elif green_proba > red_proba:
284
+ # If the day is predicted to be green, say so
285
+ text_cond = '🟩'
286
+ operator = '>='
287
+ score = green_proba
288
+ # How many with this score?
289
+ cond = (res1['Predicted'] >= green_proba)
290
+ num_obs = len(res1.loc[cond])
291
+ # How often green?
292
+ historical_proba = res1.loc[cond, 'True'].mean()
293
+ # print(cond)
294
+
295
+ elif green_proba <= red_proba:
296
+ # If the day is predicted to be red, say so
297
+ text_cond = '🟥'
298
+ operator = '<='
299
+ score = red_proba
300
+ # How many with this score?
301
+ cond = (res1['Predicted'] <= red_proba)
302
+ num_obs = len(res1.loc[cond])
303
+ # How often red? (1 minus the historical green rate)
304
+ historical_proba = 1 - res1.loc[cond, 'True'].mean()
305
+ # print(cond)
306
+
307
+ score_fmt = f'{score:.1%}'
308
+
309
+ results = pd.DataFrame(index=[
310
+ 'PrevClose',
311
+ 'Confidence Score',
312
+ 'Success Rate',
313
+ f'NumObs {operator} {"" if do_not_play else score_fmt}',
314
+ ], data = [
315
+ f"{data.loc[final_row,'Close']:.2f}",
316
+ f'{text_cond} {score:.1%}',
317
+ f'{historical_proba:.1%}',
318
+ num_obs,
319
+ ])
320
+
321
+ results.columns = ['Outputs']
322
+
323
+ # st.subheader('New Prediction')
324
+
325
+ # df_probas = res1.groupby(pd.qcut(res1['Predicted'],5)).agg({'True':[np.mean,len,np.sum]})
326
+ df_probas = res1.groupby(pd.cut(res1['Predicted'],[-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf])).agg({'True':[np.mean,len,np.sum]})
327
+ df_probas.columns = ['PctGreen','NumObs','NumGreen']
328
+
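For context, a hedged, self-contained sketch of what this pd.cut bucketing produces, using synthetic predictions rather than the model's:

```python
# Hedged sketch: bucket predicted probabilities into the same bins and check
# how often each bucket was actually green. All numbers are synthetic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({'Predicted': rng.uniform(0, 1, 500)})
demo['True'] = rng.uniform(0, 1, 500) < demo['Predicted']  # calibrated toy labels

bins = [-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf]
table = demo.groupby(pd.cut(demo['Predicted'], bins))['True'].agg(['mean', 'count', 'sum'])
table.columns = ['PctGreen', 'NumObs', 'NumGreen']
# A well-calibrated model shows PctGreen rising bucket by bucket.
```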
329
+ roc_auc_score_all = roc_auc_score(res1['True'].astype(int), res1['Predicted'].values)
330
+ precision_score_all = precision_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
331
+ recall_score_all = recall_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
332
+ len_all = len(res1)
333
+
334
+ res2_filtered = res1.loc[(res1['Predicted'] > 0.6) | (res1['Predicted'] <= 0.4)]
335
+
336
+ roc_auc_score_hi = roc_auc_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'].values)
337
+ precision_score_hi = precision_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
338
+ recall_score_hi = recall_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
339
+ len_hi = len(res2_filtered)
340
+
341
+ df_performance = pd.DataFrame(
342
+ index=[
343
+ 'N',
344
+ 'ROC AUC',
345
+ 'Precision',
346
+ 'Recall'
347
+ ],
348
+ columns = [
349
+ 'All',
350
+ 'High Confidence'
351
+ ],
352
+ data = [
353
+ [len_all, len_hi],
354
+ [roc_auc_score_all, roc_auc_score_hi],
355
+ [precision_score_all, precision_score_hi],
356
+ [recall_score_all, recall_score_hi]
357
+ ]
358
+ ).round(2)
359
+
360
+ def get_acc(t, p):
361
+ if t == False and p <= 0.4:
362
+ return '✅'
363
+ elif t == True and p > 0.6:
364
+ return '✅'
365
+ elif t == False and p > 0.6:
366
+ return '❌'
367
+ elif t == True and p <= 0.4:
368
+ return '❌'
369
+ else:
370
+ return '🟨'
371
+
372
+ perf_daily = res1.copy()
373
+ perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
374
+
375
+ tab1.subheader(f'Pred for {curr_date} as of 7AM PST')
376
+ tab1.write(results)
377
+ tab1.write(df_probas)
378
+
379
+ tab2.subheader('Latest Data for Pred')
380
+ tab2.write(new_pred)
381
+
382
+ tab3.subheader('Historical Data')
383
+ tab3.write(df_final)
384
+
385
+ tab4.subheader('Performance')
386
+ tab4.write(df_performance)
387
+ tab4.write(perf_daily.sort_index(ascending=False))
388
+
389
+ if st.button('⏳ After 60 Mins'):
390
+ from model_1h import *
391
+ with st.spinner('Loading data...'):
392
+ data, df_final, final_row = get_data()
393
+ # st.success("✅ Historical data")
394
+
395
+ with st.spinner("Training models..."):
396
+ def train_models():
397
+ res1, xgbr, seq2 = walk_forward_validation_seq(df_final.dropna(), 'Target_clf', 'Target', 100, 1)
398
+ return res1, xgbr, seq2
399
+ res1, xgbr, seq2 = train_models()
400
+ # st.success("✅ Models trained")
401
+
402
+ with st.spinner("Getting new prediction..."):
403
+
404
+ # Get last row
405
+ new_pred = data.loc[final_row, ['BigNewsDay',
406
+ 'Quarter',
407
+ 'Perf5Day',
408
+ 'Perf5Day_n1',
409
+ 'DaysGreen',
410
+ 'DaysRed',
411
+ 'CurrentHigh30toClose',
412
+ 'CurrentLow30toClose',
413
+ 'CurrentClose30toClose',
414
+ 'CurrentRange30',
415
+ 'GapFill30',
416
+ 'CurrentGap',
417
+ 'RangePct',
418
+ 'RangePct_n1',
419
+ 'RangePct_n2',
420
+ 'OHLC4_VIX',
421
+ 'OHLC4_VIX_n1',
422
+ 'OHLC4_VIX_n2']]
423
+
424
+ new_pred = pd.DataFrame(new_pred).T
425
+ # new_pred_show = pd.DataFrame(index=[new_pred.columns], columns=[new_pred.index], data=[[v] for v in new_pred.values])
426
+ # last_date = datetime.datetime.strptime(data.loc[final_row], '%Y-%m-%d')
427
+ curr_date = final_row + BDay(1)
428
+ curr_date = curr_date.strftime('%Y-%m-%d')
429
+
430
+ new_pred['BigNewsDay'] = new_pred['BigNewsDay'].astype(float)
431
+ new_pred['Quarter'] = new_pred['Quarter'].astype(int)
432
+ new_pred['Perf5Day'] = new_pred['Perf5Day'].astype(bool)
433
+ new_pred['Perf5Day_n1'] = new_pred['Perf5Day_n1'].astype(bool)
434
+ new_pred['DaysGreen'] = new_pred['DaysGreen'].astype(float)
435
+ new_pred['DaysRed'] = new_pred['DaysRed'].astype(float)
436
+ new_pred['CurrentHigh30toClose'] = new_pred['CurrentHigh30toClose'].astype(float)
437
+ new_pred['CurrentLow30toClose'] = new_pred['CurrentLow30toClose'].astype(float)
438
+ new_pred['CurrentClose30toClose'] = new_pred['CurrentClose30toClose'].astype(float)
439
+ new_pred['CurrentRange30'] = new_pred['CurrentRange30'].astype(float)
440
+ new_pred['GapFill30'] = new_pred['GapFill30'].astype(float)
441
+ new_pred['CurrentGap'] = new_pred['CurrentGap'].astype(float)
442
+ new_pred['RangePct'] = new_pred['RangePct'].astype(float)
443
+ new_pred['RangePct_n1'] = new_pred['RangePct_n1'].astype(float)
444
+ new_pred['RangePct_n2'] = new_pred['RangePct_n2'].astype(float)
445
+ new_pred['OHLC4_VIX'] = new_pred['OHLC4_VIX'].astype(float)
446
+ new_pred['OHLC4_VIX_n1'] = new_pred['OHLC4_VIX_n1'].astype(float)
447
+ new_pred['OHLC4_VIX_n2'] = new_pred['OHLC4_VIX_n2'].astype(float)
448
+
449
+ st.success("✅ All done!")
450
+ tab1, tab2, tab3, tab4 = st.tabs(["🔮 Prediction", "✨ New Data", "🗄 Historical", "📊 Performance"])
451
+
452
+ seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
453
+
454
+ green_proba = seq_proba[0]
455
+ red_proba = 1 - green_proba
456
+ do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
457
+ stdev = 0.01
458
+ score = None
459
+ num_obs = None
460
+ cond = None
461
+ historical_proba = None
462
+ text_cond = None
463
+ operator = None
464
+
465
+ if do_not_play:
466
+ text_cond = '🟨'
467
+ operator = ''
468
+ score = seq_proba[0]
469
+ cond = (res1['Predicted'] > 0.4) & (res1['Predicted'] <= 0.6)
470
+ num_obs = len(res1.loc[cond])
471
+ historical_proba = res1.loc[cond, 'True'].mean()
472
 
473
+
474
+ elif green_proba > red_proba:
475
+ # If the day is predicted to be green, say so
476
+ text_cond = '🟩'
477
+ operator = '>='
478
+ score = green_proba
479
+ # How many with this score?
480
+ cond = (res1['Predicted'] >= green_proba)
481
+ num_obs = len(res1.loc[cond])
482
+ # How often green?
483
+ historical_proba = res1.loc[cond, 'True'].mean()
484
+ # print(cond)
485
 
486
+ elif green_proba <= red_proba:
487
+ # If the day is predicted to be red, say so
488
+ text_cond = '🟥'
489
+ operator = '<='
490
+ score = red_proba
491
+ # How many with this score?
492
+ cond = (res1['Predicted'] <= red_proba)
493
+ num_obs = len(res1.loc[cond])
494
+ # How often red? (1 minus the historical green rate)
495
+ historical_proba = 1 - res1.loc[cond, 'True'].mean()
496
+ # print(cond)
497
+
498
+ score_fmt = f'{score:.1%}'
499
+
500
+ results = pd.DataFrame(index=[
501
+ 'PrevClose',
502
+ 'Confidence Score',
503
+ 'Success Rate',
504
+ f'NumObs {operator} {"" if do_not_play else score_fmt}',
505
+ ], data = [
506
+ f"{data.loc[final_row,'Close']:.2f}",
507
+ f'{text_cond} {score:.1%}',
508
+ f'{historical_proba:.1%}',
509
+ num_obs,
510
+ ])
511
+
512
+ results.columns = ['Outputs']
513
+
514
+ # st.subheader('New Prediction')
515
+
516
+ # df_probas = res1.groupby(pd.qcut(res1['Predicted'],5)).agg({'True':[np.mean,len,np.sum]})
517
+ df_probas = res1.groupby(pd.cut(res1['Predicted'],[-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf])).agg({'True':[np.mean,len,np.sum]})
518
+ df_probas.columns = ['PctGreen','NumObs','NumGreen']
519
+
520
+ roc_auc_score_all = roc_auc_score(res1['True'].astype(int), res1['Predicted'].values)
521
+ precision_score_all = precision_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
522
+ recall_score_all = recall_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
523
+ len_all = len(res1)
524
+
525
+ res2_filtered = res1.loc[(res1['Predicted'] > 0.6) | (res1['Predicted'] <= 0.4)]
526
+
527
+ roc_auc_score_hi = roc_auc_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'].values)
528
+ precision_score_hi = precision_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
529
+ recall_score_hi = recall_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
530
+ len_hi = len(res2_filtered)
531
+
532
+ df_performance = pd.DataFrame(
533
+ index=[
534
+ 'N',
535
+ 'ROC AUC',
536
+ 'Precision',
537
+ 'Recall'
538
+ ],
539
+ columns = [
540
+ 'All',
541
+ 'High Confidence'
542
+ ],
543
+ data = [
544
+ [len_all, len_hi],
545
+ [roc_auc_score_all, roc_auc_score_hi],
546
+ [precision_score_all, precision_score_hi],
547
+ [recall_score_all, recall_score_hi]
548
+ ]
549
+ ).round(2)
550
+
551
+ def get_acc(t, p):
552
+ if t == False and p <= 0.4:
553
+ return '✅'
554
+ elif t == True and p > 0.6:
555
+ return '✅'
556
+ elif t == False and p > 0.6:
557
+ return '❌'
558
+ elif t == True and p <= 0.4:
559
+ return '❌'
560
+ else:
561
+ return '🟨'
562
+
563
+ perf_daily = res1.copy()
564
+ perf_daily['Accuracy'] = [get_acc(t, p) for t, p in zip(perf_daily['True'], perf_daily['Predicted'])]
565
+
566
+ tab1.subheader(f'Pred for {curr_date} as of 7:30AM PST')
567
+ tab1.write(results)
568
+ tab1.write(df_probas)
569
+
570
+ tab2.subheader('Latest Data for Pred')
571
+ tab2.write(new_pred)
572
+
573
+ tab3.subheader('Historical Data')
574
+ tab3.write(df_final)
575
+
576
+ tab4.subheader('Performance')
577
+ tab4.write(df_performance)
578
+ tab4.write(perf_daily)
model_1h.py ADDED
@@ -0,0 +1,399 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import pandas_datareader as pdr
4
+ import numpy as np
5
+ import yfinance as yf
6
+ import json
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from typing import List
10
+ import xgboost as xgb
11
+ from tqdm import tqdm
12
+ from sklearn import linear_model
13
+ import joblib
14
+ import os
15
+ from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
+ import datetime
17
+ from pandas.tseries.offsets import BDay
18
+ from datasets import load_dataset
19
+
20
+ def walk_forward_validation(df, target_column, num_training_rows, num_periods):
21
+
22
+ # Create the regression model (the XGBRegressor alternative is commented out below)
23
+ # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
24
+ model = linear_model.LinearRegression()
25
+
26
+ overall_results = []
27
+ # Iterate over the rows in the DataFrame, one step at a time
28
+ for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
29
+ # Split the data into training and test sets
30
+ X_train = df.drop(target_column, axis=1).iloc[:i]
31
+ y_train = df[target_column].iloc[:i]
32
+ X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
33
+ y_test = df[target_column].iloc[i:i+num_periods]
34
+
35
+ # Fit the model to the training data
36
+ model.fit(X_train, y_train)
37
+
38
+ # Make a prediction on the test data
39
+ predictions = model.predict(X_test)
40
+
41
+ # Create a DataFrame to store the true and predicted values
42
+ result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
43
+
44
+ overall_results.append(result_df)
45
+
46
+ df_results = pd.concat(overall_results)
47
+ # model.save_model('model_lr.bin')
48
+ # Return the true and predicted values, and fitted model
49
+ return df_results, model
50
+
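For reference, a minimal sketch of the expanding-window loop that walk_forward_validation implements, run on synthetic data (the toy columns 'x1'/'x2'/'Target' and the window size are illustrative, not the app's real inputs):

```python
# Hedged sketch: expanding-window walk-forward validation on synthetic data.
import numpy as np
import pandas as pd
from sklearn import linear_model

rng = np.random.default_rng(42)
toy = pd.DataFrame({'x1': rng.normal(size=300), 'x2': rng.normal(size=300)})
toy['Target'] = 0.5 * toy['x1'] - 0.2 * toy['x2'] + rng.normal(scale=0.1, size=300)

model = linear_model.LinearRegression()
results = []
for i in range(100, len(toy)):
    # Train on everything before row i, then predict row i only.
    model.fit(toy[['x1', 'x2']].iloc[:i], toy['Target'].iloc[:i])
    pred = model.predict(toy[['x1', 'x2']].iloc[i:i + 1])
    results.append(pd.DataFrame({'True': toy['Target'].iloc[i:i + 1], 'Predicted': pred}))
df_results = pd.concat(results)  # one out-of-sample prediction per row
```

Each test row is predicted by a model that never saw it, which is what makes the 'Predicted' column usable for the historical success-rate lookups later in the app.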
51
+ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
52
+
53
+ # Run the regression model to get its output
54
+ res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
55
+ # joblib.dump(model1, 'model1.bin')
56
+
57
+ # Merge the result df back on the df for feeding into the classifier
58
+ for_merge = res[['Predicted']]
59
+ for_merge.columns = ['RegrModelOut']
60
+ for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
61
+ df = df.merge(for_merge, left_index=True, right_index=True)
62
+ df = df.drop(columns=[target_column_regr])
63
+ df = df[[
64
+ 'CurrentGap','RegrModelOut',
65
+ 'CurrentHigh30toClose',
66
+ 'CurrentLow30toClose',
67
+ 'CurrentClose30toClose',
68
+ 'CurrentRange30',
69
+ 'GapFill30',target_column_clf
70
+ ]]
71
+
72
+ df[target_column_clf] = df[target_column_clf].astype(bool)
73
+ df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
74
+
75
+ # Create an XGBClassifier model
76
+ model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
77
+ # model = linear_model.LogisticRegression(max_iter=1500)
78
+
79
+ overall_results = []
80
+ # Iterate over the rows in the DataFrame, one step at a time
81
+ for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
82
+ # Split the data into training and test sets
83
+ X_train = df.drop(target_column_clf, axis=1).iloc[:i]
84
+ y_train = df[target_column_clf].iloc[:i]
85
+ X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
86
+ y_test = df[target_column_clf].iloc[i:i+num_periods]
87
+
88
+ # Fit the model to the training data
89
+ model2.fit(X_train, y_train)
90
+
91
+ # Make a prediction on the test data
92
+ predictions = model2.predict_proba(X_test)[:,-1]
93
+
94
+ # Create a DataFrame to store the true and predicted values
95
+ result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
96
+
97
+ overall_results.append(result_df)
98
+
99
+ df_results = pd.concat(overall_results)
100
+ # model1.save_model('model_ensemble.bin')
101
+ # joblib.dump(model2, 'model2.bin')
102
+ # Return the true and predicted values, and fitted model
103
+ return df_results, model1, model2
104
+
105
+ def seq_predict_proba(df, trained_reg_model, trained_clf_model):
106
+ regr_pred = trained_reg_model.predict(df)
107
+ regr_pred = regr_pred > 0
108
+ new_df = df.copy()
109
+ new_df['RegrModelOut'] = regr_pred
110
+ clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut',
111
+ 'CurrentHigh30toClose',
112
+ 'CurrentLow30toClose',
113
+ 'CurrentClose30toClose',
114
+ 'CurrentRange30',
115
+ 'GapFill30']])[:,-1]
116
+ return clf_pred_proba
117
+
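To make the hand-off between the two models explicit, here is a hedged sketch of the chain seq_predict_proba implements; fitted_reg, fitted_clf, and new_row are placeholder names for objects the app builds elsewhere:

```python
# Hedged sketch: the regressor's sign becomes the boolean 'RegrModelOut'
# feature the classifier was trained on. Names here are assumed placeholders.
def two_stage_proba(new_row, fitted_reg, fitted_clf):
    regr_up = fitted_reg.predict(new_row) > 0   # stage 1: direction of the regression output
    clf_in = new_row.copy()
    clf_in['RegrModelOut'] = regr_up            # stage 2: feed direction to the classifier
    cols = ['CurrentGap', 'RegrModelOut', 'CurrentHigh30toClose',
            'CurrentLow30toClose', 'CurrentClose30toClose',
            'CurrentRange30', 'GapFill30']      # same column order as training
    return fitted_clf.predict_proba(clf_in[cols])[:, -1]  # P(green close)
```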
118
+ def get_data():
119
+ # f = open('settings.json')
120
+ # j = json.load(f)
121
+ # API_KEY_FRED = j["API_KEY_FRED"]
122
+
123
+ API_KEY_FRED = os.getenv('API_KEY_FRED')
124
+
125
+ def parse_release_dates(release_id: str) -> List[str]:
126
+ release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
127
+ r = requests.get(release_dates_url)
128
+ text = r.text
129
+ soup = BeautifulSoup(text, 'xml')
130
+ dates = []
131
+ for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
132
+ dates.append(release_date_tag.text)
133
+ return dates
134
+
135
+ def parse_release_dates_obs(series_id: str) -> List[str]:
136
+ obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
137
+ r = requests.get(obs_url)
138
+ text = r.text
139
+ soup = BeautifulSoup(text, 'xml')
140
+ observations = []
141
+ for observation_tag in soup.find_all('observation'):
142
+ date = observation_tag.get('date')
143
+ value = observation_tag.get('value')
144
+ observations.append((date, value))
145
+ return observations
146
+
147
+ econ_dfs = {}
148
+
149
+ econ_tickers = [
150
+ 'WALCL',
151
+ 'NFCI',
152
+ 'WRESBAL'
153
+ ]
154
+
155
+ for et in tqdm(econ_tickers, desc='getting econ tickers'):
156
+ # p = parse_release_dates_obs(et)
157
+ # df = pd.DataFrame(columns = ['ds',et], data = p)
158
+ df = pdr.get_data_fred(et)
159
+ df.index = df.index.rename('ds')
160
+ # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
161
+ # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
162
+ econ_dfs[et] = df
163
+
164
+ # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
165
+ # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
166
+
167
+ # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
168
+ # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
169
+
170
+ release_ids = [
171
+ "10", # "Consumer Price Index"
172
+ "46", # "Producer Price Index"
173
+ "50", # "Employment Situation"
174
+ "53", # "Gross Domestic Product"
175
+ "103", # "Discount Rate Meeting Minutes"
176
+ "180", # "Unemployment Insurance Weekly Claims Report"
177
+ "194", # "ADP National Employment Report"
178
+ "323" # "Trimmed Mean PCE Inflation Rate"
179
+ ]
180
+
181
+ release_names = [
182
+ "CPI",
183
+ "PPI",
184
+ "NFP",
185
+ "GDP",
186
+ "FOMC",
187
+ "UNEMP",
188
+ "ADP",
189
+ "PCE"
190
+ ]
191
+
192
+ releases = {}
193
+
194
+ for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
195
+ releases[rid] = {}
196
+ releases[rid]['dates'] = parse_release_dates(rid)
197
+ releases[rid]['name'] = n
198
+
199
+ # Create a DF indexed by release dates, with the release name as a column of 1s
200
+ # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
201
+ # This column serves as the true/false indicator of whether there was economic data released that day.
202
+ for rid in tqdm(release_ids, desc='Making indicators'):
203
+ releases[rid]['df'] = pd.DataFrame(
204
+ index=releases[rid]['dates'],
205
+ data={
206
+ releases[rid]['name']: 1
207
+ })
208
+ releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
209
+ # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
210
+ # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
211
+
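As a hedged illustration of what the indicator frames built above do once merged (the dates are invented):

```python
# Hedged sketch: a release-date indicator after the left merge. Dates are made up.
import pandas as pd

prices = pd.DataFrame(index=pd.DatetimeIndex(['2023-01-03', '2023-01-04', '2023-01-05']),
                      data={'Close': [100.0, 101.0, 99.5]})
cpi = pd.DataFrame(index=pd.DatetimeIndex(['2023-01-04']), data={'CPI': 1})

merged = prices.merge(cpi, how='left', left_index=True, right_index=True)
merged['CPI_shift'] = merged['CPI'].shift(-1)  # flags the session *before* the release
merged[['CPI', 'CPI_shift']] = merged[['CPI', 'CPI_shift']].fillna(0)
# 2023-01-04 ends up with CPI == 1, 2023-01-03 with CPI_shift == 1, all else 0.
```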
212
+ vix = yf.Ticker('^VIX')
213
+ spx = yf.Ticker('^GSPC')
214
+
215
+
216
+ # Pull in data
217
+ data = load_dataset("boomsss/SPX_full_30min", split='train')
218
+
219
+ rows = [d['text'] for d in data]
220
+ rows = [x.split(',') for x in rows]
221
+
222
+ fr = pd.DataFrame(columns=[
223
+ 'Datetime','Open','High','Low','Close'
224
+ ], data = rows)
225
+
226
+ fr['Datetime'] = pd.to_datetime(fr['Datetime'])
227
+ fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
228
+ fr = fr.set_index('Datetime')
229
+ fr['Open'] = pd.to_numeric(fr['Open'])
230
+ fr['High'] = pd.to_numeric(fr['High'])
231
+ fr['Low'] = pd.to_numeric(fr['Low'])
232
+ fr['Close'] = pd.to_numeric(fr['Close'])
233
+
234
+ # Get incremental date
235
+ last_date = fr.index.date[-1]
236
+ last_date = last_date + datetime.timedelta(days=1)
237
+ # Get incremental data
238
+ spx1 = yf.Ticker('^GSPC')
239
+ yfp = spx1.history(start=last_date, interval='60m')
240
+ # Concat current and incremental
241
+ df_30m = pd.concat([fr, yfp])
242
+ # Keep the first two 30-minute bars of each day (the first trading hour)
243
+ df_30m = df_30m.reset_index()
244
+ df_30m['Datetime'] = df_30m['Datetime'].dt.date
245
+ df_30m = df_30m.groupby('Datetime').head(2)
246
+ df_30m = df_30m.set_index('Datetime',drop=True)
247
+ # Keep only the OHLC columns
248
+ df_30m = df_30m[['Open','High','Low','Close']]
249
+
250
+ opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
251
+ closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
252
+ highs_1h = df_30m.groupby('Datetime')['High'].max()
253
+ lows_1h = df_30m.groupby('Datetime')['Low'].min()
254
+
255
+ df_1h = pd.DataFrame(index=df_30m.index.unique())
256
+ df_1h['Open'] = opens_1h
257
+ df_1h['Close'] = closes_1h
258
+ df_1h['High'] = highs_1h
259
+ df_1h['Low'] = lows_1h
260
+
261
+ df_1h.columns = ['Open30','Close30','High30','Low30'] # labels must match the Open/Close/High/Low assignment order above
262
+
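A hedged sketch of the aggregation above, collapsing the two 30-minute bars of each day into one first-hour bar (prices invented). Naming the columns at construction time avoids depending on positional renames:

```python
# Hedged sketch: two 30-minute bars per date -> one first-hour bar. Made-up prices.
import pandas as pd

bars = pd.DataFrame({
    'Datetime': ['2023-06-01', '2023-06-01', '2023-06-02', '2023-06-02'],
    'Open':  [4200.0, 4210.0, 4215.0, 4220.0],
    'High':  [4212.0, 4218.0, 4225.0, 4230.0],
    'Low':   [4195.0, 4205.0, 4210.0, 4216.0],
    'Close': [4210.0, 4215.0, 4221.0, 4228.0],
}).set_index('Datetime')

first_hour = pd.DataFrame({
    'Open30':  bars.groupby('Datetime')['Open'].head(1),   # first bar's open
    'High30':  bars.groupby('Datetime')['High'].max(),     # highest high of both bars
    'Low30':   bars.groupby('Datetime')['Low'].min(),      # lowest low of both bars
    'Close30': bars.groupby('Datetime')['Close'].tail(1),  # second bar's close
})
```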
263
+ prices_vix = vix.history(start='2018-07-01', interval='1d')
264
+ prices_spx = spx.history(start='2018-07-01', interval='1d')
265
+ prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
266
+ prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
267
+ prices_spx.index = prices_spx['index']
268
+ prices_spx = prices_spx.drop(columns='index')
269
+ prices_spx.index = pd.DatetimeIndex(prices_spx.index)
270
+
271
+
272
+ prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
273
+ prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
274
+ prices_vix.index = prices_vix['index']
275
+ prices_vix = prices_vix.drop(columns='index')
276
+ prices_vix.index = pd.DatetimeIndex(prices_vix.index)
277
+
278
+
279
+ data = prices_spx.merge(df_1h, left_index=True, right_index=True)
280
+ data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
281
+
282
+ # Features
283
+ data['PrevClose'] = data['Close'].shift(1)
284
+ data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
285
+ data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
286
+ data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
287
+ data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
288
+ data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
289
+
290
+ data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
291
+ data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
292
+
293
+ data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
294
+ data['RangePct'] = data['Range'] / data['Close']
295
+ data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
296
+ data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
297
+ data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
298
+ data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
299
+ data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
300
+ data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
301
+ data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2) # shift(2), not shift(1), so _n2 lags two periods
302
+ data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
303
+ data['RangePct_n1'] = data['RangePct'].shift(1)
304
+ data['RangePct_n2'] = data['RangePct'].shift(2)
305
+ data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
306
+ data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
307
+ data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
308
+ data['CurrentGap'] = data['CurrentGap'].shift(-1)
309
+ data['DayOfWeek'] = pd.to_datetime(data.index)
310
+ data['DayOfWeek'] = data['DayOfWeek'].dt.day
311
+
312
+ # Intraday features
313
+ data['CurrentHigh30'] = data['High30'].shift(-1)
314
+ data['CurrentLow30'] = data['Low30'].shift(-1)
315
+ data['CurrentClose30'] = data['Close30'].shift(-1)
316
+
317
+ # First-hour high/low/close relative to the prior daily close
318
+ data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
319
+ data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
320
+ data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
321
+ data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
322
+ data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
323
+
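A quick worked case of the GapFill30 rule above, with invented numbers: a gap up counts as filled only if the first hour's low trades back to the prior close, and a gap down only if the high does.

```python
# Hedged sketch of the gap-fill test with made-up numbers.
prev_close, gap = 4200.0, 0.004   # opened ~0.4% above the prior close
high, low = 4230.0, 4198.5        # first-hour extremes
filled = low <= prev_close if gap > 0 else high >= prev_close
# filled is True here: the 4198.5 low traded back through 4200.0
```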
324
+ # Target -- the next day's OHLC4 return relative to the previous close
325
+ data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
326
+ data['Target'] = data['Target'].shift(-1)
327
+ # data['Target'] = data['RangePct'].shift(-1)
328
+
329
+ # Target for clf -- whether tomorrow will close above or below today's close
330
+ data['Target_clf'] = data['Close'] > data['PrevClose']
331
+ data['Target_clf'] = data['Target_clf'].shift(-1)
332
+ data['DayOfWeek'] = pd.to_datetime(data.index)
333
+ data['Quarter'] = data['DayOfWeek'].dt.quarter
334
+ data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
335
+
336
+ for rid in tqdm(release_ids, desc='Merging econ data'):
337
+ # Get the name of the release
338
+ n = releases[rid]['name']
339
+ # Merge the corresponding DF of the release
340
+ data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
341
+ # Create a column that shifts the value in the merged column up by 1
342
+ data[f'{n}_shift'] = data[n].shift(-1)
343
+ # Fill the rest with zeroes
344
+ data[n] = data[n].fillna(0)
345
+ data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
346
+
347
+ data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
348
+
349
+ def cumul_sum(col):
350
+ nums = []
351
+ s = 0
352
+ for x in col:
353
+ if x == 1:
354
+ s += 1
355
+ elif x == 0:
356
+ s = 0
357
+ nums.append(s)
358
+ return nums
359
+
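For instance, the counter above maps a run of daily green flags like this:

```python
# Hedged sketch: the streak counter resets on 0 and increments on 1.
print(cumul_sum([1, 1, 0, 1, 0, 0, 1, 1, 1]))
# -> [1, 2, 0, 1, 0, 0, 1, 2, 3]
```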
360
+ consec_green = cumul_sum(data['GreenDay'].values)
361
+ consec_red = cumul_sum(data['RedDay'].values)
362
+
363
+ data['DaysGreen'] = consec_green
364
+ data['DaysRed'] = consec_red
365
+
366
+ final_row = data.index[-2]
367
+
368
+ exp_row = data.index[-1]
369
+
370
+ df_final = data.loc[:final_row,
371
+ [
372
+ 'BigNewsDay',
373
+ 'Quarter',
374
+ 'Perf5Day',
375
+ 'Perf5Day_n1',
376
+ 'DaysGreen',
377
+ 'DaysRed',
378
+ 'CurrentHigh30toClose',
379
+ 'CurrentLow30toClose',
380
+ 'CurrentClose30toClose',
381
+ 'CurrentRange30',
382
+ 'GapFill30',
383
+ # 'OHLC4_Trend',
384
+ # 'OHLC4_Trend_n1',
385
+ # 'OHLC4_Trend_n2',
386
+ # 'VIX5Day',
387
+ # 'VIX5Day_n1',
388
+ 'CurrentGap',
389
+ 'RangePct',
390
+ 'RangePct_n1',
391
+ 'RangePct_n2',
392
+ 'OHLC4_VIX',
393
+ 'OHLC4_VIX_n1',
394
+ 'OHLC4_VIX_n2',
395
+ 'Target',
396
+ 'Target_clf'
397
+ ]]
398
+ df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
399
+ return data, df_final, final_row
model_30m.py ADDED
@@ -0,0 +1,387 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import pandas_datareader as pdr
4
+ import numpy as np
5
+ import yfinance as yf
6
+ import json
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from typing import List
10
+ import xgboost as xgb
11
+ from tqdm import tqdm
12
+ from sklearn import linear_model
13
+ import joblib
14
+ import os
15
+ from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
+ import datetime
17
+ from pandas.tseries.offsets import BDay
18
+ from datasets import load_dataset
19
+
20
+ # If the dataset is gated/private, make sure you have run huggingface-cli login
21
+ def walk_forward_validation(df, target_column, num_training_rows, num_periods):
22
+
23
+ # Create the regression model (the XGBRegressor alternative is commented out below)
24
+ # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
25
+ model = linear_model.LinearRegression()
26
+
27
+ overall_results = []
28
+ # Iterate over the rows in the DataFrame, one step at a time
29
+ for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
30
+ # Split the data into training and test sets
31
+ X_train = df.drop(target_column, axis=1).iloc[:i]
32
+ y_train = df[target_column].iloc[:i]
33
+ X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
34
+ y_test = df[target_column].iloc[i:i+num_periods]
35
+
36
+ # Fit the model to the training data
37
+ model.fit(X_train, y_train)
38
+
39
+ # Make a prediction on the test data
40
+ predictions = model.predict(X_test)
41
+
42
+ # Create a DataFrame to store the true and predicted values
43
+ result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
44
+
45
+ overall_results.append(result_df)
46
+
47
+ df_results = pd.concat(overall_results)
48
+ # model.save_model('model_lr.bin')
49
+ # Return the true and predicted values, and fitted model
50
+ return df_results, model
51
+
52
+ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
53
+
54
+ # Run the regression model to get its output
55
+ res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
56
+ # joblib.dump(model1, 'model1.bin')
57
+
58
+ # Merge the result df back on the df for feeding into the classifier
59
+ for_merge = res[['Predicted']]
60
+ for_merge.columns = ['RegrModelOut']
61
+ for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
62
+ df = df.merge(for_merge, left_index=True, right_index=True)
63
+ df = df.drop(columns=[target_column_regr])
64
+ df = df[[
65
+ 'CurrentGap','RegrModelOut','CurrentHigh30toClose',
66
+ 'CurrentLow30toClose',
67
+ 'CurrentClose30toClose',
68
+ 'CurrentRange30',
69
+ 'GapFill30', target_column_clf
70
+ ]]
71
+
72
+ df[target_column_clf] = df[target_column_clf].astype(bool)
73
+ df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
74
+
75
+ # Create an XGBClassifier model
76
+ model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
77
+ # model = linear_model.LogisticRegression(max_iter=1500)
78
+
79
+ overall_results = []
80
+ # Iterate over the rows in the DataFrame, one step at a time
81
+ for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
82
+ # Split the data into training and test sets
83
+ X_train = df.drop(target_column_clf, axis=1).iloc[:i]
84
+ y_train = df[target_column_clf].iloc[:i]
85
+ X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
86
+ y_test = df[target_column_clf].iloc[i:i+num_periods]
87
+
88
+ # Fit the model to the training data
89
+ model2.fit(X_train, y_train)
90
+
91
+ # Make a prediction on the test data
92
+ predictions = model2.predict_proba(X_test)[:,-1]
93
+
94
+ # Create a DataFrame to store the true and predicted values
95
+ result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
96
+
97
+ overall_results.append(result_df)
98
+
99
+ df_results = pd.concat(overall_results)
100
+ return df_results, model1, model2
101
+
102
+
103
+ def seq_predict_proba(df, trained_reg_model, trained_clf_model):
104
+ regr_pred = trained_reg_model.predict(df)
105
+ regr_pred = regr_pred > 0
106
+ new_df = df.copy()
107
+ new_df['RegrModelOut'] = regr_pred
108
+ clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut','CurrentHigh30toClose',
109
+ 'CurrentLow30toClose',
110
+ 'CurrentClose30toClose',
111
+ 'CurrentRange30',
112
+ 'GapFill30']])[:,-1]
113
+ return clf_pred_proba

def get_data():
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]

    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    def parse_release_dates_obs(series_id: str) -> List[tuple]:
        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(obs_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        observations = []
        for observation_tag in soup.find_all('observation'):
            date = observation_tag.get('date')
            value = observation_tag.get('value')
            observations.append((date, value))
        return observations

    econ_dfs = {}

    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]

    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        # p = parse_release_dates_obs(et)
        # df = pd.DataFrame(columns = ['ds',et], data = p)
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
        econ_dfs[et] = df

    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)

    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)

    release_ids = [
        "10",   # "Consumer Price Index"
        "46",   # "Producer Price Index"
        "50",   # "Employment Situation"
        "53",   # "Gross Domestic Product"
        "103",  # "Discount Rate Meeting Minutes"
        "180",  # "Unemployment Insurance Weekly Claims Report"
        "194",  # "ADP National Employment Report"
        "323"   # "Trimmed Mean PCE Inflation Rate"
    ]

    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}

    for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

    # Create a DF that has all release dates, with the release name as a column of 1s.
    # Once merged on the main dataframe, days with econ events will be 1 or NaN; fill NA with 0.
    # This column serves as the true/false indicator of whether economic data was released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')

    vix = yf.Ticker('^VIX')
    spx = yf.Ticker('^GSPC')

    # Pull in the full 30-minute SPX history from the HF dataset
    data = load_dataset("boomsss/SPX_full_30min", split='train')

    rows = [d['text'] for d in data]
    rows = [x.split(',') for x in rows]

    fr = pd.DataFrame(columns=[
        'Datetime','Open','High','Low','Close'
    ], data=rows)

    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
    fr = fr.set_index('Datetime')
    fr['Open'] = pd.to_numeric(fr['Open'])
    fr['High'] = pd.to_numeric(fr['High'])
    fr['Low'] = pd.to_numeric(fr['Low'])
    fr['Close'] = pd.to_numeric(fr['Close'])

    # The index is set; ready to concat with fresh data

    # Get the incremental start date
    last_date = fr.index.date[-1]
    last_date = last_date + datetime.timedelta(days=1)
    # Get incremental data
    spx1 = yf.Ticker('^GSPC')
    yfp = spx1.history(start=last_date, interval='30m')
    # Concat current and incremental
    df_30m = pd.concat([fr, yfp])
    # Keep only the first 30-minute bar of each session
    df_30m = df_30m.reset_index()
    df_30m['Datetime'] = df_30m['Datetime'].dt.date
    df_30m = df_30m.groupby('Datetime').head(1)
    df_30m = df_30m.set_index('Datetime', drop=True)
    # Rename the columns
    df_30m = df_30m[['Open','High','Low','Close']]
    df_30m.columns = ['Open30','High30','Low30','Close30']
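
    # Tiny illustration (made-up times) of the first-bar extraction above: a
    # session with 30m bars at 09:30, 10:00, ... keeps only the 09:30 bar, so
    # Open30/High30/Low30/Close30 describe the first half hour of that day.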

    prices_vix = vix.history(start='2018-07-01', interval='1d')
    prices_spx = spx.history(start='2018-07-01', interval='1d')
    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
    prices_spx.index = prices_spx['index']
    prices_spx = prices_spx.drop(columns='index')
    prices_spx.index = pd.DatetimeIndex(prices_spx.index)

    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
    prices_vix.index = prices_vix['index']
    prices_vix = prices_vix.drop(columns='index')
    prices_vix.index = pd.DatetimeIndex(prices_vix.index)

    data = prices_spx.merge(df_30m, left_index=True, right_index=True)
    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])

    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)  # lag by one day; the shift was missing originally

    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)  # was shift(1), which duplicated _n1
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGap'] = data['CurrentGap'].shift(-1)  # align tomorrow's opening gap with today's row
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.day  # overwritten with the weekday below

    # Intraday features: tomorrow's first 30-minute bar, aligned to today's row
    data['CurrentHigh30'] = data['High30'].shift(-1)
    data['CurrentLow30'] = data['Low30'].shift(-1)
    data['CurrentClose30'] = data['Close30'].shift(-1)

    # First 30 minutes relative to the prior session's close
    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1

    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
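    # Worked example (illustrative numbers): with the prior session's close at
    # 4000 and a gap up to 4010 (CurrentGap > 0), GapFill30 is True only if the
    # first 30-minute low trades back to 4000 or below; on a gap down, it is
    # True only if the first 30-minute high trades back up to 4000 or above.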

    # Regression target -- tomorrow's OHLC4 average relative to today's close
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Classification target -- whether tomorrow will close above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding DF of the release
        data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
        # Create a column that shifts the value in the merged column up by 1
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
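
    # Illustration of the '_shift' alignment (made-up dates): if CPI is
    # released on 2023-06-13, the merged 'CPI' column is 1 on 06-13, while
    # 'CPI_shift' is 1 on 06-12 -- so the prior session already "knows" that
    # big news lands tomorrow, which is what BigNewsDay summarizes across
    # all releases.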

    def cumul_sum(col):
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums
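    # e.g. cumul_sum([1, 1, 0, 1, 1, 1]) -> [1, 2, 0, 1, 2, 3]:
    # a running count of consecutive 1s that resets on each 0.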

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

    final_row = data.index[-2]

    exp_row = data.index[-1]  # the latest row, excluded from df_final below

    df_final = data.loc[:final_row, [
        'BigNewsDay',
        'Quarter',
        'Perf5Day',
        'Perf5Day_n1',
        'DaysGreen',
        'DaysRed',
        'CurrentHigh30toClose',
        'CurrentLow30toClose',
        'CurrentClose30toClose',
        'CurrentRange30',
        'GapFill30',
        # 'OHLC4_Trend',
        # 'OHLC4_Trend_n1',
        # 'OHLC4_Trend_n2',
        # 'VIX5Day',
        # 'VIX5Day_n1',
        'CurrentGap',
        'RangePct',
        'RangePct_n1',
        'RangePct_n2',
        'OHLC4_VIX',
        'OHLC4_VIX_n1',
        'OHLC4_VIX_n2',
        'Target',
        'Target_clf'
    ]]
    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
    return data, df_final, final_row
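
# Evaluation sketch (assumptions: `res` comes from walk_forward_validation_seq
# above and 0.5 is an arbitrary probability threshold; the metrics are already
# imported at the top of this file):
#
#   y_true = res['True'].astype(bool)
#   y_hat = res['Predicted'] > 0.5
#   print(roc_auc_score(y_true, res['Predicted']),
#         precision_score(y_true, y_hat),
#         recall_score(y_true, y_hat))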
model_day.py ADDED
@@ -0,0 +1,323 @@
import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import json
import requests
from bs4 import BeautifulSoup
from typing import List
import xgboost as xgb
from tqdm import tqdm
from sklearn import linear_model
import joblib
import os
from sklearn.metrics import roc_auc_score, precision_score, recall_score
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset

# If the dataset is gated/private, make sure you have run huggingface-cli login
# (note: this 30-minute dataset is not used elsewhere in the daily model)
dataset = load_dataset("boomsss/SPX_full_30min", split="train")

def walk_forward_validation(df, target_column, num_training_rows, num_periods):

    # Create the regression model (linear; the XGB variant is kept for reference)
    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
    model = linear_model.LinearRegression()

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
        # Split the data into training and test sets
        X_train = df.drop(target_column, axis=1).iloc[:i]
        y_train = df[target_column].iloc[:i]
        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
        y_test = df[target_column].iloc[i:i+num_periods]

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model.predict(X_test)

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model.save_model('model_lr.bin')
    # Return the true and predicted values, and fitted model
    return df_results, model

def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):

    # First, run the regression model to get its predictions
    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
    # joblib.dump(model1, 'model1.bin')

    # Merge the result df back on the df for feeding into the classifier
    for_merge = res[['Predicted']]
    for_merge.columns = ['RegrModelOut']
    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
    df = df.merge(for_merge, left_index=True, right_index=True)
    df = df.drop(columns=[target_column_regr])
    df = df[[
        'CurrentGap','RegrModelOut',target_column_clf
    ]]

    df[target_column_clf] = df[target_column_clf].astype(bool)
    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)

    # Create an XGBClassifier model
    model2 = xgb.XGBClassifier(n_estimators=10, random_state=42)
    # model = linear_model.LogisticRegression(max_iter=1500)

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
        # Split the data into training and test sets
        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
        y_train = df[target_column_clf].iloc[:i]
        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
        y_test = df[target_column_clf].iloc[i:i+num_periods]

        # Fit the model to the training data
        model2.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model2.predict_proba(X_test)[:,-1]

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model1.save_model('model_ensemble.bin')
    # joblib.dump(model2, 'model2.bin')
    # Return the true and predicted values, and fitted models
    return df_results, model1, model2

def seq_predict_proba(df, trained_reg_model, trained_clf_model):
    regr_pred = trained_reg_model.predict(df)
    regr_pred = regr_pred > 0
    new_df = df.copy()
    new_df['RegrModelOut'] = regr_pred
    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut']])[:,-1]
    return clf_pred_proba
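
# Note the difference from the 30-minute variant: here the classifier sees only
# ['CurrentGap','RegrModelOut'] -- tomorrow's opening gap plus the sign of the
# regression forecast. A minimal scoring sketch (the row selection is an
# assumption about how the app calls this helper):
#
#   latest = data.loc[[data.index[-1]], model1.feature_names_in_]
#   prob_up = seq_predict_proba(latest, model1, model2)[0]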

def get_data():
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]

    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    def parse_release_dates_obs(series_id: str) -> List[tuple]:
        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(obs_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        observations = []
        for observation_tag in soup.find_all('observation'):
            date = observation_tag.get('date')
            value = observation_tag.get('value')
            observations.append((date, value))
        return observations

    econ_dfs = {}

    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]

    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        # p = parse_release_dates_obs(et)
        # df = pd.DataFrame(columns = ['ds',et], data = p)
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
        econ_dfs[et] = df

    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)

    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)

    release_ids = [
        "10",   # "Consumer Price Index"
        "46",   # "Producer Price Index"
        "50",   # "Employment Situation"
        "53",   # "Gross Domestic Product"
        "103",  # "Discount Rate Meeting Minutes"
        "180",  # "Unemployment Insurance Weekly Claims Report"
        "194",  # "ADP National Employment Report"
        "323"   # "Trimmed Mean PCE Inflation Rate"
    ]

    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}

    for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

    # Create a DF that has all release dates, with the release name as a column of 1s.
    # Once merged on the main dataframe, days with econ events will be 1 or NaN; fill NA with 0.
    # This column serves as the true/false indicator of whether economic data was released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')

    vix = yf.Ticker('^VIX')
    spx = yf.Ticker('^GSPC')

    prices_vix = vix.history(start='2018-07-01', interval='1d')
    prices_spx = spx.history(start='2018-07-01', interval='1d')
    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
    prices_spx.index = prices_spx['index']
    prices_spx = prices_spx.drop(columns='index')

    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
    prices_vix.index = prices_vix['index']
    prices_vix = prices_vix.drop(columns='index')

    data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
    data.index = pd.DatetimeIndex(data.index)

    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)  # lag by one day; the shift was missing originally

    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)  # was shift(1), which duplicated _n1
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGap'] = data['CurrentGap'].shift(-1)  # align tomorrow's opening gap with today's row
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.day  # overwritten with the weekday below

    # Regression target -- tomorrow's OHLC4 average relative to today's close
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Classification target -- whether tomorrow will close above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding DF of the release
        data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
        # Create a column that shifts the value in the merged column up by 1
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)

    def cumul_sum(col):
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

    final_row = data.index[-2]

    exp_row = data.index[-1]  # the latest row, excluded from df_final below

    df_final = data.loc[:final_row, [
        'BigNewsDay',
        'Quarter',
        'Perf5Day',
        'Perf5Day_n1',
        'DaysGreen',
        'DaysRed',
        # 'OHLC4_Trend',
        # 'OHLC4_Trend_n1',
        # 'OHLC4_Trend_n2',
        # 'VIX5Day',
        # 'VIX5Day_n1',
        'CurrentGap',
        'RangePct',
        'RangePct_n1',
        'RangePct_n2',
        'OHLC4_VIX',
        'OHLC4_VIX_n1',
        'OHLC4_VIX_n2',
        'Target',
        'Target_clf'
    ]]
    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
    return data, df_final, final_row
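
# Streamlit hookup sketch (an assumption -- this module only defines the
# pieces; the page itself lives in app.py). Window sizes are illustrative.
#
#   data, df_final, final_row = get_data()
#   res, model1, model2 = walk_forward_validation_seq(
#       df_final.dropna(), 'Target_clf', 'Target',
#       num_training_rows=100, num_periods=1)
#   latest = data.loc[[data.index[-1]], model1.feature_names_in_]
#   prob = seq_predict_proba(latest, model1, model2)[0]
#   st.metric('P(close up tomorrow)', f'{prob:.1%}')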