Winston B committed
Commit d9c4421 · 1 Parent(s): bed31fe

Grab rid 101 1h model

Files changed (1)
  1. model_1h.py +400 -398
model_1h.py CHANGED

import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import json
import requests
from bs4 import BeautifulSoup
from typing import List
import xgboost as xgb
from tqdm import tqdm
from sklearn import linear_model
import joblib
import os
from sklearn.metrics import roc_auc_score, precision_score, recall_score
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset

def walk_forward_validation(df, target_column, num_training_rows, num_periods):

    # Linear regression model (the XGBRegressor alternative is kept for reference)
    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state=42)
    model = linear_model.LinearRegression()

    overall_results = []
    # Expanding-window walk-forward: at each step, train on all rows up to i,
    # then predict the next num_periods rows
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
        # Split the data into training and test sets
        X_train = df.drop(target_column, axis=1).iloc[:i]
        y_train = df[target_column].iloc[:i]
        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
        y_test = df[target_column].iloc[i:i+num_periods]

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model.predict(X_test)

        # Store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model.save_model('model_lr.bin')
    # Return the true and predicted values, and the fitted model
    return df_results, model

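
# Illustrative usage of walk_forward_validation on synthetic data (a sketch:
# the frame, column names, and parameter values below are assumptions for
# demonstration, not part of the original app):
def _demo_walk_forward_validation():
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({'feat1': rng.normal(size=300), 'feat2': rng.normal(size=300)})
    demo['Target'] = 0.5 * demo['feat1'] - 0.2 * demo['feat2'] + rng.normal(scale=0.1, size=300)
    # Expanding window: first fit uses 100 rows, each step predicts 1 row ahead
    results, fitted = walk_forward_validation(demo, 'Target', num_training_rows=100, num_periods=1)
    print(results.tail())
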
def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):

    # First run the regression model to generate its predictions
    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
    # joblib.dump(model1, 'model1.bin')

    # Merge the regression output back onto df for feeding into the classifier
    for_merge = res[['Predicted']].copy()
    for_merge.columns = ['RegrModelOut']
    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
    df = df.merge(for_merge, left_index=True, right_index=True)
    df = df.drop(columns=[target_column_regr])
    df = df[[
        'CurrentGap', 'RegrModelOut',
        'CurrentHigh30toClose',
        'CurrentLow30toClose',
        'CurrentClose30toClose',
        'CurrentRange30',
        'GapFill30', target_column_clf
    ]]

    df[target_column_clf] = df[target_column_clf].astype(bool)
    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)

    # XGBoost classifier (the logistic-regression alternative is kept for reference)
    model2 = xgb.XGBClassifier(n_estimators=10, random_state=42)
    # model = linear_model.LogisticRegression(max_iter=1500)

    overall_results = []
    # Same expanding-window walk-forward as above, now for the classifier
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
        # Split the data into training and test sets
        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
        y_train = df[target_column_clf].iloc[:i]
        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
        y_test = df[target_column_clf].iloc[i:i+num_periods]

        # Fit the model to the training data
        model2.fit(X_train, y_train)

        # Predicted probability of the positive class
        predictions = model2.predict_proba(X_test)[:, -1]

        # Store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model1.save_model('model_ensemble.bin')
    # joblib.dump(model2, 'model2.bin')
    # Return the true and predicted values, and both fitted models
    return df_results, model1, model2

def seq_predict_proba(df, trained_reg_model, trained_clf_model):
    # Chain the two models at inference time: the regression output, thresholded
    # at zero, becomes the RegrModelOut input feature of the classifier
    regr_pred = trained_reg_model.predict(df)
    regr_pred = regr_pred > 0
    new_df = df.copy()
    new_df['RegrModelOut'] = regr_pred
    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap', 'RegrModelOut',
                                                             'CurrentHigh30toClose',
                                                             'CurrentLow30toClose',
                                                             'CurrentClose30toClose',
                                                             'CurrentRange30',
                                                             'GapFill30']])[:, -1]
    return clf_pred_proba

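
# Example of scoring fresh rows with the chained models (a sketch: `feats` is a
# hypothetical DataFrame holding the same feature columns used in training,
# and model1/model2 come from walk_forward_validation_seq above):
#   proba = seq_predict_proba(feats, model1, model2)
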
def get_data():
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]

    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    def parse_release_dates_obs(series_id: str) -> List[tuple]:
        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(obs_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        observations = []
        for observation_tag in soup.find_all('observation'):
            date = observation_tag.get('date')
            value = observation_tag.get('value')
            observations.append((date, value))
        return observations

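
    # For reference, the release/dates endpoint returns XML along these lines
    # (an assumed, abbreviated example, not captured from a live response):
    #   <release_dates>
    #     <release_date release_id="10">2023-01-12</release_date>
    #     <release_date release_id="10">2023-02-14</release_date>
    #   </release_dates>
    # parse_release_dates collects the text of each <release_date> tag.
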
    econ_dfs = {}

    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]

    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        # p = parse_release_dates_obs(et)
        # df = pd.DataFrame(columns = ['ds',et], data = p)
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
        econ_dfs[et] = df

    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)

    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)

    release_ids = [
        "10",   # "Consumer Price Index"
        "46",   # "Producer Price Index"
        "50",   # "Employment Situation"
        "53",   # "Gross Domestic Product"
        "101",  # "FOMC press release" (added in this commit)
        "103",  # "Discount Rate Meeting Minutes"
        "180",  # "Unemployment Insurance Weekly Claims Report"
        "194",  # "ADP National Employment Report"
        "323"   # "Trimmed Mean PCE Inflation Rate"
    ]

    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
        "FOMCPR",  # (added in this commit)
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}

    for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

    # Create a DF per release that holds all its dates, with the release name as a column of 1s.
    # Once merged onto the main dataframe, days with econ events will be 1 or NaN; fill NA with 0.
    # This column serves as the true/false indicator of whether economic data was released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')

    vix = yf.Ticker('^VIX')
    spx = yf.Ticker('^GSPC')

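
    # Each releases[rid]['df'] is a one-column indicator frame, e.g. (hypothetical):
    #                 CPI
    #     2023-01-12    1
    #     2023-02-14    1
    # A left merge onto the daily price index then flags release days with 1.
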
    # Pull in the stored 30-minute SPX history
    data = load_dataset("boomsss/SPX_full_30min", split='train')

    rows = [d['text'] for d in data]
    rows = [x.split(',') for x in rows]

    fr = pd.DataFrame(columns=[
        'Datetime', 'Open', 'High', 'Low', 'Close'
    ], data=rows)

    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
    fr = fr.set_index('Datetime')
    fr['Open'] = pd.to_numeric(fr['Open'])
    fr['High'] = pd.to_numeric(fr['High'])
    fr['Low'] = pd.to_numeric(fr['Low'])
    fr['Close'] = pd.to_numeric(fr['Close'])

    # Start the incremental pull the day after the stored history ends
    last_date = fr.index.date[-1]
    last_date = last_date + datetime.timedelta(days=1)
    # Get incremental data (note: yfinance serves 60-minute bars here, so the
    # head(2) below spans two hours on incremental days versus one hour on the
    # stored 30-minute history)
    spx1 = yf.Ticker('^GSPC')
    yfp = spx1.history(start=last_date, interval='60m')
    # Concat stored and incremental history
    df_30m = pd.concat([fr, yfp])
    # Keep the first two bars of each day
    df_30m = df_30m.reset_index()
    df_30m['Datetime'] = df_30m['Datetime'].dt.date
    df_30m = df_30m.groupby('Datetime').head(2)
    df_30m = df_30m.set_index('Datetime', drop=True)
    # Keep only the OHLC columns
    df_30m = df_30m[['Open', 'High', 'Low', 'Close']]

    # First-hour OHLC: open of the first kept bar, close of the last, and the
    # max high / min low across the kept bars
    opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
    closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
    highs_1h = df_30m.groupby('Datetime')['High'].max()
    lows_1h = df_30m.groupby('Datetime')['Low'].min()

    # Assemble in OHLC order so the positional rename below maps correctly
    df_1h = pd.DataFrame(index=df_30m.index.unique())
    df_1h['Open'] = opens_1h
    df_1h['High'] = highs_1h
    df_1h['Low'] = lows_1h
    df_1h['Close'] = closes_1h

    df_1h.columns = ['Open30', 'High30', 'Low30', 'Close30']

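
    # Equivalent one-pass aggregation (a sketch for clarity; not what runs above):
    # df_1h = df_30m.groupby('Datetime').agg(
    #     Open30=('Open', 'first'), High30=('High', 'max'),
    #     Low30=('Low', 'min'), Close30=('Close', 'last'))
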
    prices_vix = vix.history(start='2018-07-01', interval='1d')
    prices_spx = spx.history(start='2018-07-01', interval='1d')

    # Strip the time and timezone from the daily indexes so they align with df_1h
    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
    prices_spx.index = prices_spx['index']
    prices_spx = prices_spx.drop(columns='index')
    prices_spx.index = pd.DatetimeIndex(prices_spx.index)

    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
    prices_vix.index = prices_vix['index']
    prices_vix = prices_vix.drop(columns='index')
    prices_vix.index = pd.DatetimeIndex(prices_vix.index)

    data = prices_spx.merge(df_1h, left_index=True, right_index=True)
    data = data.merge(prices_vix[['Open', 'High', 'Low', 'Close']], left_index=True, right_index=True, suffixes=['', '_VIX'])

    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)

    data['Range'] = data[['Open', 'High']].max(axis=1) - data[['Low', 'Open']].min(axis=1)  # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX', 'High_VIX', 'Low_VIX', 'Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open', 'High', 'Low', 'Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    # Shifted up one row, so each row carries the NEXT day's opening gap
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGap'] = data['CurrentGap'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.day  # overwritten with the true weekday below

    # Intraday features: shifted up one row, so each row carries the NEXT day's first hour
    data['CurrentHigh30'] = data['High30'].shift(-1)
    data['CurrentLow30'] = data['Low30'].shift(-1)
    data['CurrentClose30'] = data['Close30'].shift(-1)

    # Next day's first-hour high/low/close relative to today's close
    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
    # Gap filled in the first hour: an up-gap fills if the first-hour low reaches
    # the prior close; a down-gap fills if the first-hour high reaches it
    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]

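
    # Worked example with hypothetical numbers: prior close 4000, next open 4008
    # (CurrentGap > 0). If the first hour's low touches 4000 or below, GapFill30
    # is True; if the low only reaches 4003, the up-gap is unfilled, so False.
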
    # Target for regression -- next day's OHLC4 return relative to today's close
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Target for clf -- whether tomorrow closes above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding indicator DF for the release
        data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
        # Shift the indicator up one row so each row flags a release on the NEXT day
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    # 1 if any tracked release lands on the next day, else 0
    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)

    def cumul_sum(col):
        # Count consecutive 1s, resetting on 0, e.g. [1, 1, 0, 1] -> [1, 2, 0, 1]
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

    # The second-to-last row is the last one whose targets are fully known; the
    # most recent row (exp_row) is left for live scoring
    final_row = data.index[-2]

    exp_row = data.index[-1]

    df_final = data.loc[:final_row,
        [
            'BigNewsDay',
            'Quarter',
            'Perf5Day',
            'Perf5Day_n1',
            'DaysGreen',
            'DaysRed',
            'CurrentHigh30toClose',
            'CurrentLow30toClose',
            'CurrentClose30toClose',
            'CurrentRange30',
            'GapFill30',
            # 'OHLC4_Trend',
            # 'OHLC4_Trend_n1',
            # 'OHLC4_Trend_n2',
            # 'VIX5Day',
            # 'VIX5Day_n1',
            'CurrentGap',
            'RangePct',
            'RangePct_n1',
            'RangePct_n2',
            'OHLC4_VIX',
            'OHLC4_VIX_n1',
            'OHLC4_VIX_n2',
            'Target',
            'Target_clf'
        ]]
    df_final = df_final.dropna(subset=['Target', 'Target_clf', 'Perf5Day_n1'])
    return data, df_final, final_row
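

# Illustrative end-to-end sketch (assumed usage; the parameter values are
# guesses and the Streamlit app may wire things differently):
if __name__ == '__main__':
    data, df_final, final_row = get_data()
    res, model1, model2 = walk_forward_validation_seq(
        df_final, 'Target_clf', 'Target', num_training_rows=100, num_periods=1)
    # Out-of-sample quality of the walk-forward classifier predictions
    print('ROC AUC:', roc_auc_score(res['True'], res['Predicted']))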