Winston B committed on
Commit
9fe24eb
·
1 Parent(s): d9c4421

Grab rid 101 30m

Browse files
Files changed (1) hide show
  1. model_30m.py +387 -385
model_30m.py CHANGED
@@ -1,386 +1,388 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import json
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from typing import List
10
- import xgboost as xgb
11
- from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
- import os
15
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
- import datetime
17
- from pandas.tseries.offsets import BDay
18
- from datasets import load_dataset
19
-
20
- # If the dataset is gated/private, make sure you have run huggingface-cli login
21
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
22
-
23
- # Create an XGBRegressor model
24
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
25
- model = linear_model.LinearRegression()
26
-
27
- overall_results = []
28
- # Iterate over the rows in the DataFrame, one step at a time
29
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
30
- # Split the data into training and test sets
31
- X_train = df.drop(target_column, axis=1).iloc[:i]
32
- y_train = df[target_column].iloc[:i]
33
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
34
- y_test = df[target_column].iloc[i:i+num_periods]
35
-
36
- # Fit the model to the training data
37
- model.fit(X_train, y_train)
38
-
39
- # Make a prediction on the test data
40
- predictions = model.predict(X_test)
41
-
42
- # Create a DataFrame to store the true and predicted values
43
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
44
-
45
- overall_results.append(result_df)
46
-
47
- df_results = pd.concat(overall_results)
48
- # model.save_model('model_lr.bin')
49
- # Return the true and predicted values, and fitted model
50
- return df_results, model
51
-
52
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
53
-
54
- # Create run the regression model to get its target
55
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
56
- # joblib.dump(model1, 'model1.bin')
57
-
58
- # Merge the result df back on the df for feeding into the classifier
59
- for_merge = res[['Predicted']]
60
- for_merge.columns = ['RegrModelOut']
61
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
62
- df = df.merge(for_merge, left_index=True, right_index=True)
63
- df = df.drop(columns=[target_column_regr])
64
- df = df[[
65
- 'CurrentGap','RegrModelOut','CurrentHigh30toClose',
66
- 'CurrentLow30toClose',
67
- 'CurrentClose30toClose',
68
- 'CurrentRange30',
69
- 'GapFill30', target_column_clf
70
- ]]
71
-
72
- df[target_column_clf] = df[target_column_clf].astype(bool)
73
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
74
-
75
- # Create an XGBRegressor model
76
- model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
77
- # model = linear_model.LogisticRegression(max_iter=1500)
78
-
79
- overall_results = []
80
- # Iterate over the rows in the DataFrame, one step at a time
81
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
82
- # Split the data into training and test sets
83
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
84
- y_train = df[target_column_clf].iloc[:i]
85
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
86
- y_test = df[target_column_clf].iloc[i:i+num_periods]
87
-
88
- # Fit the model to the training data
89
- model2.fit(X_train, y_train)
90
-
91
- # Make a prediction on the test data
92
- predictions = model2.predict_proba(X_test)[:,-1]
93
-
94
- # Create a DataFrame to store the true and predicted values
95
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
96
-
97
- overall_results.append(result_df)
98
-
99
- df_results = pd.concat(overall_results)
100
- return df_results, model1, model2
101
-
102
-
103
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
104
- regr_pred = trained_reg_model.predict(df)
105
- regr_pred = regr_pred > 0
106
- new_df = df.copy()
107
- new_df['RegrModelOut'] = regr_pred
108
- clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut','CurrentHigh30toClose',
109
- 'CurrentLow30toClose',
110
- 'CurrentClose30toClose',
111
- 'CurrentRange30',
112
- 'GapFill30']])[:,-1]
113
- return clf_pred_proba
114
-
115
- def get_data():
116
- # f = open('settings.json')
117
- # j = json.load(f)
118
- # API_KEY_FRED = j["API_KEY_FRED"]
119
-
120
- API_KEY_FRED = os.getenv('API_KEY_FRED')
121
-
122
- def parse_release_dates(release_id: str) -> List[str]:
123
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
124
- r = requests.get(release_dates_url)
125
- text = r.text
126
- soup = BeautifulSoup(text, 'xml')
127
- dates = []
128
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
129
- dates.append(release_date_tag.text)
130
- return dates
131
-
132
- def parse_release_dates_obs(series_id: str) -> List[str]:
133
- obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
134
- r = requests.get(obs_url)
135
- text = r.text
136
- soup = BeautifulSoup(text, 'xml')
137
- observations = []
138
- for observation_tag in soup.find_all('observation'):
139
- date = observation_tag.get('date')
140
- value = observation_tag.get('value')
141
- observations.append((date, value))
142
- return observations
143
-
144
- econ_dfs = {}
145
-
146
- econ_tickers = [
147
- 'WALCL',
148
- 'NFCI',
149
- 'WRESBAL'
150
- ]
151
-
152
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
153
- # p = parse_release_dates_obs(et)
154
- # df = pd.DataFrame(columns = ['ds',et], data = p)
155
- df = pdr.get_data_fred(et)
156
- df.index = df.index.rename('ds')
157
- # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
158
- # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
159
- econ_dfs[et] = df
160
-
161
- # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
162
- # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
163
-
164
- # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
165
- # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
166
-
167
- release_ids = [
168
- "10", # "Consumer Price Index"
169
- "46", # "Producer Price Index"
170
- "50", # "Employment Situation"
171
- "53", # "Gross Domestic Product"
172
- "103", # "Discount Rate Meeting Minutes"
173
- "180", # "Unemployment Insurance Weekly Claims Report"
174
- "194", # "ADP National Employment Report"
175
- "323" # "Trimmed Mean PCE Inflation Rate"
176
- ]
177
-
178
- release_names = [
179
- "CPI",
180
- "PPI",
181
- "NFP",
182
- "GDP",
183
- "FOMC",
184
- "UNEMP",
185
- "ADP",
186
- "PCE"
187
- ]
188
-
189
- releases = {}
190
-
191
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
192
- releases[rid] = {}
193
- releases[rid]['dates'] = parse_release_dates(rid)
194
- releases[rid]['name'] = n
195
-
196
- # Create a DF that has all dates with the name of the col as 1
197
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
198
- # This column serves as the true/false indicator of whether there was economic data released that day.
199
- for rid in tqdm(release_ids, desc='Making indicators'):
200
- releases[rid]['df'] = pd.DataFrame(
201
- index=releases[rid]['dates'],
202
- data={
203
- releases[rid]['name']: 1
204
- })
205
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
206
- # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
207
- # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
208
-
209
- vix = yf.Ticker('^VIX')
210
- spx = yf.Ticker('^GSPC')
211
-
212
- # Pull in data
213
- data = load_dataset("boomsss/SPX_full_30min", split='train')
214
-
215
- rows = [d['text'] for d in data]
216
- rows = [x.split(',') for x in rows]
217
-
218
- fr = pd.DataFrame(columns=[
219
- 'Datetime','Open','High','Low','Close'
220
- ], data = rows)
221
-
222
- fr['Datetime'] = pd.to_datetime(fr['Datetime'])
223
- fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
224
- fr = fr.set_index('Datetime')
225
- fr['Open'] = pd.to_numeric(fr['Open'])
226
- fr['High'] = pd.to_numeric(fr['High'])
227
- fr['Low'] = pd.to_numeric(fr['Low'])
228
- fr['Close'] = pd.to_numeric(fr['Close'])
229
-
230
- # Set index for ready to concat
231
-
232
-
233
- # Get incremental date
234
- last_date = fr.index.date[-1]
235
- last_date = last_date + datetime.timedelta(days=1)
236
- # Get incremental data
237
- spx1 = yf.Ticker('^GSPC')
238
- yfp = spx1.history(start=last_date, interval='30m')
239
- # Concat current and incremental
240
- df_30m = pd.concat([fr, yfp])
241
- # Get the first 30 minute bar
242
- df_30m = df_30m.reset_index()
243
- df_30m['Datetime'] = df_30m['Datetime'].dt.date
244
- df_30m = df_30m.groupby('Datetime').head(1)
245
- df_30m = df_30m.set_index('Datetime',drop=True)
246
- # Rename the columns
247
- df_30m = df_30m[['Open','High','Low','Close']]
248
- df_30m.columns = ['Open30','High30','Low30','Close30']
249
-
250
- prices_vix = vix.history(start='2018-07-01', interval='1d')
251
- prices_spx = spx.history(start='2018-07-01', interval='1d')
252
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
253
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
254
- prices_spx.index = prices_spx['index']
255
- prices_spx = prices_spx.drop(columns='index')
256
- prices_spx.index = pd.DatetimeIndex(prices_spx.index)
257
-
258
-
259
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
260
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
261
- prices_vix.index = prices_vix['index']
262
- prices_vix = prices_vix.drop(columns='index')
263
- prices_vix.index = pd.DatetimeIndex(prices_vix.index)
264
-
265
-
266
- data = prices_spx.merge(df_30m, left_index=True, right_index=True)
267
- data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
268
-
269
- # Features
270
- data['PrevClose'] = data['Close'].shift(1)
271
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
272
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
273
- data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
274
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
275
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
276
-
277
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
278
- data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
279
-
280
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
281
- data['RangePct'] = data['Range'] / data['Close']
282
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
283
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
284
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
285
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
286
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
287
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
288
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
289
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
290
- data['RangePct_n1'] = data['RangePct'].shift(1)
291
- data['RangePct_n2'] = data['RangePct'].shift(2)
292
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
293
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
294
- data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
295
- data['CurrentGap'] = data['CurrentGap'].shift(-1)
296
- data['DayOfWeek'] = pd.to_datetime(data.index)
297
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
298
-
299
- # Intraday features
300
- data['CurrentHigh30'] = data['High30'].shift(-1)
301
- data['CurrentLow30'] = data['Low30'].shift(-1)
302
- data['CurrentClose30'] = data['Close30'].shift(-1)
303
-
304
- # Open to High
305
- data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
306
- data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
307
- data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
308
- data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
309
- data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
310
-
311
- # Target -- the next day's low
312
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
313
- data['Target'] = data['Target'].shift(-1)
314
- # data['Target'] = data['RangePct'].shift(-1)
315
-
316
- # Target for clf -- whether tomorrow will close above or below today's close
317
- data['Target_clf'] = data['Close'] > data['PrevClose']
318
- data['Target_clf'] = data['Target_clf'].shift(-1)
319
- data['DayOfWeek'] = pd.to_datetime(data.index)
320
- data['Quarter'] = data['DayOfWeek'].dt.quarter
321
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
322
-
323
- for rid in tqdm(release_ids, desc='Merging econ data'):
324
- # Get the name of the release
325
- n = releases[rid]['name']
326
- # Merge the corresponding DF of the release
327
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
328
- # Create a column that shifts the value in the merged column up by 1
329
- data[f'{n}_shift'] = data[n].shift(-1)
330
- # Fill the rest with zeroes
331
- data[n] = data[n].fillna(0)
332
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
333
-
334
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
335
-
336
- def cumul_sum(col):
337
- nums = []
338
- s = 0
339
- for x in col:
340
- if x == 1:
341
- s += 1
342
- elif x == 0:
343
- s = 0
344
- nums.append(s)
345
- return nums
346
-
347
- consec_green = cumul_sum(data['GreenDay'].values)
348
- consec_red = cumul_sum(data['RedDay'].values)
349
-
350
- data['DaysGreen'] = consec_green
351
- data['DaysRed'] = consec_red
352
-
353
- final_row = data.index[-2]
354
-
355
- exp_row = data.index[-1]
356
-
357
- df_final = data.loc[:final_row,
358
- [
359
- 'BigNewsDay',
360
- 'Quarter',
361
- 'Perf5Day',
362
- 'Perf5Day_n1',
363
- 'DaysGreen',
364
- 'DaysRed',
365
- 'CurrentHigh30toClose',
366
- 'CurrentLow30toClose',
367
- 'CurrentClose30toClose',
368
- 'CurrentRange30',
369
- 'GapFill30',
370
- # 'OHLC4_Trend',
371
- # 'OHLC4_Trend_n1',
372
- # 'OHLC4_Trend_n2',
373
- # 'VIX5Day',
374
- # 'VIX5Day_n1',
375
- 'CurrentGap',
376
- 'RangePct',
377
- 'RangePct_n1',
378
- 'RangePct_n2',
379
- 'OHLC4_VIX',
380
- 'OHLC4_VIX_n1',
381
- 'OHLC4_VIX_n2',
382
- 'Target',
383
- 'Target_clf'
384
- ]]
385
- df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
 
 
386
  return data, df_final, final_row
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import pandas_datareader as pdr
4
+ import numpy as np
5
+ import yfinance as yf
6
+ import json
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from typing import List
10
+ import xgboost as xgb
11
+ from tqdm import tqdm
12
+ from sklearn import linear_model
13
+ import joblib
14
+ import os
15
+ from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
+ import datetime
17
+ from pandas.tseries.offsets import BDay
18
+ from datasets import load_dataset
19
+
20
+ # If the dataset is gated/private, make sure you have run huggingface-cli login
21
def walk_forward_validation(df, target_column, num_training_rows, num_periods):
    """Walk-forward validate a linear regression on ``df``.

    At each step ``i`` the model is fit on rows ``[:i]`` and evaluated on the
    next ``num_periods`` rows, expanding the training window one row at a time.

    Args:
        df: feature DataFrame that also contains ``target_column``.
        target_column: name of the regression target column in ``df``.
        num_training_rows: minimum number of rows before the first fit.
        num_periods: number of out-of-sample rows predicted per step.

    Returns:
        (df_results, model): a DataFrame with 'True' and 'Predicted' columns
        indexed like the test rows, and the model from the final fit.
    """
    # Create an XGBRegressor model
    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
    model = linear_model.LinearRegression()

    # The feature/target split never changes between folds, so compute it
    # once instead of calling df.drop(...) on every loop iteration.
    features = df.drop(target_column, axis=1)
    targets = df[target_column]

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
        # Split the data into training and test sets
        X_train = features.iloc[:i]
        y_train = targets.iloc[:i]
        X_test = features.iloc[i:i + num_periods]
        y_test = targets.iloc[i:i + num_periods]

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model.predict(X_test)

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model.save_model('model_lr.bin')
    # Return the true and predicted values, and fitted model
    return df_results, model
51
+
52
def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
    """Two-stage walk-forward validation: regression, then classification.

    Stage 1 runs :func:`walk_forward_validation` on the regression target;
    its (binarized) predictions are merged back into ``df`` as the
    ``RegrModelOut`` feature for the stage-2 XGBoost classifier, which is
    itself validated walk-forward.

    Args:
        df: feature DataFrame containing both target columns.
        target_column_clf: boolean classification target column name.
        target_column_regr: regression target column name.
        num_training_rows: minimum number of rows before the first fit.
        num_periods: number of out-of-sample rows predicted per step.

    Returns:
        (df_results, model1, model2): classifier 'True'/'Predicted'
        probabilities per test row, the fitted regression model, and the
        fitted classifier.
    """
    # Stage 1: run the regression model to get its target
    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
    # joblib.dump(model1, 'model1.bin')

    # Merge the result df back on the df for feeding into the classifier.
    # .copy() makes this an independent frame rather than a view of `res`,
    # so the assignments below reliably take effect and don't raise
    # pandas' SettingWithCopyWarning.
    for_merge = res[['Predicted']].copy()
    for_merge.columns = ['RegrModelOut']
    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
    df = df.merge(for_merge, left_index=True, right_index=True)
    df = df.drop(columns=[target_column_regr])
    df = df[[
        'CurrentGap','RegrModelOut','CurrentHigh30toClose',
        'CurrentLow30toClose',
        'CurrentClose30toClose',
        'CurrentRange30',
        'GapFill30', target_column_clf
    ]]

    df[target_column_clf] = df[target_column_clf].astype(bool)
    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)

    # Stage 2: XGBoost classifier
    model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
    # model = linear_model.LogisticRegression(max_iter=1500)

    # Feature/target split is loop-invariant; compute it once.
    features = df.drop(target_column_clf, axis=1)
    targets = df[target_column_clf]

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
        # Split the data into training and test sets
        X_train = features.iloc[:i]
        y_train = targets.iloc[:i]
        X_test = features.iloc[i:i + num_periods]
        y_test = targets.iloc[i:i + num_periods]

        # Fit the model to the training data
        model2.fit(X_train, y_train)

        # Predicted probability of the positive (last) class
        predictions = model2.predict_proba(X_test)[:,-1]

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    return df_results, model1, model2
101
+
102
+
103
def seq_predict_proba(df, trained_reg_model, trained_clf_model):
    """Run the two-stage model sequence on ``df``.

    The regression model's output is binarized (prediction > 0) and added
    as the ``RegrModelOut`` feature; the classifier then scores the
    augmented frame.

    Args:
        df: feature DataFrame accepted by both trained models.
        trained_reg_model: fitted regressor exposing ``predict``.
        trained_clf_model: fitted classifier exposing ``predict_proba``.

    Returns:
        Array of positive-class probabilities, one per row of ``df``.
    """
    # Stage 1: turn the regression output into an up/down flag on a copy,
    # leaving the caller's frame untouched.
    augmented = df.copy()
    augmented['RegrModelOut'] = trained_reg_model.predict(df) > 0

    # Stage 2: the classifier expects exactly this feature subset, in order.
    clf_features = [
        'CurrentGap', 'RegrModelOut', 'CurrentHigh30toClose',
        'CurrentLow30toClose', 'CurrentClose30toClose',
        'CurrentRange30', 'GapFill30',
    ]
    return trained_clf_model.predict_proba(augmented[clf_features])[:, -1]
114
+
115
def get_data():
    """Assemble the daily SPX modeling dataset.

    Pulls FRED economic-release dates (scraped via the FRED API), daily
    ^GSPC and ^VIX history from yfinance, and an archived 30-minute SPX bar
    dataset from Hugging Face (``boomsss/SPX_full_30min``), then engineers
    the feature and target columns used by the models in this module.

    Returns:
        data: the full merged/engineered DataFrame (all rows and columns).
        df_final: model-ready feature/target subset with rows missing
            'Target', 'Target_clf' or 'Perf5Day_n1' dropped.
        final_row: index label of the second-to-last row of ``data`` — the
            last row's forward-shifted targets are not yet observable.

    Note: performs network I/O (FRED, yfinance, Hugging Face) on every call.
    """
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]

    # FRED API key comes from the environment (None if unset).
    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
        # Fetch every release date since 2015 for one FRED release id.
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    def parse_release_dates_obs(series_id: str) -> List[str]:
        # Unused alternative: pull (date, value) observation pairs for a series.
        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(obs_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        observations = []
        for observation_tag in soup.find_all('observation'):
            date = observation_tag.get('date')
            value = observation_tag.get('value')
            observations.append((date, value))
        return observations

    # Daily economic series from FRED. NOTE(review): fetched into econ_dfs
    # but never merged into `data` below — confirm whether still needed.
    econ_dfs = {}

    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]

    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        # p = parse_release_dates_obs(et)
        # df = pd.DataFrame(columns = ['ds',et], data = p)
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
        econ_dfs[et] = df

    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)

    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)

    # FRED release ids, paired positionally with the short names below.
    release_ids = [
        "10", # "Consumer Price Index"
        "46", # "Producer Price Index"
        "50", # "Employment Situation"
        "53", # "Gross Domestic Product"
        "101", # "FOMC press release"
        "103", # "Discount Rate Meeting Minutes"
        "180", # "Unemployment Insurance Weekly Claims Report"
        "194", # "ADP National Employment Report"
        "323" # "Trimmed Mean PCE Inflation Rate"
    ]

    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
        "FOMCPR",
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}

    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

    # Create a DF that has all dates with the name of the col as 1
    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
    # This column serves as the true/false indicator of whether there was economic data released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')

    vix = yf.Ticker('^VIX')
    spx = yf.Ticker('^GSPC')

    # Pull in the archived 30-minute SPX bars (one CSV row per 'text' record).
    data = load_dataset("boomsss/SPX_full_30min", split='train')

    rows = [d['text'] for d in data]
    rows = [x.split(',') for x in rows]

    fr = pd.DataFrame(columns=[
        'Datetime','Open','High','Low','Close'
    ], data = rows)

    # Parse timestamps and coerce the CSV string fields to numerics.
    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
    fr = fr.set_index('Datetime')
    fr['Open'] = pd.to_numeric(fr['Open'])
    fr['High'] = pd.to_numeric(fr['High'])
    fr['Low'] = pd.to_numeric(fr['Low'])
    fr['Close'] = pd.to_numeric(fr['Close'])

    # Set index for ready to concat


    # Get incremental date: first day after the archive ends.
    last_date = fr.index.date[-1]
    last_date = last_date + datetime.timedelta(days=1)
    # Get incremental data from yfinance to bring the archive up to date.
    spx1 = yf.Ticker('^GSPC')
    yfp = spx1.history(start=last_date, interval='30m')
    # Concat current and incremental
    df_30m = pd.concat([fr, yfp])
    # Keep only the first 30-minute bar of each calendar day.
    df_30m = df_30m.reset_index()
    df_30m['Datetime'] = df_30m['Datetime'].dt.date
    df_30m = df_30m.groupby('Datetime').head(1)
    df_30m = df_30m.set_index('Datetime',drop=True)
    # Rename the columns
    df_30m = df_30m[['Open','High','Low','Close']]
    df_30m.columns = ['Open30','High30','Low30','Close30']

    prices_vix = vix.history(start='2018-07-01', interval='1d')
    prices_spx = spx.history(start='2018-07-01', interval='1d')
    # Strip the time/timezone component so both daily frames share a plain
    # date-based DatetimeIndex for merging.
    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
    prices_spx.index = prices_spx['index']
    prices_spx = prices_spx.drop(columns='index')
    prices_spx.index = pd.DatetimeIndex(prices_spx.index)


    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
    prices_vix.index = prices_vix['index']
    prices_vix = prices_vix.drop(columns='index')
    prices_vix.index = pd.DatetimeIndex(prices_vix.index)


    # Inner-merge daily SPX, first-30-minute bars, and VIX (suffix _VIX).
    data = prices_spx.merge(df_30m, left_index=True, right_index=True)
    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])

    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    # NOTE(review): unlike Perf5Day_n1 there is no .shift(1) here, so
    # VIX5Day_n1 equals VIX5Day — confirm whether a lag was intended.
    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)

    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    # NOTE(review): _n2 also uses shift(1) — identical to _n1; possibly
    # meant shift(2). Confirm before relying on this feature.
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    # Next day's open gap vs. today's close (shift(-1) aligns it to today).
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGap'] = data['CurrentGap'].shift(-1)
    # NOTE(review): DayOfWeek is set to .dt.day here but overwritten with
    # .dt.weekday further below — this intermediate value is dead.
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.day

    # Intraday features: next day's first-30-minute bar, aligned to today.
    data['CurrentHigh30'] = data['High30'].shift(-1)
    data['CurrentLow30'] = data['Low30'].shift(-1)
    data['CurrentClose30'] = data['Close30'].shift(-1)

    # First-30-minute levels relative to today's close
    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
    # Did the first 30 minutes trade back through today's close (fill the gap)?
    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]

    # Regression target: next day's OHLC4 relative to its previous close
    # (the original comment said "next day's low", which does not match the code).
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Target for clf -- whether tomorrow will close above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding DF of the release
        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
        # Create a column that shifts the value in the merged column up by 1
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    # 1 if any economic release lands tomorrow, else 0.
    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)

    def cumul_sum(col):
        # Running count of consecutive 1s, reset to 0 whenever a 0 appears.
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

    # Last row whose forward-shifted targets are fully observable.
    final_row = data.index[-2]

    # Most recent (still-incomplete) row; currently unused.
    exp_row = data.index[-1]

    df_final = data.loc[:final_row,
                        [
                            'BigNewsDay',
                            'Quarter',
                            'Perf5Day',
                            'Perf5Day_n1',
                            'DaysGreen',
                            'DaysRed',
                            'CurrentHigh30toClose',
                            'CurrentLow30toClose',
                            'CurrentClose30toClose',
                            'CurrentRange30',
                            'GapFill30',
                            # 'OHLC4_Trend',
                            # 'OHLC4_Trend_n1',
                            # 'OHLC4_Trend_n2',
                            # 'VIX5Day',
                            # 'VIX5Day_n1',
                            'CurrentGap',
                            'RangePct',
                            'RangePct_n1',
                            'RangePct_n2',
                            'OHLC4_VIX',
                            'OHLC4_VIX_n1',
                            'OHLC4_VIX_n2',
                            'Target',
                            'Target_clf'
                        ]]
    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
    return data, df_final, final_row