Winston B committed
Commit 6281bbb · Parent: 9fe24eb

Grab rid 101 day

Files changed (1): model_day.py +324 -322
model_day.py CHANGED
@@ -1,323 +1,325 @@
import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import json
import requests
from bs4 import BeautifulSoup
from typing import List
import xgboost as xgb
from tqdm import tqdm
from sklearn import linear_model
import joblib
import os
from sklearn.metrics import roc_auc_score, precision_score, recall_score
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset

# If the dataset is gated/private, make sure you have run huggingface-cli login
dataset = load_dataset("boomsss/SPX_full_30min", split="train")

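# The loaded split can be materialized as a DataFrame when needed (an
# illustrative sketch; `dataset` is not referenced again in this file):
#   df_30m = dataset.to_pandas()
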
def walk_forward_validation(df, target_column, num_training_rows, num_periods):

    # Create the regression model (XGBRegressor alternative left commented out)
    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state=42)
    model = linear_model.LinearRegression()

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
        # Split the data into training and test sets
        X_train = df.drop(target_column, axis=1).iloc[:i]
        y_train = df[target_column].iloc[:i]
        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
        y_test = df[target_column].iloc[i:i+num_periods]

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model.predict(X_test)

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model.save_model('model_lr.bin')
    # Return the true and predicted values, and fitted model
    return df_results, model

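# A minimal usage sketch (illustrative; the toy frame and column names here
# are hypothetical, not part of model_day.py):
#   toy = pd.DataFrame({'feat': np.random.randn(60),
#                       'Target': np.random.randn(60)})
#   results, fitted = walk_forward_validation(toy, 'Target',
#                                             num_training_rows=40, num_periods=1)
#   # `results` then holds one True/Predicted pair per walked-forward step.
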
def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):

    # Run the regression model to get its target
    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
    # joblib.dump(model1, 'model1.bin')

    # Merge the result df back onto the df for feeding into the classifier
    for_merge = res[['Predicted']]
    for_merge.columns = ['RegrModelOut']
    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
    df = df.merge(for_merge, left_index=True, right_index=True)
    df = df.drop(columns=[target_column_regr])
    df = df[[
        'CurrentGap', 'RegrModelOut', target_column_clf
    ]]

    df[target_column_clf] = df[target_column_clf].astype(bool)
    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)

    # Create an XGBClassifier model
    model2 = xgb.XGBClassifier(n_estimators=10, random_state=42)
    # model = linear_model.LogisticRegression(max_iter=1500)

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
        # Split the data into training and test sets
        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
        y_train = df[target_column_clf].iloc[:i]
        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
        y_test = df[target_column_clf].iloc[i:i+num_periods]

        # Fit the model to the training data
        model2.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model2.predict_proba(X_test)[:,-1]

        # Create a DataFrame to store the true and predicted values
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)

        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model1.save_model('model_ensemble.bin')
    # joblib.dump(model2, 'model2.bin')
    # Return the true and predicted values, and fitted models
    return df_results, model1, model2

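# Usage sketch (illustrative): because the classifier stage subsets to
# ['CurrentGap', 'RegrModelOut', target], the input frame must contain a
# 'CurrentGap' column, e.g. the df_final produced by get_data() below:
#   res, model1, model2 = walk_forward_validation_seq(
#       df_final, 'Target_clf', 'Target', num_training_rows=100, num_periods=1)
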
def seq_predict_proba(df, trained_reg_model, trained_clf_model):
    regr_pred = trained_reg_model.predict(df)
    regr_pred = regr_pred > 0
    new_df = df.copy()
    new_df['RegrModelOut'] = regr_pred
    clf_pred_proba = trained_clf_model.predict_proba(new_df[['CurrentGap','RegrModelOut']])[:,-1]
    return clf_pred_proba

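# Scoring sketch (illustrative): feed the fitted pair the same feature
# columns the regression stage was trained on, e.g. the latest row:
#   latest = df_final.drop(columns=['Target', 'Target_clf']).iloc[[-1]]
#   prob_up = seq_predict_proba(latest, model1, model2)[0]
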
def get_data():
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]

    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    def parse_release_dates_obs(series_id: str) -> List[tuple]:
        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(obs_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        observations = []
        for observation_tag in soup.find_all('observation'):
            date = observation_tag.get('date')
            value = observation_tag.get('value')
            observations.append((date, value))
        return observations

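    # For reference, the release/dates endpoint returns XML shaped roughly
    # like the following (dates abridged and illustrative):
    #   <release_dates>
    #     <release_date release_id="10">2015-01-16</release_date>
    #     ...
    #   </release_dates>
    # which is why parse_release_dates matches 'release_date' tags on their
    # 'release_id' attribute.
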
    econ_dfs = {}

    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]

    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        # p = parse_release_dates_obs(et)
        # df = pd.DataFrame(columns = ['ds',et], data = p)
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
        econ_dfs[et] = df

    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)

    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)

    release_ids = [
        "10",  # "Consumer Price Index"
        "46",  # "Producer Price Index"
        "50",  # "Employment Situation"
        "53",  # "Gross Domestic Product"
+       "101", # "FOMC press release"
        "103", # "Discount Rate Meeting Minutes"
        "180", # "Unemployment Insurance Weekly Claims Report"
        "194", # "ADP National Employment Report"
        "323"  # "Trimmed Mean PCE Inflation Rate"
    ]

    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
+       "FOMCPR",
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}

    for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

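    # The two lists are parallel, so zip(release_ids, release_names) pairs
    # "101" with "FOMCPR" -- the release added in this commit.
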
    # Create a DF that has all dates with the name of the col as 1
    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0.
    # This column serves as the true/false indicator of whether there was economic data released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')

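    # Each releases[rid]['df'] is a one-column indicator frame, e.g. (dates
    # illustrative):
    #                 CPI
    #   2015-01-16      1
    #   2015-02-26      1
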
    vix = yf.Ticker('^VIX')
    spx = yf.Ticker('^GSPC')

    prices_vix = vix.history(start='2018-07-01', interval='1d')
    prices_spx = spx.history(start='2018-07-01', interval='1d')
    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
    prices_spx.index = prices_spx['index']
    prices_spx = prices_spx.drop(columns='index')

    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
    prices_vix.index = prices_vix['index']
    prices_vix = prices_vix.drop(columns='index')

    data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
    data.index = pd.DatetimeIndex(data.index)

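    # After the merge, SPX columns keep their original names and the VIX
    # columns arrive suffixed, e.g. 'Close' (SPX) alongside 'Close_VIX'.
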
    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)

    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGap'] = data['CurrentGap'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.day

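    # Note the shift(-1) on CurrentGap: row t ends up holding
    # (Open[t+1] - Close[t]) / Close[t], the overnight gap into the next
    # session, which is known at that session's open. Illustrative numbers:
    # Close[t] = 4000, Open[t+1] = 4010 -> CurrentGap at row t = 0.0025.
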
    # Target -- the next day's OHLC4 return vs. today's close
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Target for clf -- whether tomorrow will close above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

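    # After these shifts, row t's Target is day t+1's OHLC4 return relative
    # to day t's close, and Target_clf is True when day t+1 closes above day
    # t's close, so both targets line up with the features available on day t.
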
    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding DF of the release
        data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
        # Create a column that shifts the value in the merged column up by 1
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)

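    # Since the '_shift' columns move each indicator up one row, BigNewsDay
    # is 1 on the session immediately *before* any tracked release.
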
    def cumul_sum(col):
        # Count consecutive 1s, resetting to 0 whenever a 0 appears
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

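    # Example (illustrative): cumul_sum([1, 1, 0, 1, 1, 1]) returns
    # [1, 2, 0, 1, 2, 3], so DaysGreen/DaysRed count consecutive green/red
    # closes up to and including each day.
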
    final_row = data.index[-2]

    exp_row = data.index[-1]

    df_final = data.loc[:final_row,
        [
            'BigNewsDay',
            'Quarter',
            'Perf5Day',
            'Perf5Day_n1',
            'DaysGreen',
            'DaysRed',
            # 'OHLC4_Trend',
            # 'OHLC4_Trend_n1',
            # 'OHLC4_Trend_n2',
            # 'VIX5Day',
            # 'VIX5Day_n1',
            'CurrentGap',
            'RangePct',
            'RangePct_n1',
            'RangePct_n2',
            'OHLC4_VIX',
            'OHLC4_VIX_n1',
            'OHLC4_VIX_n2',
            'Target',
            'Target_clf'
        ]]
    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
    return data, df_final, final_row