Winston B committed on
Commit
0921e55
·
1 Parent(s): 9a914c0

Create model_90m.py

Browse files
Files changed (1) hide show
  1. model_90m.py +399 -0
model_90m.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import pandas_datareader as pdr
4
+ import numpy as np
5
+ import yfinance as yf
6
+ import json
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from typing import List
10
+ import xgboost as xgb
11
+ from tqdm import tqdm
12
+ from sklearn import linear_model
13
+ import joblib
14
+ import os
15
+ from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
+ import datetime
17
+ from pandas.tseries.offsets import BDay
18
+ from datasets import load_dataset
19
+
20
def walk_forward_validation(df, target_column, num_training_rows, num_periods):
    """Expanding-window walk-forward validation with a linear regression.

    At each step the model is trained on all rows before index ``i`` and
    predicts the next ``num_periods`` rows, so every prediction is
    out-of-sample.

    Parameters
    ----------
    df : pd.DataFrame
        Feature matrix that also contains ``target_column``.
    target_column : str
        Name of the regression target column in ``df``.
    num_training_rows : int
        Number of rows in the first (smallest) training window.
    num_periods : int
        Number of rows predicted at each step.

    Returns
    -------
    (pd.DataFrame, model)
        Concatenated 'True'/'Predicted' results indexed like the test rows,
        and the model as fitted on the final training window.
    """
    # Create the model
    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
    model = linear_model.LinearRegression()

    # Hoist the loop-invariant feature/target split out of the loop;
    # the original recomputed df.drop(...) twice per iteration.
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
        # Split the data into training and test sets
        X_train = features.iloc[:i]
        y_train = target.iloc[:i]
        X_test = features.iloc[i:i + num_periods]
        y_test = target.iloc[i:i + num_periods]

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make a prediction on the test data
        predictions = model.predict(X_test)

        # Store the true and predicted values side by side
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model.save_model('model_lr.bin')
    # Return the out-of-sample results and the last fitted model
    return df_results, model
50
+
51
def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
    """Sequential (stacked) walk-forward validation: regression, then classifier.

    Stage 1 runs ``walk_forward_validation`` on the regression target; the
    *sign* of its out-of-sample predictions becomes a boolean feature
    ('RegrModelOut') for stage 2, an XGBoost classifier validated on the
    same expanding-window scheme.

    Parameters
    ----------
    df : pd.DataFrame
        Features plus both target columns.
    target_column_clf : str
        Boolean classification target column.
    target_column_regr : str
        Regression target column consumed by stage 1.
    num_training_rows : int
        Number of rows in the first training window.
    num_periods : int
        Rows predicted at each step.

    Returns
    -------
    (pd.DataFrame, regr_model, clf_model)
        Out-of-sample 'True'/'Predicted' (positive-class probability)
        results, plus both fitted models.
    """
    # Stage 1: run the regression model to get its target
    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
    # joblib.dump(model1, 'model1.bin')

    # Merge the result df back on the df for feeding into the classifier.
    # .copy() avoids mutating a slice of `res` in place, which raises
    # pandas' SettingWithCopyWarning and can silently fail to write.
    for_merge = res[['Predicted']].copy()
    for_merge.columns = ['RegrModelOut']
    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
    df = df.merge(for_merge, left_index=True, right_index=True)
    df = df.drop(columns=[target_column_regr])
    df = df[[
        'CurrentGap','RegrModelOut',
        'CurrentHigh30toClose',
        'CurrentLow30toClose',
        'CurrentClose30toClose',
        'CurrentRange30',
        'GapFill30',target_column_clf
    ]]

    df[target_column_clf] = df[target_column_clf].astype(bool)
    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)

    # Stage 2: XGBoost classifier
    model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
    # model = linear_model.LogisticRegression(max_iter=1500)

    # Hoist the loop-invariant feature/target split out of the loop.
    features = df.drop(target_column_clf, axis=1)
    target = df[target_column_clf]

    overall_results = []
    # Iterate over the rows in the DataFrame, one step at a time
    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
        # Split the data into training and test sets
        X_train = features.iloc[:i]
        y_train = target.iloc[:i]
        X_test = features.iloc[i:i+num_periods]
        y_test = target.iloc[i:i+num_periods]

        # Fit the model to the training data
        model2.fit(X_train, y_train)

        # Positive-class probability on the test data
        predictions = model2.predict_proba(X_test)[:,-1]

        # Store the true and predicted values side by side
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
        overall_results.append(result_df)

    df_results = pd.concat(overall_results)
    # model1.save_model('model_ensemble.bin')
    # joblib.dump(model2, 'model2.bin')
    # Return the out-of-sample results and both fitted models
    return df_results, model1, model2
104
+
105
def seq_predict_proba(df, trained_reg_model, trained_clf_model):
    """Two-stage inference: regression sign feeds the classifier.

    The regression model's prediction on ``df`` is reduced to its sign and
    appended as the boolean 'RegrModelOut' feature; the classifier then
    scores the fixed feature subset and the positive-class probability is
    returned.

    Parameters
    ----------
    df : pd.DataFrame
        Feature rows; must contain the classifier's input columns.
    trained_reg_model : fitted regressor with a ``predict`` method.
    trained_clf_model : fitted classifier with a ``predict_proba`` method.

    Returns
    -------
    np.ndarray
        Positive-class probabilities, one per row of ``df``.
    """
    clf_features = [
        'CurrentGap','RegrModelOut',
        'CurrentHigh30toClose',
        'CurrentLow30toClose',
        'CurrentClose30toClose',
        'CurrentRange30',
        'GapFill30',
    ]
    augmented = df.copy()
    # Only the sign of the regression output matters to the classifier.
    augmented['RegrModelOut'] = trained_reg_model.predict(df) > 0
    proba = trained_clf_model.predict_proba(augmented[clf_features])
    # Last column is the positive class.
    return proba[:, -1]
117
+
118
def get_data():
    """Build the modeling dataset from FRED, yfinance, and a HuggingFace dump.

    Pipeline (all network-dependent):
      1. Pull econ series and release dates from FRED (API key from env).
      2. Pull 30-minute SPX bars from the 'boomsss/SPX_full_30min' dataset,
         append the latest bars from yfinance, and aggregate the first three
         30-minute bars of each day into 'Open30/High30/Low30/Close30'.
      3. Pull daily SPX and VIX history, merge everything on the date index,
         and derive the feature and target columns.

    Returns
    -------
    (data, df_final, final_row)
        `data` is the full feature DataFrame, `df_final` is the model-ready
        column subset up to `final_row` (the second-to-last date; the last
        date has incomplete forward-looking targets).
    """
    # f = open('settings.json')
    # j = json.load(f)
    # API_KEY_FRED = j["API_KEY_FRED"]

    # NOTE(review): no fallback if the env var is unset — FRED requests
    # below would then be sent with api_key=None and fail.
    API_KEY_FRED = os.getenv('API_KEY_FRED')

    def parse_release_dates(release_id: str) -> List[str]:
        # Fetch all release dates for one FRED release id (XML response).
        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(release_dates_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        dates = []
        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
            dates.append(release_date_tag.text)
        return dates

    def parse_release_dates_obs(series_id: str) -> List[str]:
        # Fetch (date, value) observation tuples for one FRED series.
        # Currently unused below (superseded by pdr.get_data_fred).
        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
        r = requests.get(obs_url)
        text = r.text
        soup = BeautifulSoup(text, 'xml')
        observations = []
        for observation_tag in soup.find_all('observation'):
            date = observation_tag.get('date')
            value = observation_tag.get('value')
            observations.append((date, value))
        return observations

    econ_dfs = {}

    # Fed balance sheet, financial conditions index, reserve balances.
    econ_tickers = [
        'WALCL',
        'NFCI',
        'WRESBAL'
    ]

    # NOTE(review): econ_dfs is populated but never read again in this
    # function — presumably kept for future use; confirm before removing.
    for et in tqdm(econ_tickers, desc='getting econ tickers'):
        # p = parse_release_dates_obs(et)
        # df = pd.DataFrame(columns = ['ds',et], data = p)
        df = pdr.get_data_fred(et)
        df.index = df.index.rename('ds')
        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
        econ_dfs[et] = df

    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)

    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)

    release_ids = [
        "10", # "Consumer Price Index"
        "46", # "Producer Price Index"
        "50", # "Employment Situation"
        "53", # "Gross Domestic Product"
        "103", # "Discount Rate Meeting Minutes"
        "180", # "Unemployment Insurance Weekly Claims Report"
        "194", # "ADP National Employment Report"
        "323" # "Trimmed Mean PCE Inflation Rate"
    ]

    # Short names, positionally paired with release_ids above.
    release_names = [
        "CPI",
        "PPI",
        "NFP",
        "GDP",
        "FOMC",
        "UNEMP",
        "ADP",
        "PCE"
    ]

    releases = {}

    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
        releases[rid] = {}
        releases[rid]['dates'] = parse_release_dates(rid)
        releases[rid]['name'] = n

    # Create a DF that has all dates with the name of the col as 1
    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
    # This column serves as the true/false indicator of whether there was economic data released that day.
    for rid in tqdm(release_ids, desc='Making indicators'):
        releases[rid]['df'] = pd.DataFrame(
            index=releases[rid]['dates'],
            data={
                releases[rid]['name']: 1
            })
        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')

    vix = yf.Ticker('^VIX')
    spx = yf.Ticker('^GSPC')

    # Pull in data: historical 30-minute SPX bars stored as CSV-style text
    # rows in a HuggingFace dataset.
    data = load_dataset("boomsss/SPX_full_30min", split='train')

    rows = [d['text'] for d in data]
    rows = [x.split(',') for x in rows]

    fr = pd.DataFrame(columns=[
        'Datetime','Open','High','Low','Close'
    ], data = rows)

    # Text rows -> typed columns; timestamps are exchange-local (New York).
    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
    fr = fr.set_index('Datetime')
    fr['Open'] = pd.to_numeric(fr['Open'])
    fr['High'] = pd.to_numeric(fr['High'])
    fr['Low'] = pd.to_numeric(fr['Low'])
    fr['Close'] = pd.to_numeric(fr['Close'])

    # Get incremental date: day after the last stored bar.
    last_date = fr.index.date[-1]
    last_date = last_date + datetime.timedelta(days=1)
    # Get incremental data.
    # NOTE(review): interval is '90m' while the stored data is 30-minute
    # bars — confirm this mismatch is intentional (head(3) below would then
    # cover different wall-clock spans for old vs. new rows).
    spx1 = yf.Ticker('^GSPC')
    yfp = spx1.history(start=last_date, interval='90m')
    # Concat current and incremental
    df_30m = pd.concat([fr, yfp])
    # Keep only the first three bars of each calendar day.
    df_30m = df_30m.reset_index()
    df_30m['Datetime'] = df_30m['Datetime'].dt.date
    df_30m = df_30m.groupby('Datetime').head(3)
    df_30m = df_30m.set_index('Datetime',drop=True)
    # Keep only the OHLC columns
    df_30m = df_30m[['Open','High','Low','Close']]

    # Aggregate the kept bars into one "first 30/90 minutes" candle per day.
    opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
    closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
    highs_1h = df_30m.groupby('Datetime')['High'].max()
    lows_1h = df_30m.groupby('Datetime')['Low'].min()

    df_1h = pd.DataFrame(index=df_30m.index.unique())
    df_1h['Open'] = opens_1h
    df_1h['Close'] = closes_1h
    df_1h['High'] = highs_1h
    df_1h['Low'] = lows_1h

    df_1h.columns = ['Open30','High30','Low30','Close30']

    # Daily SPX/VIX history; strip the time component and timezone so the
    # indices align with the date-keyed intraday frame.
    prices_vix = vix.history(start='2018-07-01', interval='1d')
    prices_spx = spx.history(start='2018-07-01', interval='1d')
    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
    prices_spx.index = prices_spx['index']
    prices_spx = prices_spx.drop(columns='index')
    prices_spx.index = pd.DatetimeIndex(prices_spx.index)

    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
    prices_vix.index = prices_vix['index']
    prices_vix = prices_vix.drop(columns='index')
    prices_vix.index = pd.DatetimeIndex(prices_vix.index)

    # Inner-merge daily SPX with the intraday aggregates, then with VIX OHLC.
    data = prices_spx.merge(df_1h, left_index=True, right_index=True)
    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])

    # Features
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1

    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    # NOTE(review): despite the '_n1' suffix this is NOT shifted — it is just
    # VIX5Day cast to bool. Compare Perf5Day_n1 above; confirm intent.
    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)

    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
    # NOTE(review): '_n2' uses shift(1), same as '_n1' — looks like it should
    # be shift(2); confirm before relying on this column.
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    # Gap of the NEXT session relative to today's close (shift(-1) aligns
    # tomorrow's open gap onto today's row).
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGap'] = data['CurrentGap'].shift(-1)
    # NOTE(review): this assignment is dead — DayOfWeek is reassigned below
    # (dt.day here is day-of-month, not weekday).
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['DayOfWeek'] = data['DayOfWeek'].dt.day

    # Intraday features: next session's first-bars aggregates, aligned onto
    # today's row via shift(-1).
    data['CurrentHigh30'] = data['High30'].shift(-1)
    data['CurrentLow30'] = data['Low30'].shift(-1)
    data['CurrentClose30'] = data['Close30'].shift(-1)

    # Next-session early-bar levels expressed relative to today's close.
    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
    # Gap-fill flag: since CurrentHigh30/Low30/Gap are all next-session
    # values, today's Close IS that session's previous close. Gap up filled
    # if the early low touches it; gap down filled if the early high does.
    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]

    # Regression target: next day's OHLC4 return vs. its previous close.
    # (The '-- the next day's low' wording predates the current formula.)
    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
    data['Target'] = data['Target'].shift(-1)
    # data['Target'] = data['RangePct'].shift(-1)

    # Target for clf -- whether tomorrow will close above or below today's close
    data['Target_clf'] = data['Close'] > data['PrevClose']
    data['Target_clf'] = data['Target_clf'].shift(-1)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    for rid in tqdm(release_ids, desc='Merging econ data'):
        # Get the name of the release
        n = releases[rid]['name']
        # Merge the corresponding DF of the release
        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
        # Create a column that shifts the value in the merged column up by 1
        # (i.e. flags days where this release comes out TOMORROW)
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)

    # 1 if ANY tracked release is due the next day, else 0.
    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)

    def cumul_sum(col):
        # Run length of consecutive 1s; resets to 0 on a 0.
        # (Values other than 0/1 leave the counter unchanged.)
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    consec_green = cumul_sum(data['GreenDay'].values)
    consec_red = cumul_sum(data['RedDay'].values)

    data['DaysGreen'] = consec_green
    data['DaysRed'] = consec_red

    # Last fully-labeled row: the final row's forward-shifted targets are NaN.
    final_row = data.index[-2]

    # exp_row (the as-yet-unlabeled latest row) is computed but unused here.
    exp_row = data.index[-1]

    # Model-ready subset of columns, truncated to fully-labeled rows.
    df_final = data.loc[:final_row,
        [
            'BigNewsDay',
            'Quarter',
            'Perf5Day',
            'Perf5Day_n1',
            'DaysGreen',
            'DaysRed',
            'CurrentHigh30toClose',
            'CurrentLow30toClose',
            'CurrentClose30toClose',
            'CurrentRange30',
            'GapFill30',
            # 'OHLC4_Trend',
            # 'OHLC4_Trend_n1',
            # 'OHLC4_Trend_n2',
            # 'VIX5Day',
            # 'VIX5Day_n1',
            'CurrentGap',
            'RangePct',
            'RangePct_n1',
            'RangePct_n2',
            'OHLC4_VIX',
            'OHLC4_VIX_n1',
            'OHLC4_VIX_n2',
            'Target',
            'Target_clf'
        ]]
    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
    return data, df_final, final_row