wnstnb committed on
Commit
6ab628c
·
1 Parent(s): e8fc579

tgs part 4

Browse files
Files changed (2) hide show
  1. app.py +4 -4
  2. model_intra.py +33 -93
app.py CHANGED
@@ -164,9 +164,9 @@ with st.form("choose_model"):
164
 
165
  with st.spinner("Training models..."):
166
  def train_models():
167
- res1, xgbr, seq2 = walk_forward_validation_seq(df_final.dropna(), 'Target_clf', 'Target', 100, 1)
168
- return res1, xgbr, seq2
169
- res1, xgbr, seq2 = train_models()
170
  # st.success("✅ Models trained")
171
 
172
  with st.spinner("Getting new prediction..."):
@@ -212,7 +212,7 @@ with st.form("choose_model"):
212
  new_pred['H2BreakPct'] = new_pred['H2BreakPct'].astype(float)
213
  new_pred['GreenProbas'] = new_pred['GreenProbas'].astype(float)
214
 
215
- seq_proba = seq_predict_proba(new_pred, xgbr, seq2)
216
 
217
  st.info(f'as of {option} on {curr_date} 👇🏽', icon="🔮")
218
 
 
164
 
165
  with st.spinner("Training models..."):
166
  def train_models():
167
+ res1, xgbr = walk_forward_validation(df_final.dropna(), 'Target_clf', 100, 1)
168
+ return res1, xgbr
169
+ res1, xgbr = train_models()
170
  # st.success("✅ Models trained")
171
 
172
  with st.spinner("Getting new prediction..."):
 
212
  new_pred['H2BreakPct'] = new_pred['H2BreakPct'].astype(float)
213
  new_pred['GreenProbas'] = new_pred['GreenProbas'].astype(float)
214
 
215
+ seq_proba = seq_predict_proba(new_pred, xgbr)
216
 
217
  st.info(f'as of {option} on {curr_date} 👇🏽', icon="🔮")
218
 
model_intra.py CHANGED
@@ -3,53 +3,16 @@ import pandas as pd
3
  import pandas_datareader as pdr
4
  import numpy as np
5
  import yfinance as yf
6
- import json
7
  import requests
8
  from bs4 import BeautifulSoup
9
  from typing import List
10
- import xgboost as xgb
11
  from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
  import os
15
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
  import datetime
17
  from pandas.tseries.offsets import BDay
18
  from datasets import load_dataset
19
  import lightgbm as lgb
20
 
21
- # If the dataset is gated/private, make sure you have run huggingface-cli login
22
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
23
-
24
- # Create an XGBRegressor model
25
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
26
- model = linear_model.LinearRegression()
27
-
28
- overall_results = []
29
- # Iterate over the rows in the DataFrame, one step at a time
30
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
31
- # Split the data into training and test sets
32
- X_train = df.drop(target_column, axis=1).iloc[:i]
33
- y_train = df[target_column].iloc[:i]
34
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
35
- y_test = df[target_column].iloc[i:i+num_periods]
36
-
37
- # Fit the model to the training data
38
- model.fit(X_train, y_train)
39
-
40
- # Make a prediction on the test data
41
- predictions = model.predict(X_test)
42
-
43
- # Create a DataFrame to store the true and predicted values
44
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
45
-
46
- overall_results.append(result_df)
47
-
48
- df_results = pd.concat(overall_results)
49
- # model.save_model('model_lr.bin')
50
- # Return the true and predicted values, and fitted model
51
- return df_results, model
52
-
53
  model_cols = [
54
  'BigNewsDay',
55
  'Quarter',
@@ -85,46 +48,32 @@ model_cols = [
85
  # 'GapFillGreenProba'
86
  ]
87
 
88
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
89
-
90
- # Create run the regression model to get its target
91
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
92
- # joblib.dump(model1, 'model1.bin')
93
-
94
- # Merge the result df back on the df for feeding into the classifier
95
- for_merge = res[['Predicted']]
96
- for_merge.columns = ['RegrModelOut']
97
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
98
- df = df.merge(for_merge, left_index=True, right_index=True)
99
- df = df.drop(columns=[target_column_regr])
100
- df = df[model_cols + ['RegrModelOut', target_column_clf]]
101
 
102
- df[target_column_clf] = df[target_column_clf].astype(bool)
103
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
 
 
 
104
 
105
- # Create an XGBRegressor model
106
- # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
107
- model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
108
- # model = linear_model.LogisticRegression(max_iter=1500)
109
-
110
  overall_results = []
111
  # Iterate over the rows in the DataFrame, one step at a time
112
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
113
  # Split the data into training and test sets
114
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
115
- y_train = df[target_column_clf].iloc[:i]
116
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
117
- y_test = df[target_column_clf].iloc[i:i+num_periods]
118
 
119
  # Fit the model to the training data
120
- model2.fit(X_train, y_train)
121
 
122
  # Make a prediction on the test data
123
- predictions = model2.predict_proba(X_test)[:,-1]
124
 
125
  # Create a DataFrame to store the true and predicted values
126
  result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
127
-
128
  overall_results.append(result_df)
129
 
130
  df_results = pd.concat(overall_results)
@@ -134,32 +83,23 @@ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_t
134
  return df.groupby(pd.cut(df[col_name], q))['True'].mean()
135
 
136
  greenprobas = []
137
- meanprobas = []
138
  for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
139
  try:
140
  df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
141
  for q in df_q.index:
142
  if q.left <= pct <= q.right:
143
  p = df_q[q]
144
- c = (q.left + q.right) / 2
145
  except:
146
  p = None
147
- c = None
148
 
149
  greenprobas.append(p)
150
- meanprobas.append(c)
151
 
152
  df_results['CalibPredicted'] = greenprobas
153
 
154
- return df_results, model1, model2
155
-
156
 
157
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
158
- regr_pred = trained_reg_model.predict(df)
159
- regr_pred = regr_pred > 0
160
- new_df = df.copy()
161
- new_df['RegrModelOut'] = regr_pred
162
- clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
163
  return clf_pred_proba
164
 
165
  def get_data(periods_30m = 1):
@@ -298,18 +238,18 @@ def get_data(periods_30m = 1):
298
  # Rename the columns
299
  df_30m = df_30m[['Open','High','Low','Close']]
300
 
301
- opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
302
- highs_1h = df_30m.groupby('Datetime')['High'].max()
303
- lows_1h = df_30m.groupby('Datetime')['Low'].min()
304
- closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
305
 
306
- df_1h = pd.DataFrame(index=df_30m.index.unique())
307
- df_1h['Open'] = opens_1h
308
- df_1h['High'] = highs_1h
309
- df_1h['Low'] = lows_1h
310
- df_1h['Close'] = closes_1h
311
 
312
- df_1h.columns = ['Open30','High30','Low30','Close30']
313
 
314
  prices_vix = vix.history(start='2018-07-01', interval='1d')
315
  prices_spx = spx.history(start='2018-07-01', interval='1d')
@@ -327,7 +267,7 @@ def get_data(periods_30m = 1):
327
  prices_vix.index = pd.DatetimeIndex(prices_vix.index)
328
 
329
 
330
- data = prices_spx.merge(df_1h, left_index=True, right_index=True)
331
  data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
332
 
333
  # Features
@@ -417,10 +357,10 @@ def get_data(periods_30m = 1):
417
  OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
418
  OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
419
  OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
420
- CloseL1 = lambda x: np.where(x['Close'] < x['L1'], 1, 0),
421
- CloseL2 = lambda x: np.where(x['Close'] < x['L2'], 1, 0),
422
- CloseH1 = lambda x: np.where(x['Close'] > x['H1'], 1, 0),
423
- CloseH2 = lambda x: np.where(x['Close'] > x['H2'], 1, 0)
424
  )
425
 
426
  data['OpenL1'] = data['OpenL1'].shift(-1)
@@ -445,7 +385,7 @@ def get_data(periods_30m = 1):
445
 
446
  for col in level_cols:
447
  data[col+'Pct'] = data[col].rolling(100).mean()
448
- data[col+'Pct'] = data[col+'Pct'].shift(-1)
449
 
450
 
451
  def get_quintiles(df, col_name, q):
 
3
  import pandas_datareader as pdr
4
  import numpy as np
5
  import yfinance as yf
 
6
  import requests
7
  from bs4 import BeautifulSoup
8
  from typing import List
 
9
  from tqdm import tqdm
 
 
10
  import os
 
11
  import datetime
12
  from pandas.tseries.offsets import BDay
13
  from datasets import load_dataset
14
  import lightgbm as lgb
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  model_cols = [
17
  'BigNewsDay',
18
  'Quarter',
 
48
  # 'GapFillGreenProba'
49
  ]
50
 
51
+ # If the dataset is gated/private, make sure you have run huggingface-cli login
52
+ def walk_forward_validation(df, target_column, num_training_rows, num_periods):
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ df = df[model_cols + [target_column]]
55
+ df[target_column] = df[target_column].astype(bool)
56
+
57
+ # Model
58
+ model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
59
 
 
 
 
 
 
60
  overall_results = []
61
  # Iterate over the rows in the DataFrame, one step at a time
62
+ for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LGB Model'):
63
  # Split the data into training and test sets
64
+ X_train = df.drop(target_column, axis=1).iloc[:i]
65
+ y_train = df[target_column].iloc[:i]
66
+ X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
67
+ y_test = df[target_column].iloc[i:i+num_periods]
68
 
69
  # Fit the model to the training data
70
+ model.fit(X_train, y_train)
71
 
72
  # Make a prediction on the test data
73
+ predictions = model.predict_proba(X_test)[:,-1]
74
 
75
  # Create a DataFrame to store the true and predicted values
76
  result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
 
77
  overall_results.append(result_df)
78
 
79
  df_results = pd.concat(overall_results)
 
83
  return df.groupby(pd.cut(df[col_name], q))['True'].mean()
84
 
85
  greenprobas = []
 
86
  for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
87
  try:
88
  df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
89
  for q in df_q.index:
90
  if q.left <= pct <= q.right:
91
  p = df_q[q]
 
92
  except:
93
  p = None
 
94
 
95
  greenprobas.append(p)
 
96
 
97
  df_results['CalibPredicted'] = greenprobas
98
 
99
+ return df_results, model
 
100
 
101
+ def seq_predict_proba(df, trained_clf_model):
102
+ clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
 
 
 
 
103
  return clf_pred_proba
104
 
105
  def get_data(periods_30m = 1):
 
238
  # Rename the columns
239
  df_30m = df_30m[['Open','High','Low','Close']]
240
 
241
+ opens_intra = df_30m.groupby('Datetime')['Open'].head(1)
242
+ highs_intra = df_30m.groupby('Datetime')['High'].max()
243
+ lows_intra = df_30m.groupby('Datetime')['Low'].min()
244
+ closes_intra = df_30m.groupby('Datetime')['Close'].tail(1)
245
 
246
+ df_intra = pd.DataFrame(index=df_30m.index.unique())
247
+ df_intra['Open'] = opens_intra
248
+ df_intra['High'] = highs_intra
249
+ df_intra['Low'] = lows_intra
250
+ df_intra['Close'] = closes_intra
251
 
252
+ df_intra.columns = ['Open30','High30','Low30','Close30']
253
 
254
  prices_vix = vix.history(start='2018-07-01', interval='1d')
255
  prices_spx = spx.history(start='2018-07-01', interval='1d')
 
267
  prices_vix.index = pd.DatetimeIndex(prices_vix.index)
268
 
269
 
270
+ data = prices_spx.merge(df_intra, left_index=True, right_index=True)
271
  data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
272
 
273
  # Features
 
357
  OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
358
  OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
359
  OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
360
+ CloseL1 = lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
361
+ CloseL2 = lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
362
+ CloseH1 = lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
363
+ CloseH2 = lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
364
  )
365
 
366
  data['OpenL1'] = data['OpenL1'].shift(-1)
 
385
 
386
  for col in level_cols:
387
  data[col+'Pct'] = data[col].rolling(100).mean()
388
+ # data[col+'Pct'] = data[col+'Pct'].shift(-1)
389
 
390
 
391
  def get_quintiles(df, col_name, q):