Spaces:
Sleeping
Sleeping
tgs part 4
Browse files- app.py +4 -4
- model_intra.py +33 -93
app.py
CHANGED
@@ -164,9 +164,9 @@ with st.form("choose_model"):
|
|
164 |
|
165 |
with st.spinner("Training models..."):
|
166 |
def train_models():
|
167 |
-
res1, xgbr
|
168 |
-
return res1, xgbr
|
169 |
-
res1, xgbr
|
170 |
# st.success("✅ Models trained")
|
171 |
|
172 |
with st.spinner("Getting new prediction..."):
|
@@ -212,7 +212,7 @@ with st.form("choose_model"):
|
|
212 |
new_pred['H2BreakPct'] = new_pred['H2BreakPct'].astype(float)
|
213 |
new_pred['GreenProbas'] = new_pred['GreenProbas'].astype(float)
|
214 |
|
215 |
-
seq_proba = seq_predict_proba(new_pred, xgbr
|
216 |
|
217 |
st.info(f'as of {option} on {curr_date} ๐๐ฝ', icon="๐ฎ")
|
218 |
|
|
|
164 |
|
165 |
with st.spinner("Training models..."):
|
166 |
def train_models():
|
167 |
+
res1, xgbr = walk_forward_validation(df_final.dropna(), 'Target_clf', 100, 1)
|
168 |
+
return res1, xgbr
|
169 |
+
res1, xgbr = train_models()
|
170 |
# st.success("✅ Models trained")
|
171 |
|
172 |
with st.spinner("Getting new prediction..."):
|
|
|
212 |
new_pred['H2BreakPct'] = new_pred['H2BreakPct'].astype(float)
|
213 |
new_pred['GreenProbas'] = new_pred['GreenProbas'].astype(float)
|
214 |
|
215 |
+
seq_proba = seq_predict_proba(new_pred, xgbr)
|
216 |
|
217 |
st.info(f'as of {option} on {curr_date} ๐๐ฝ', icon="๐ฎ")
|
218 |
|
model_intra.py
CHANGED
@@ -3,53 +3,16 @@ import pandas as pd
|
|
3 |
import pandas_datareader as pdr
|
4 |
import numpy as np
|
5 |
import yfinance as yf
|
6 |
-
import json
|
7 |
import requests
|
8 |
from bs4 import BeautifulSoup
|
9 |
from typing import List
|
10 |
-
import xgboost as xgb
|
11 |
from tqdm import tqdm
|
12 |
-
from sklearn import linear_model
|
13 |
-
import joblib
|
14 |
import os
|
15 |
-
from sklearn.metrics import roc_auc_score, precision_score, recall_score
|
16 |
import datetime
|
17 |
from pandas.tseries.offsets import BDay
|
18 |
from datasets import load_dataset
|
19 |
import lightgbm as lgb
|
20 |
|
21 |
-
# If the dataset is gated/private, make sure you have run huggingface-cli login
|
22 |
-
def walk_forward_validation(df, target_column, num_training_rows, num_periods):
|
23 |
-
|
24 |
-
# Create an XGBRegressor model
|
25 |
-
# model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
|
26 |
-
model = linear_model.LinearRegression()
|
27 |
-
|
28 |
-
overall_results = []
|
29 |
-
# Iterate over the rows in the DataFrame, one step at a time
|
30 |
-
for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
|
31 |
-
# Split the data into training and test sets
|
32 |
-
X_train = df.drop(target_column, axis=1).iloc[:i]
|
33 |
-
y_train = df[target_column].iloc[:i]
|
34 |
-
X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
|
35 |
-
y_test = df[target_column].iloc[i:i+num_periods]
|
36 |
-
|
37 |
-
# Fit the model to the training data
|
38 |
-
model.fit(X_train, y_train)
|
39 |
-
|
40 |
-
# Make a prediction on the test data
|
41 |
-
predictions = model.predict(X_test)
|
42 |
-
|
43 |
-
# Create a DataFrame to store the true and predicted values
|
44 |
-
result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
|
45 |
-
|
46 |
-
overall_results.append(result_df)
|
47 |
-
|
48 |
-
df_results = pd.concat(overall_results)
|
49 |
-
# model.save_model('model_lr.bin')
|
50 |
-
# Return the true and predicted values, and fitted model
|
51 |
-
return df_results, model
|
52 |
-
|
53 |
model_cols = [
|
54 |
'BigNewsDay',
|
55 |
'Quarter',
|
@@ -85,46 +48,32 @@ model_cols = [
|
|
85 |
# 'GapFillGreenProba'
|
86 |
]
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
# Create run the regression model to get its target
|
91 |
-
res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
|
92 |
-
# joblib.dump(model1, 'model1.bin')
|
93 |
-
|
94 |
-
# Merge the result df back on the df for feeding into the classifier
|
95 |
-
for_merge = res[['Predicted']]
|
96 |
-
for_merge.columns = ['RegrModelOut']
|
97 |
-
for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
|
98 |
-
df = df.merge(for_merge, left_index=True, right_index=True)
|
99 |
-
df = df.drop(columns=[target_column_regr])
|
100 |
-
df = df[model_cols + ['RegrModelOut', target_column_clf]]
|
101 |
|
102 |
-
df
|
103 |
-
df[
|
|
|
|
|
|
|
104 |
|
105 |
-
# Create an XGBRegressor model
|
106 |
-
# model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
|
107 |
-
model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
|
108 |
-
# model = linear_model.LogisticRegression(max_iter=1500)
|
109 |
-
|
110 |
overall_results = []
|
111 |
# Iterate over the rows in the DataFrame, one step at a time
|
112 |
-
for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'
|
113 |
# Split the data into training and test sets
|
114 |
-
X_train = df.drop(
|
115 |
-
y_train = df[
|
116 |
-
X_test = df.drop(
|
117 |
-
y_test = df[
|
118 |
|
119 |
# Fit the model to the training data
|
120 |
-
|
121 |
|
122 |
# Make a prediction on the test data
|
123 |
-
predictions =
|
124 |
|
125 |
# Create a DataFrame to store the true and predicted values
|
126 |
result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
|
127 |
-
|
128 |
overall_results.append(result_df)
|
129 |
|
130 |
df_results = pd.concat(overall_results)
|
@@ -134,32 +83,23 @@ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_t
|
|
134 |
return df.groupby(pd.cut(df[col_name], q))['True'].mean()
|
135 |
|
136 |
greenprobas = []
|
137 |
-
meanprobas = []
|
138 |
for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
|
139 |
try:
|
140 |
df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
|
141 |
for q in df_q.index:
|
142 |
if q.left <= pct <= q.right:
|
143 |
p = df_q[q]
|
144 |
-
c = (q.left + q.right) / 2
|
145 |
except:
|
146 |
p = None
|
147 |
-
c = None
|
148 |
|
149 |
greenprobas.append(p)
|
150 |
-
meanprobas.append(c)
|
151 |
|
152 |
df_results['CalibPredicted'] = greenprobas
|
153 |
|
154 |
-
return df_results,
|
155 |
-
|
156 |
|
157 |
-
def seq_predict_proba(df,
|
158 |
-
|
159 |
-
regr_pred = regr_pred > 0
|
160 |
-
new_df = df.copy()
|
161 |
-
new_df['RegrModelOut'] = regr_pred
|
162 |
-
clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
|
163 |
return clf_pred_proba
|
164 |
|
165 |
def get_data(periods_30m = 1):
|
@@ -298,18 +238,18 @@ def get_data(periods_30m = 1):
|
|
298 |
# Rename the columns
|
299 |
df_30m = df_30m[['Open','High','Low','Close']]
|
300 |
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
|
312 |
-
|
313 |
|
314 |
prices_vix = vix.history(start='2018-07-01', interval='1d')
|
315 |
prices_spx = spx.history(start='2018-07-01', interval='1d')
|
@@ -327,7 +267,7 @@ def get_data(periods_30m = 1):
|
|
327 |
prices_vix.index = pd.DatetimeIndex(prices_vix.index)
|
328 |
|
329 |
|
330 |
-
data = prices_spx.merge(
|
331 |
data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
|
332 |
|
333 |
# Features
|
@@ -417,10 +357,10 @@ def get_data(periods_30m = 1):
|
|
417 |
OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
|
418 |
OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
|
419 |
OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
|
420 |
-
CloseL1 = lambda x: np.where(x['
|
421 |
-
CloseL2 = lambda x: np.where(x['
|
422 |
-
CloseH1 = lambda x: np.where(x['
|
423 |
-
CloseH2 = lambda x: np.where(x['
|
424 |
)
|
425 |
|
426 |
data['OpenL1'] = data['OpenL1'].shift(-1)
|
@@ -445,7 +385,7 @@ def get_data(periods_30m = 1):
|
|
445 |
|
446 |
for col in level_cols:
|
447 |
data[col+'Pct'] = data[col].rolling(100).mean()
|
448 |
-
data[col+'Pct'] = data[col+'Pct'].shift(-1)
|
449 |
|
450 |
|
451 |
def get_quintiles(df, col_name, q):
|
|
|
3 |
import pandas_datareader as pdr
|
4 |
import numpy as np
|
5 |
import yfinance as yf
|
|
|
6 |
import requests
|
7 |
from bs4 import BeautifulSoup
|
8 |
from typing import List
|
|
|
9 |
from tqdm import tqdm
|
|
|
|
|
10 |
import os
|
|
|
11 |
import datetime
|
12 |
from pandas.tseries.offsets import BDay
|
13 |
from datasets import load_dataset
|
14 |
import lightgbm as lgb
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
model_cols = [
|
17 |
'BigNewsDay',
|
18 |
'Quarter',
|
|
|
48 |
# 'GapFillGreenProba'
|
49 |
]
|
50 |
|
51 |
+
# If the dataset is gated/private, make sure you have run huggingface-cli login
|
52 |
+
def walk_forward_validation(df, target_column, num_training_rows, num_periods):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
+
df = df[model_cols + [target_column]]
|
55 |
+
df[target_column] = df[target_column].astype(bool)
|
56 |
+
|
57 |
+
# Model
|
58 |
+
model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
|
59 |
|
|
|
|
|
|
|
|
|
|
|
60 |
overall_results = []
|
61 |
# Iterate over the rows in the DataFrame, one step at a time
|
62 |
+
for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LGB Model'):
|
63 |
# Split the data into training and test sets
|
64 |
+
X_train = df.drop(target_column, axis=1).iloc[:i]
|
65 |
+
y_train = df[target_column].iloc[:i]
|
66 |
+
X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
|
67 |
+
y_test = df[target_column].iloc[i:i+num_periods]
|
68 |
|
69 |
# Fit the model to the training data
|
70 |
+
model.fit(X_train, y_train)
|
71 |
|
72 |
# Make a prediction on the test data
|
73 |
+
predictions = model.predict_proba(X_test)[:,-1]
|
74 |
|
75 |
# Create a DataFrame to store the true and predicted values
|
76 |
result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
|
|
|
77 |
overall_results.append(result_df)
|
78 |
|
79 |
df_results = pd.concat(overall_results)
|
|
|
83 |
return df.groupby(pd.cut(df[col_name], q))['True'].mean()
|
84 |
|
85 |
greenprobas = []
|
|
|
86 |
for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
|
87 |
try:
|
88 |
df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
|
89 |
for q in df_q.index:
|
90 |
if q.left <= pct <= q.right:
|
91 |
p = df_q[q]
|
|
|
92 |
except:
|
93 |
p = None
|
|
|
94 |
|
95 |
greenprobas.append(p)
|
|
|
96 |
|
97 |
df_results['CalibPredicted'] = greenprobas
|
98 |
|
99 |
+
return df_results, model
|
|
|
100 |
|
101 |
+
def seq_predict_proba(df, trained_clf_model):
|
102 |
+
clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
|
|
|
|
|
|
|
|
|
103 |
return clf_pred_proba
|
104 |
|
105 |
def get_data(periods_30m = 1):
|
|
|
238 |
# Rename the columns
|
239 |
df_30m = df_30m[['Open','High','Low','Close']]
|
240 |
|
241 |
+
opens_intra = df_30m.groupby('Datetime')['Open'].head(1)
|
242 |
+
highs_intra = df_30m.groupby('Datetime')['High'].max()
|
243 |
+
lows_intra = df_30m.groupby('Datetime')['Low'].min()
|
244 |
+
closes_intra = df_30m.groupby('Datetime')['Close'].tail(1)
|
245 |
|
246 |
+
df_intra = pd.DataFrame(index=df_30m.index.unique())
|
247 |
+
df_intra['Open'] = opens_intra
|
248 |
+
df_intra['High'] = highs_intra
|
249 |
+
df_intra['Low'] = lows_intra
|
250 |
+
df_intra['Close'] = closes_intra
|
251 |
|
252 |
+
df_intra.columns = ['Open30','High30','Low30','Close30']
|
253 |
|
254 |
prices_vix = vix.history(start='2018-07-01', interval='1d')
|
255 |
prices_spx = spx.history(start='2018-07-01', interval='1d')
|
|
|
267 |
prices_vix.index = pd.DatetimeIndex(prices_vix.index)
|
268 |
|
269 |
|
270 |
+
data = prices_spx.merge(df_intra, left_index=True, right_index=True)
|
271 |
data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
|
272 |
|
273 |
# Features
|
|
|
357 |
OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
|
358 |
OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
|
359 |
OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
|
360 |
+
CloseL1 = lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
|
361 |
+
CloseL2 = lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
|
362 |
+
CloseH1 = lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
|
363 |
+
CloseH2 = lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
|
364 |
)
|
365 |
|
366 |
data['OpenL1'] = data['OpenL1'].shift(-1)
|
|
|
385 |
|
386 |
for col in level_cols:
|
387 |
data[col+'Pct'] = data[col].rolling(100).mean()
|
388 |
+
# data[col+'Pct'] = data[col+'Pct'].shift(-1)
|
389 |
|
390 |
|
391 |
def get_quintiles(df, col_name, q):
|