wnstnb committed on
Commit b35faa8 · 1 Parent(s): dea9584

the great simplification part 1
Files changed (6):
  1. app.py +71 -171
  2. model_1h.py +22 -40
  3. model_30m.py +45 -40
  4. model_90m.py +22 -40
  5. model_intra.py +518 -0
  6. troubleshoot_day_model.ipynb +0 -0
app.py CHANGED
@@ -119,34 +119,6 @@ with st.form("choose_model"):
 
 with st.spinner("Getting new prediction..."):
 
-    model_cols = [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct'
-    ]
-
     # Get last row
     new_pred = data.loc[final_row, model_cols]
 
@@ -187,12 +159,12 @@ with st.form("choose_model"):
 elif option == '07:00':
     # run30 = st.button('🏃🏽‍♂️ Run')
     # if run30:
-    from model_30m import *
+    from model_intra import *
 
     fname='performance_for_30m_model.csv'
 
     with st.spinner('Loading data...'):
-        data, df_final, final_row = get_data()
+        data, df_final, final_row = get_data(1)
         # st.success("✅ Historical data")
 
     with st.spinner("Training models..."):
@@ -204,40 +176,6 @@ with st.form("choose_model"):
 
 with st.spinner("Getting new prediction..."):
 
-    model_cols = [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentHigh30toClose',
-        'CurrentLow30toClose',
-        'CurrentClose30toClose',
-        'CurrentRange30',
-        'GapFill30',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'GreenProbas'
-    ]
-
     # Get last row
     new_pred = data.loc[final_row, model_cols]
 
@@ -284,12 +222,12 @@ with st.form("choose_model"):
 elif option == '07:30':
     # run60 = st.button('🏃🏽‍♂️ Run')
     # if run60:
-    from model_1h import *
+    from model_intra import *
 
     fname='performance_for_1h_model.csv'
 
     with st.spinner('Loading data...'):
-        data, df_final, final_row = get_data()
+        data, df_final, final_row = get_data(2)
         # st.success("✅ Historical data")
 
     with st.spinner("Training models..."):
@@ -301,40 +239,6 @@ with st.form("choose_model"):
 
 with st.spinner("Getting new prediction..."):
 
-    model_cols = [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentHigh30toClose',
-        'CurrentLow30toClose',
-        'CurrentClose30toClose',
-        'CurrentRange30',
-        'GapFill30',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'GreenProbas'
-    ]
-
     # Get last row
     new_pred = data.loc[final_row, model_cols]
 
@@ -381,12 +285,12 @@ with st.form("choose_model"):
 elif option == '08:00':
     # run60 = st.button('🏃🏽‍♂️ Run')
     # if run60:
-    from model_90m import *
+    from model_intra import *
 
     fname='performance_for_90m_model.csv'
 
     with st.spinner('Loading data...'):
-        data, df_final, final_row = get_data()
+        data, df_final, final_row = get_data(3)
         # st.success("✅ Historical data")
 
     with st.spinner("Training models..."):
@@ -398,40 +302,6 @@ with st.form("choose_model"):
 
 with st.spinner("Getting new prediction..."):
 
-    model_cols = [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentHigh30toClose',
-        'CurrentLow30toClose',
-        'CurrentClose30toClose',
-        'CurrentRange30',
-        'GapFill30',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'GreenProbas'
-    ]
-
     # Get last row
     new_pred = data.loc[final_row, model_cols]
 
@@ -477,9 +347,26 @@ with st.form("choose_model"):
 
 st.info(f'as of {option} on {curr_date} 👇🏽', icon="🔮")
 
+# st.subheader('New Prediction')
+
+# int_labels = ['(0, .20]', '(.20, .40]', '(.40, .60]', '(.60, .80]', '(.80, .1]']
+# df_probas = res1.groupby(pd.qcut(res1['Predicted'],5)).agg({'True':[np.mean,len,np.sum]})
+_q = 7
+lo_thres = 0.4 # res1.loc[middle_quantiles, 'Predicted'].min()
+hi_thres = 0.6 # res1.loc[middle_quantiles, 'Predicted'].max()
+
+data['ClosePct'] = (data['Close'] / data['PrevClose']) - 1
+data['ClosePct'] = data['ClosePct'].shift(-1)
+res1 = res1.merge(data['ClosePct'], left_index=True,right_index=True)
+# df_probas = res1.groupby(pd.cut(res1['Predicted'], bins = [-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf], labels = int_labels)).agg({'True':[np.mean,len,np.sum],'ClosePct':[np.mean]})
+df_probas = res1.groupby(pd.cut(res1['Predicted'], _q)).agg({'True':[np.mean,len,np.sum],'ClosePct':[np.mean]})
+
+df_probas.columns = ['PctGreen','NumObs','NumGreen','AvgPerf']
+df_probas['AvgPerf'] = df_probas['AvgPerf'].apply(lambda x: f'{x:.2%}')
+
 green_proba = seq_proba[0]
 red_proba = 1 - green_proba
-do_not_play = (seq_proba[0] > 0.4) and (seq_proba[0] <= 0.6)
+do_not_play = (seq_proba[0] > lo_thres) and (seq_proba[0] <= hi_thres)
 stdev = 0.01
 score = None
 num_obs = None
@@ -487,14 +374,23 @@ with st.form("choose_model"):
 historical_proba = None
 text_cond = None
 operator = None
+intv = None
+for q in df_probas.index:
+    if q.left <= green_proba <= q.right:
+        historical_proba = df_probas.loc[q, 'PctGreen']
+        num_obs = df_probas.loc[q, 'NumObs']
+        intv = f'({q.left:.03f}, {q.right:.03f}])'
+
+qs = [f'({q.left:.2f}, {q.right:.2f}]' for q in df_probas.index]
+df_probas.index = qs
 
 if do_not_play:
     text_cond = '🟨'
     operator = ''
     score = seq_proba[0]
-    cond = (res1['Predicted'] > 0.4) & (res1['Predicted'] <= 0.6)
-    num_obs = len(res1.loc[cond])
-    historical_proba = res1.loc[cond, 'True'].mean()
+    cond = (res1['Predicted'] > lo_thres) & (res1['Predicted'] <= hi_thres)
+    # num_obs = len(res1.loc[cond])
+    # historical_proba = res1.loc[cond, 'True'].mean()
 
 
 elif green_proba > red_proba:
@@ -504,9 +400,9 @@ with st.form("choose_model"):
     score = green_proba
     # How many with this score?
    cond = (res1['Predicted'] >= green_proba)
-    num_obs = len(res1.loc[cond])
+    # num_obs = len(res1.loc[cond])
     # How often green?
-    historical_proba = res1.loc[cond, 'True'].mean()
+    # historical_proba = res1.loc[cond, 'True'].mean()
     # print(cond)
 
 elif green_proba <= red_proba:
@@ -516,9 +412,9 @@ with st.form("choose_model"):
     score = red_proba
     # How many with this score?
     cond = (res1['Predicted'] <= seq_proba[0])
-    num_obs = len(res1.loc[cond])
+    # num_obs = len(res1.loc[cond])
     # How often green?
-    historical_proba = 1 - res1.loc[cond, 'True'].mean()
+    # historical_proba = 1 - res1.loc[cond, 'True'].mean()
     # print(cond)
 
 score_fmt = f'{score:.1%}'
@@ -536,8 +432,8 @@ with st.form("choose_model"):
 index=['Results'],
 data = {
     'Confidence':[f'{text_cond} {score:.1%}'],
-    'Success':[f'{historical_proba:.1%}'],
-    f'N{operator}{score:.1%}':[f'{num_obs} / {len(data)}'],
+    'Calib. Proba':[f'{historical_proba:.1%}'],
+    f'{intv}':[f'{num_obs}'],
     'Prev / Curr':[f'{prev_close:.2f} / {curr_close:.2f}']
 })
 
@@ -551,26 +447,16 @@ with st.form("choose_model"):
 
 prices.columns = ['']
 
-# st.subheader('New Prediction')
-
-int_labels = ['(0, .20]', '(.20, .40]', '(.40, .60]', '(.60, .80]', '(.80, .1]']
-# df_probas = res1.groupby(pd.qcut(res1['Predicted'],5)).agg({'True':[np.mean,len,np.sum]})
-
-data['ClosePct'] = (data['Close'] / data['PrevClose']) - 1
-data['ClosePct'] = data['ClosePct'].shift(-1)
-res1 = res1.merge(data['ClosePct'], left_index=True,right_index=True)
-df_probas = res1.groupby(pd.cut(res1['Predicted'], bins = [-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf], labels = int_labels)).agg({'True':[np.mean,len,np.sum],'ClosePct':[np.mean]})
-df_probas.columns = ['PctGreen','NumObs','NumGreen','AvgPerf']
-df_probas['AvgPerf'] = df_probas['AvgPerf'].apply(lambda x: f'{x:.2%}')
-
 roc_auc_score_all = roc_auc_score(res1['True'].astype(int), res1['Predicted'].values)
+roc_auc_score_calib = roc_auc_score(res1.dropna(subset='CalibGreenProba')['True'].astype(int), res1.dropna(subset='CalibGreenProba')['CalibGreenProba'].values)
 precision_score_all = precision_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
 recall_score_all = recall_score(res1['True'].astype(int), res1['Predicted'] > 0.5)
 len_all = len(res1)
 
-res2_filtered = res1.loc[(res1['Predicted'] > 0.6) | (res1['Predicted'] <= 0.4)]
+res2_filtered = res1.loc[(res1['Predicted'] > hi_thres) | (res1['Predicted'] <= lo_thres)]
 
 roc_auc_score_hi = roc_auc_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'].values)
+roc_auc_score_hi_calib = roc_auc_score(res2_filtered.dropna(subset='CalibGreenProba')['True'].astype(int), res2_filtered.dropna(subset='CalibGreenProba')['CalibGreenProba'].values)
 precision_score_hi = precision_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
 recall_score_hi = recall_score(res2_filtered['True'].astype(int), res2_filtered['Predicted'] > 0.5)
 len_hi = len(res2_filtered)
@@ -579,6 +465,7 @@ with st.form("choose_model"):
 index=[
     'N',
     'ROC AUC',
+    'Calib. AUC',
     'Precision',
     'Recall'
 ],
@@ -589,31 +476,44 @@ with st.form("choose_model"):
 data = [
     [len_all, len_hi],
     [roc_auc_score_all, roc_auc_score_hi],
+    [roc_auc_score_calib, roc_auc_score_hi_calib],
     [precision_score_all, precision_score_hi],
     [recall_score_all, recall_score_hi]
 ]
 ).round(2)
 
 def get_acc(t, p):
-    if t == False and p <= 0.4:
-        return '✅'
-    elif t == True and p > 0.6:
-        return '✅'
-    elif t == False and p > 0.6:
-        return '❌'
-    elif t == True and p <= 0.4:
+    if t == False and p <= lo_thres:
+        return '✅' # &#9989;</p>
+    elif t == True and p > hi_thres:
+        return '✅' #
+    elif t == False and p > hi_thres:
+        return '❌' # &#10060;</p>
+    elif t == True and p <= lo_thres:
         return '❌'
     else:
-        return '🟨'
+        return '🟨' # &#11036;</p>
 
+def get_acc_html(t, p):
+    if t == False and p <= lo_thres:
+        return '&#9989;'
+    elif t == True and p > hi_thres:
+        return '&#9989;'
+    elif t == False and p > hi_thres:
+        return '&#10060;'
+    elif t == True and p <= lo_thres:
+        return '&#10060;'
+    else:
+        return '&#11036;'
+
 def get_acc_text(t, p):
-    if t == False and p <= 0.4:
+    if t == False and p <= lo_thres:
        return 'Correct'
-    elif t == True and p > 0.6:
+    elif t == True and p > hi_thres:
        return 'Correct'
-    elif t == False and p > 0.6:
+    elif t == False and p > hi_thres:
        return 'Incorrect'
-    elif t == True and p <= 0.4:
+    elif t == True and p <= lo_thres:
        return 'Incorrect'
     else:
        return 'No Action'
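Note on the app.py changes above: the old ad-hoc counts ("how many past predictions scored at least this high") are replaced by a lookup into a binned calibration table. pd.cut splits the historical Predicted scores into _q equal-width bins, and the live probability is matched to its bin to report that bin's historical green rate and observation count. A minimal self-contained sketch of that lookup, on synthetic data (res1, df_probas, and the column names mirror the diff; everything else is illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Synthetic walk-forward results: a predicted green probability and the true outcome.
res1 = pd.DataFrame({'Predicted': rng.uniform(0, 1, 500)})
res1['True'] = rng.uniform(0, 1, 500) < res1['Predicted']

# Equal-width bins over the predicted score, as in the diff (_q = 7).
_q = 7
df_probas = res1.groupby(pd.cut(res1['Predicted'], _q)).agg({'True': [np.mean, len, np.sum]})
df_probas.columns = ['PctGreen', 'NumObs', 'NumGreen']

# Look up the bin containing today's live probability.
green_proba = 0.63
for q in df_probas.index:  # IntervalIndex produced by pd.cut
    if q.left <= green_proba <= q.right:
        historical_proba = df_probas.loc[q, 'PctGreen']  # calibrated green rate
        num_obs = df_probas.loc[q, 'NumObs']             # support behind that rate
        print(f'({q.left:.2f}, {q.right:.2f}]: {historical_proba:.1%} green over {num_obs} days')

This is why the results table now shows 'Calib. Proba' and an interval-labeled observation count instead of the old 'Success' and 'N≥score' columns.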
model_1h.py CHANGED
@@ -80,7 +80,9 @@ model_cols = [
 'L2BreakPct',
 'H1BreakPct',
 'H2BreakPct',
-'GreenProbas'
+'GreenProbas',
+# 'GapFillGreenProba'
+
 ]
 
 def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
@@ -334,6 +336,7 @@ def get_data():
 data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
 data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
 data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+data['CurrentGapHist'] = data['CurrentGap'].copy()
 data['CurrentGap'] = data['CurrentGap'].shift(-1)
 data['DayOfWeek'] = pd.to_datetime(data.index)
 data['DayOfWeek'] = data['DayOfWeek'].dt.day
@@ -388,6 +391,7 @@ def get_data():
 data['CurrentHigh30'] = data['High30'].shift(-1)
 data['CurrentLow30'] = data['Low30'].shift(-1)
 data['CurrentClose30'] = data['Close30'].shift(-1)
+data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
 
 # Open to High
 data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
@@ -408,13 +412,13 @@ def get_data():
 data['Quarter'] = data['DayOfWeek'].dt.quarter
 data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
 
-def get_quintiles(df):
-    return df.groupby(pd.qcut(df['CurrentClose30toClose'], 5))['GreenDay'].mean()
+def get_quintiles(df, col_name, q):
+    return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
 
 probas = []
 for i, pct in enumerate(data['CurrentClose30toClose']):
     try:
-        df_q = get_quintiles(data.iloc[:i])
+        df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
         for q in df_q.index:
             if q.left <= pct <= q.right:
                 p = df_q[q]
@@ -423,7 +427,20 @@ def get_data():
 
     probas.append(p)
 
+# gapfills = []
+# for i, pct in enumerate(data['CurrentGap']):
+#     try:
+#         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
+#         for q in df_q.index:
+#             if q.left <= pct <= q.right:
+#                 p = df_q[q]
+#     except:
+#         p = None
+
+#     gapfills.append(p)
+
 data['GreenProbas'] = probas
+# data['GapFillGreenProba'] = gapfills
 
 for rid in tqdm(release_ids, desc='Merging econ data'):
     # Get the name of the release
@@ -459,41 +476,6 @@ def get_data():
 
 exp_row = data.index[-1]
 
-df_final = data.loc[:final_row,
-    [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentHigh30toClose',
-        'CurrentLow30toClose',
-        'CurrentClose30toClose',
-        'CurrentRange30',
-        'GapFill30',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'GreenProbas',
-        'Target',
-        'Target_clf'
-    ]]
+df_final = data.loc[:final_row, model_cols + ['Target','Target_clf']]
 df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
 return data, df_final, final_row
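The generalized get_quintiles(df, col_name, q) helper above computes, over an expanding history, the share of green days within each quantile bucket of a feature; GreenProbas is the bucket rate that today's value falls into. A toy illustration of one step of that expanding loop (synthetic data; only get_quintiles and the column names come from the diff):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame({
    'HistClose30toPrevClose': rng.normal(0, 0.005, 300),  # first-30m move vs prior close
    'GreenDay': rng.integers(0, 2, 300),                  # 1 if the day closed green
})

def get_quintiles(df, col_name, q):
    # Mean green rate within each of q equal-frequency buckets of col_name.
    return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()

# One step of the walk-forward loop: history = rows [0, i), lookup = row i.
i = 250
df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
pct = data['HistClose30toPrevClose'].iloc[i]
for q in df_q.index:  # IntervalIndex from pd.qcut
    if q.left <= pct <= q.right:
        print(f'{pct:.4f} falls in {q}: historical green rate {df_q[q]:.1%}')

Because only data.iloc[:i] enters the bucketing, the feature stays point-in-time: no future rows leak into day i's probability.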
model_30m.py CHANGED
@@ -81,7 +81,8 @@ model_cols = [
 'L2BreakPct',
 'H1BreakPct',
 'H2BreakPct',
-'GreenProbas'
+'GreenProbas',
+# 'GapFillGreenProba'
 ]
 
 def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
@@ -127,6 +128,30 @@ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_t
 overall_results.append(result_df)
 
 df_results = pd.concat(overall_results)
+
+# Calibrate Probabilities
+def get_quantiles(df, col_name, q):
+    return df.groupby(pd.qcut(df[col_name], q))['True'].mean()
+
+greenprobas = []
+meanprobas = []
+for i, pct in enumerate(df_results['Predicted']):
+    try:
+        df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+        for q in df_q.index:
+            if q.left <= pct <= q.right:
+                p = df_q[q]
+                c = (q.left + q.right) / 2
+    except:
+        p = None
+        c = None
+
+    greenprobas.append(p)
+    meanprobas.append(c)
+
+df_results['CalibPredicted'] = meanprobas
+df_results['CalibGreenProba'] = greenprobas
+
 return df_results, model1, model2
 
 
@@ -320,6 +345,7 @@ def get_data():
 data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
 data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
 data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+data['CurrentGapHist'] = data['CurrentGap'].copy()
 data['CurrentGap'] = data['CurrentGap'].shift(-1)
 data['DayOfWeek'] = pd.to_datetime(data.index)
 data['DayOfWeek'] = data['DayOfWeek'].dt.day
@@ -329,6 +355,7 @@ def get_data():
 data['CurrentHigh30'] = data['High30'].shift(-1)
 data['CurrentLow30'] = data['Low30'].shift(-1)
 data['CurrentClose30'] = data['Close30'].shift(-1)
+data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
 
 
 # Open to High
@@ -410,13 +437,13 @@ def get_data():
 data[col+'Pct'] = data[col+'Pct'].shift(-1)
 
 
-def get_quintiles(df):
-    return df.groupby(pd.qcut(df['CurrentClose30toClose'], 5))['GreenDay'].mean()
+def get_quintiles(df, col_name, q):
+    return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
 
 probas = []
 for i, pct in enumerate(data['CurrentClose30toClose']):
     try:
-        df_q = get_quintiles(data.iloc[:i])
+        df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
         for q in df_q.index:
             if q.left <= pct <= q.right:
                 p = df_q[q]
@@ -425,7 +452,20 @@ def get_data():
 
 probas.append(p)
 
+# gapfills = []
+# for i, pct in enumerate(data['CurrentGap']):
+#     try:
+#         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
+#         for q in df_q.index:
+#             if q.left <= pct <= q.right:
+#                 p = df_q[q]
+#     except:
+#         p = None
+
+#     gapfills.append(p)
+
 data['GreenProbas'] = probas
+# data['GapFillGreenProba'] = gapfills
 
 for rid in tqdm(release_ids, desc='Merging econ data'):
     # Get the name of the release
@@ -461,41 +501,6 @@ def get_data():
 
 exp_row = data.index[-1]
 
-df_final = data.loc[:final_row,
-    [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentHigh30toClose',
-        'CurrentLow30toClose',
-        'CurrentClose30toClose',
-        'CurrentRange30',
-        'GapFill30',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'GreenProbas',
-        'Target',
-        'Target_clf'
-    ]]
+df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
 df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
 return data, df_final, final_row
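The new "Calibrate Probabilities" block applies the same expanding-window idea to the classifier's own outputs: each raw Predicted score is mapped to the historical hit rate of its score bucket (CalibGreenProba) and to the bucket midpoint (CalibPredicted). Note that the two copies in this commit differ slightly: the version here buckets with pd.qcut (equal-frequency), while the model_intra.py version below uses pd.cut (equal-width). A compact sketch of the loop on synthetic scores (names mirror the diff; the data is illustrative, and the stale-value carry-over on a bucket miss matches the original's behavior):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df_results = pd.DataFrame({'Predicted': rng.uniform(0.2, 0.8, 400)})
df_results['True'] = rng.uniform(0, 1, 400) < df_results['Predicted']

def get_quantiles(df, col_name, q):
    return df.groupby(pd.qcut(df[col_name], q))['True'].mean()

greenprobas, meanprobas = [], []
p, c = None, None
for i, pct in enumerate(df_results['Predicted']):
    try:
        # Bucket only the scores seen before row i, then find row i's bucket.
        df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
        for q in df_q.index:
            if q.left <= pct <= q.right:
                p = df_q[q]                 # historical green rate of this bucket
                c = (q.left + q.right) / 2  # bucket midpoint
    except ValueError:
        p, c = None, None  # not enough history yet to form 7 buckets
    greenprobas.append(p)
    meanprobas.append(c)

df_results['CalibPredicted'] = meanprobas
df_results['CalibGreenProba'] = greenprobas

These two columns are what app.py's new 'Calib. Proba' cell and 'Calib. AUC' row read from.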
model_90m.py CHANGED
@@ -80,7 +80,8 @@ model_cols = [
 'L2BreakPct',
 'H1BreakPct',
 'H2BreakPct',
-'GreenProbas'
+'GreenProbas',
+# 'GapFillGreenProba'
 ]
 
 def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
@@ -334,6 +335,7 @@ def get_data():
 data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
 data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
 data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+data['CurrentGapHist'] = data['CurrentGap'].copy()
 data['CurrentGap'] = data['CurrentGap'].shift(-1)
 data['DayOfWeek'] = pd.to_datetime(data.index)
 data['DayOfWeek'] = data['DayOfWeek'].dt.day
@@ -342,6 +344,8 @@ def get_data():
 data['CurrentHigh30'] = data['High30'].shift(-1)
 data['CurrentLow30'] = data['Low30'].shift(-1)
 data['CurrentClose30'] = data['Close30'].shift(-1)
+data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
 
 # Open to High
 data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
@@ -408,13 +412,13 @@ def get_data():
 data[col+'Pct'] = data[col].rolling(100).mean()
 data[col+'Pct'] = data[col+'Pct'].shift(-1)
 
-def get_quintiles(df):
-    return df.groupby(pd.qcut(df['CurrentClose30toClose'], 5))['GreenDay'].mean()
+def get_quintiles(df, col_name, q):
+    return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
 
 probas = []
 for i, pct in enumerate(data['CurrentClose30toClose']):
     try:
-        df_q = get_quintiles(data.iloc[:i])
+        df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
         for q in df_q.index:
             if q.left <= pct <= q.right:
                 p = df_q[q]
@@ -423,7 +427,20 @@ def get_data():
 
 probas.append(p)
 
+# gapfills = []
+# for i, pct in enumerate(data['CurrentGap']):
+#     try:
+#         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
+#         for q in df_q.index:
+#             if q.left <= pct <= q.right:
+#                 p = df_q[q]
+#     except:
+#         p = None
+
+#     gapfills.append(p)
+
 data['GreenProbas'] = probas
+# data['GapFillGreenProba'] = gapfills
 
 for rid in tqdm(release_ids, desc='Merging econ data'):
     # Get the name of the release
@@ -459,41 +476,6 @@ def get_data():
 
 exp_row = data.index[-1]
 
-df_final = data.loc[:final_row,
-    [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentHigh30toClose',
-        'CurrentLow30toClose',
-        'CurrentClose30toClose',
-        'CurrentRange30',
-        'GapFill30',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'GreenProbas',
-        'Target',
-        'Target_clf'
-    ]]
+df_final = data.loc[:final_row, model_cols + ['Target','Target_clf']]
 df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
 return data, df_final, final_row
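All three model files share the two-stage design that walk_forward_validation_seq implements (shown in full in model_intra.py below): a walk-forward linear regression first predicts the next day's return, and the sign of that prediction is appended as a boolean RegrModelOut feature before the LightGBM classifier is fit. A minimal sketch of one train/predict step of that stacking, outside the walk-forward loop (feature names and the 250-row split are illustrative, not from the diff):

import numpy as np
import pandas as pd
from sklearn import linear_model
import lightgbm as lgb

rng = np.random.default_rng(7)
n = 300
X = pd.DataFrame({'f1': rng.normal(size=n), 'f2': rng.normal(size=n)})
y_regr = 0.3 * X['f1'] - 0.2 * X['f2'] + rng.normal(scale=0.5, size=n)  # next-day return
y_clf = (y_regr > 0)                                                    # green day?

# Stage 1: regression on the continuous target; keep only the direction call.
model1 = linear_model.LinearRegression().fit(X.iloc[:250], y_regr.iloc[:250])
X_stacked = X.copy()
X_stacked['RegrModelOut'] = model1.predict(X) > 0

# Stage 2: the classifier sees the raw features plus the regressor's direction call.
model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
model2.fit(X_stacked.iloc[:250], y_clf.iloc[:250])
proba_green = model2.predict_proba(X_stacked.iloc[250:])[:, -1]
print(proba_green[:5])

Consolidating this shared logic into one module is the point of the commit: the only real differences among the 30m/1h/90m files were the quantile counts and the duplicated column lists, which model_intra.py parameterizes.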
model_intra.py ADDED
@@ -0,0 +1,518 @@
+import streamlit as st
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import json
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+import xgboost as xgb
+from tqdm import tqdm
+from sklearn import linear_model
+import joblib
+import os
+from sklearn.metrics import roc_auc_score, precision_score, recall_score
+import datetime
+from pandas.tseries.offsets import BDay
+from datasets import load_dataset
+import lightgbm as lgb
+
+# If the dataset is gated/private, make sure you have run huggingface-cli login
+def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+
+    # Create an XGBRegressor model
+    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
+    model = linear_model.LinearRegression()
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column, axis=1).iloc[:i]
+        y_train = df[target_column].iloc[:i]
+        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model.predict(X_test)
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model.save_model('model_lr.bin')
+    # Return the true and predicted values, and fitted model
+    return df_results, model
+
+model_cols = [
+    'BigNewsDay',
+    'Quarter',
+    'Perf5Day',
+    'Perf5Day_n1',
+    'DaysGreen',
+    'DaysRed',
+    'CurrentHigh30toClose',
+    'CurrentLow30toClose',
+    'CurrentClose30toClose',
+    'CurrentRange30',
+    'GapFill30',
+    'CurrentGap',
+    'RangePct',
+    'RangePct_n1',
+    'RangePct_n2',
+    'OHLC4_VIX',
+    'OHLC4_VIX_n1',
+    'OHLC4_VIX_n2',
+    'OpenL1',
+    'OpenL2',
+    'OpenH1',
+    'OpenH2',
+    'L1TouchPct',
+    'L2TouchPct',
+    'H1TouchPct',
+    'H2TouchPct',
+    'L1BreakPct',
+    'L2BreakPct',
+    'H1BreakPct',
+    'H2BreakPct',
+    'GreenProbas',
+    # 'GapFillGreenProba'
+]
+
+def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+
+    # Create run the regression model to get its target
+    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+    # joblib.dump(model1, 'model1.bin')
+
+    # Merge the result df back on the df for feeding into the classifier
+    for_merge = res[['Predicted']]
+    for_merge.columns = ['RegrModelOut']
+    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+    df = df.merge(for_merge, left_index=True, right_index=True)
+    df = df.drop(columns=[target_column_regr])
+    df = df[model_cols + ['RegrModelOut', target_column_clf]]
+
+    df[target_column_clf] = df[target_column_clf].astype(bool)
+    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+    # Create an XGBRegressor model
+    # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
+    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+    # model = linear_model.LogisticRegression(max_iter=1500)
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+        y_train = df[target_column_clf].iloc[:i]
+        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column_clf].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model2.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model2.predict_proba(X_test)[:,-1]
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+
+    # Calibrate Probabilities
+    def get_quantiles(df, col_name, q):
+        return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+    greenprobas = []
+    meanprobas = []
+    for i, pct in enumerate(df_results['Predicted']):
+        try:
+            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+            for q in df_q.index:
+                if q.left <= pct <= q.right:
+                    p = df_q[q]
+                    c = (q.left + q.right) / 2
+        except:
+            p = None
+            c = None
+
+        greenprobas.append(p)
+        meanprobas.append(c)
+
+    df_results['CalibPredicted'] = meanprobas
+    df_results['CalibGreenProba'] = greenprobas
+
+    return df_results, model1, model2
+
+
+def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+    regr_pred = trained_reg_model.predict(df)
+    regr_pred = regr_pred > 0
+    new_df = df.copy()
+    new_df['RegrModelOut'] = regr_pred
+    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
+    return clf_pred_proba
+
+def get_data(periods_30m = 1):
+    # f = open('settings.json')
+    # j = json.load(f)
+    # API_KEY_FRED = j["API_KEY_FRED"]
+
+    API_KEY_FRED = os.getenv('API_KEY_FRED')
+
+    def parse_release_dates(release_id: str) -> List[str]:
+        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(release_dates_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        dates = []
+        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
+            dates.append(release_date_tag.text)
+        return dates
+
+    def parse_release_dates_obs(series_id: str) -> List[str]:
+        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(obs_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        observations = []
+        for observation_tag in soup.find_all('observation'):
+            date = observation_tag.get('date')
+            value = observation_tag.get('value')
+            observations.append((date, value))
+        return observations
+
+    econ_dfs = {}
+
+    econ_tickers = [
+        'WALCL',
+        'NFCI',
+        'WRESBAL'
+    ]
+
+    for et in tqdm(econ_tickers, desc='getting econ tickers'):
+        # p = parse_release_dates_obs(et)
+        # df = pd.DataFrame(columns = ['ds',et], data = p)
+        df = pdr.get_data_fred(et)
+        df.index = df.index.rename('ds')
+        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
+        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
+        econ_dfs[et] = df
+
+    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
+    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
+
+    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
+    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
+
+    release_ids = [
+        "10", # "Consumer Price Index"
+        "46", # "Producer Price Index"
+        "50", # "Employment Situation"
+        "53", # "Gross Domestic Product"
+        "103", # "Discount Rate Meeting Minutes"
+        "180", # "Unemployment Insurance Weekly Claims Report"
+        "194", # "ADP National Employment Report"
+        "323" # "Trimmed Mean PCE Inflation Rate"
+    ]
+
+    release_names = [
+        "CPI",
+        "PPI",
+        "NFP",
+        "GDP",
+        "FOMC",
+        "UNEMP",
+        "ADP",
+        "PCE"
+    ]
+
+    releases = {}
+
+    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
+        releases[rid] = {}
+        releases[rid]['dates'] = parse_release_dates(rid)
+        releases[rid]['name'] = n
+
+    # Create a DF that has all dates with the name of the col as 1
+    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
+    # This column serves as the true/false indicator of whether there was economic data released that day.
+    for rid in tqdm(release_ids, desc='Making indicators'):
+        releases[rid]['df'] = pd.DataFrame(
+            index=releases[rid]['dates'],
+            data={
+                releases[rid]['name']: 1
+            })
+        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
+        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
+        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
+
+    vix = yf.Ticker('^VIX')
+    spx = yf.Ticker('^GSPC')
+
+    # Pull in data
+    data = load_dataset("boomsss/spx_intra", split='train')
+
+    rows = [d['text'] for d in data]
+    rows = [x.split(',') for x in rows]
+
+    fr = pd.DataFrame(columns=[
+        'Datetime','Open','High','Low','Close'
+    ], data = rows)
+
+    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
+    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
+    fr = fr.set_index('Datetime')
+    fr['Open'] = pd.to_numeric(fr['Open'])
+    fr['High'] = pd.to_numeric(fr['High'])
+    fr['Low'] = pd.to_numeric(fr['Low'])
+    fr['Close'] = pd.to_numeric(fr['Close'])
+
+    # Get incremental date
+    last_date = fr.index.date[-1]
+    last_date = last_date + datetime.timedelta(days=1)
+    # Get incremental data
+    spx1 = yf.Ticker('^GSPC')
+    yfp = spx1.history(start=last_date, interval='30m')
+
+    if len(yfp) > 0:
+        # Concat current and incremental
+        df_30m = pd.concat([fr, yfp])
+    else:
+        df_30m = fr.copy()
+
+    # Get the first 30 minute bar
+    df_30m = df_30m.reset_index()
+    df_30m['Datetime'] = df_30m['Datetime'].dt.date
+    df_30m = df_30m.groupby('Datetime').head(periods_30m)
+    df_30m = df_30m.set_index('Datetime',drop=True)
+    # Rename the columns
+    df_30m = df_30m[['Open','High','Low','Close']]
+
+    opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
+    highs_1h = df_30m.groupby('Datetime')['High'].max()
+    lows_1h = df_30m.groupby('Datetime')['Low'].min()
+    closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
+
+    df_1h = pd.DataFrame(index=df_30m.index.unique())
+    df_1h['Open'] = opens_1h
+    df_1h['High'] = highs_1h
+    df_1h['Low'] = lows_1h
+    df_1h['Close'] = closes_1h
+
+    df_1h.columns = ['Open30','High30','Low30','Close30']
+
+    prices_vix = vix.history(start='2018-07-01', interval='1d')
+    prices_spx = spx.history(start='2018-07-01', interval='1d')
+    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+    prices_spx.index = prices_spx['index']
+    prices_spx = prices_spx.drop(columns='index')
+    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+
+    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+    prices_vix.index = prices_vix['index']
+    prices_vix = prices_vix.drop(columns='index')
+    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+
+    data = prices_spx.merge(df_1h, left_index=True, right_index=True)
+    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+
+    # Features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGapHist'] = data['CurrentGap'].copy()
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['DayOfWeek'] = data['DayOfWeek'].dt.day
+
+    # Intraday features
+    data['CurrentOpen30'] = data['Open30'].shift(-1)
+    data['CurrentHigh30'] = data['High30'].shift(-1)
+    data['CurrentLow30'] = data['Low30'].shift(-1)
+    data['CurrentClose30'] = data['Close30'].shift(-1)
+    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
+
+    # Open to High
+    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+
+    # Target -- the next day's low
+    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+    data['Target'] = data['Target'].shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Target for clf -- whether tomorrow will close above or below today's close
+    data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['Target_clf'] = data['Target_clf'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+    # Calculate up
+    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
+
+    # Calculate upSD
+    data['upSD'] = data['up'].rolling(30).std(ddof=0)
+
+    # Calculate aveUp
+    data['aveUp'] = data['up'].rolling(30).mean()
+    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
+    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
+    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
+    data['downSD'] = data['down'].rolling(30).std(ddof=0)
+    data['aveDown'] = data['down'].rolling(30).mean()
+    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
+    data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
+
+    data = data.assign(
+        L1Touch = lambda x: x['Low'] < x['L1'],
+        L2Touch = lambda x: x['Low'] < x['L2'],
+        H1Touch = lambda x: x['High'] > x['H1'],
+        H2Touch = lambda x: x['High'] > x['H2'],
+        L1Break = lambda x: x['Close'] < x['L1'],
+        L2Break = lambda x: x['Close'] < x['L2'],
+        H1Break = lambda x: x['Close'] > x['H1'],
+        H2Break = lambda x: x['Close'] > x['H2'],
+        OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
+        OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
+        OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
+        OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
+        CloseL1 = lambda x: np.where(x['Close'] < x['L1'], 1, 0),
+        CloseL2 = lambda x: np.where(x['Close'] < x['L2'], 1, 0),
+        CloseH1 = lambda x: np.where(x['Close'] > x['H1'], 1, 0),
+        CloseH2 = lambda x: np.where(x['Close'] > x['H2'], 1, 0)
+    )
+
+    data['OpenL1'] = data['OpenL1'].shift(-1)
+    data['OpenL2'] = data['OpenL2'].shift(-1)
+    data['OpenH1'] = data['OpenH1'].shift(-1)
+    data['OpenH2'] = data['OpenH2'].shift(-1)
+    data['CloseL1'] = data['CloseL1'].shift(-1)
+    data['CloseL2'] = data['CloseL2'].shift(-1)
+    data['CloseH1'] = data['CloseH1'].shift(-1)
+    data['CloseH2'] = data['CloseH2'].shift(-1)
+
+    level_cols = [
+        'L1Touch',
+        'L2Touch',
+        'H1Touch',
+        'H2Touch',
+        'L1Break',
+        'L2Break',
+        'H1Break',
+        'H2Break'
+    ]
+
+    for col in level_cols:
+        data[col+'Pct'] = data[col].rolling(100).mean()
+        data[col+'Pct'] = data[col+'Pct'].shift(-1)
+
+
+    def get_quintiles(df, col_name, q):
+        return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
+
+    probas = []
+    for i, pct in enumerate(data['CurrentClose30toClose']):
+        try:
+            df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
+            for q in df_q.index:
+                if q.left <= pct <= q.right:
+                    p = df_q[q]
+        except:
+            p = None
+
+        probas.append(p)
+
+    # gapfills = []
+    # for i, pct in enumerate(data['CurrentGap']):
+    #     try:
+    #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
+    #         for q in df_q.index:
+    #             if q.left <= pct <= q.right:
+    #                 p = df_q[q]
+    #     except:
    #         p = None
+
+    #     gapfills.append(p)
+
+    data['GreenProbas'] = probas
+    # data['GapFillGreenProba'] = gapfills
+
+    for rid in tqdm(release_ids, desc='Merging econ data'):
+        # Get the name of the release
+        n = releases[rid]['name']
+        # Merge the corresponding DF of the release
+        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
+        # Create a column that shifts the value in the merged column up by 1
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    consec_green = cumul_sum(data['GreenDay'].values)
+    consec_red = cumul_sum(data['RedDay'].values)
+
+    data['DaysGreen'] = consec_green
+    data['DaysRed'] = consec_red
+
+    final_row = data.index[-2]
+
+    exp_row = data.index[-1]
+
+    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row
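With the three per-horizon modules collapsed into one, the only thing that varies is how many 30-minute bars of the current session get_data folds into the *30 columns: get_data(1) for the 07:00 run, get_data(2) for 07:30, and get_data(3) for 08:00, matching the app.py changes above. A sketch of the intended call sequence (the training-window and period constants are illustrative, not taken from the diff; the one-row DataFrame reshape for the final prediction is likewise an assumption):

from model_intra import get_data, model_cols, walk_forward_validation_seq, seq_predict_proba

# One 30-minute bar -> the 07:00 model; 2 -> 07:30; 3 -> 08:00.
data, df_final, final_row = get_data(periods_30m=1)

# Walk-forward train: regression target 'Target', classification target 'Target_clf'.
res1, model1, model2 = walk_forward_validation_seq(
    df_final.dropna(),
    target_column_clf='Target_clf',
    target_column_regr='Target',
    num_training_rows=100,  # illustrative window size
    num_periods=1,
)

# Score the most recent (still-unlabeled) row; double brackets keep it a DataFrame.
new_pred = data.loc[[final_row], model_cols]
seq_proba = seq_predict_proba(new_pred, model1, model2)
print(f'P(green) = {seq_proba[0]:.1%}')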
troubleshoot_day_model.ipynb CHANGED
The diff for this file is too large to render. See raw diff