File size: 2,708 Bytes
86fa8c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from typing import List
from tqdm import tqdm
import os
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from intraCols import model_cols

# If the dataset is gated/private, make sure you have run huggingface-cli login
def _calibrate_expanding(df_results, n_bins=7):
    """Map each prediction to the hit rate of its bin among earlier predictions.

    For row ``i`` only rows ``0..i-1`` are used: their 'Predicted' values are
    cut into ``n_bins`` equal-width bins with ``pd.cut`` and the mean of
    'True' is taken per bin, so a row's own outcome never feeds its estimate.

    Args:
        df_results: Frame with a 'Predicted' column and a 'True' column.
        n_bins: Number of equal-width bins built over the earlier predictions.

    Returns:
        A list with one entry per row: the mean of 'True' in the matching bin,
        NaN when that bin holds no earlier rows, or None when there is no
        usable history or the prediction lies outside every bin.
    """
    calibrated = []
    progress = tqdm(
        enumerate(df_results['Predicted']),
        desc='Calibrating Probas',
        total=len(df_results),
    )
    for i, pct in progress:
        # Reset on every row: otherwise a prediction outside all historical
        # bins would silently inherit the previous row's calibrated value.
        p = None
        history = df_results.iloc[:i]
        try:
            # observed=False keeps empty bins in the result (their mean is NaN).
            bin_means = history.groupby(
                pd.cut(history['Predicted'], n_bins), observed=False
            )['True'].mean()
        except ValueError:
            # pd.cut cannot bin an empty history (always the case for i == 0).
            bin_means = None

        if bin_means is not None:
            # Deliberately no break: on an edge shared by two bins the upper
            # bin wins, which is how ties have always been resolved here.
            for interval in bin_means.index:
                if interval.left <= pct <= interval.right:
                    p = bin_means[interval]

        calibrated.append(p)

    return calibrated


def walk_forward_validation(df, target_column, num_periods):
    """Walk-forward validate a LightGBM classifier and calibrate its output.

    The frame is reduced to ``model_cols`` plus ``target_column`` and split
    with scikit-learn's ``TimeSeriesSplit`` into consecutive test folds of
    ``num_periods`` rows, each preceded by an expanding training window. A
    fresh ``LGBMClassifier`` is fitted per fold and the last column of its
    ``predict_proba`` output is recorded for the test rows. The collected
    predictions are then calibrated against the outcomes of earlier
    predictions only.

    Args:
        df: Frame containing every column in ``model_cols`` and
            ``target_column``; rows are used in their existing order.
        target_column: Name of the label column; it is cast to bool.
        num_periods: Rows per test fold. ``None`` is treated as 1.

    Returns:
        A ``(df_results, model)`` tuple. ``df_results`` is indexed like the
        test rows and has the columns 'True', 'Predicted' and
        'CalibPredicted'; ``model`` is the classifier fitted on the last fold.

    Raises:
        ValueError: If ``num_periods`` is less than 1. ``TimeSeriesSplit``
            also raises it when the frame is too short to form the folds.
    """
    # Copy so the dtype cast below never writes into a view of the caller's frame.
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(bool)

    fold_size = 1 if num_periods is None else num_periods
    if fold_size < 1:
        raise ValueError(f"num_periods must be at least 1, got {num_periods!r}")

    # Largest number of folds that still leaves at least one training row.
    # Using len(df) - 1 regardless of fold size made scikit-learn reject
    # every num_periods above 1 with "Too many splits".
    n_splits = (len(df) - 1) // fold_size
    tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=None, test_size=fold_size)

    # Split features and labels once instead of re-dropping the column per fold.
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    overall_results = []
    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_test, y_test = features.iloc[test_index], target.iloc[test_index]

        # Refit from scratch on every fold so only the current training
        # window informs the prediction.
        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(X_train, y_train)
        predictions = model.predict_proba(X_test)[:, -1]

        # Keep truth and prediction side by side, aligned on the test index.
        overall_results.append(
            pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
        )

    df_results = pd.concat(overall_results)
    df_results['CalibPredicted'] = _calibrate_expanding(df_results, 7)

    return df_results, model

def seq_predict_proba(df, trained_clf_model):
    """Return the last-column class probabilities for the rows of ``df``.

    Only the columns listed in ``model_cols`` are passed to the model's
    ``predict_proba``; the final column of its output is returned.
    """
    feature_frame = df[model_cols]
    class_probabilities = trained_clf_model.predict_proba(feature_frame)
    return class_probabilities[:, -1]