Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import pandas_datareader as pdr | |
import numpy as np | |
import yfinance as yf | |
import requests | |
from bs4 import BeautifulSoup | |
from typing import List | |
from tqdm import tqdm | |
import os | |
import datetime | |
from pandas.tseries.offsets import BDay | |
from datasets import load_dataset | |
import lightgbm as lgb | |
from sklearn.model_selection import TimeSeriesSplit | |
from intraCols import model_cols | |
# If the dataset is gated or private, run `huggingface-cli login` first.
def walk_forward_validation(df, target_column, num_periods):
    """Walk-forward validate a LightGBM classifier and calibrate its output.

    The frame is reduced to ``model_cols`` plus ``target_column``, the target
    is cast to bool, and a fresh ``LGBMClassifier`` is fit on every
    expanding-window split produced by ``TimeSeriesSplit``. The out-of-sample
    probability of the last class is collected for every test row. Each
    probability is then mapped to the mean of ``True`` observed among
    *earlier* predictions that fall into the same one of 7 equal-width bins.

    Args:
        df: DataFrame containing every column in ``model_cols`` and
            ``target_column``. Rows are assumed to be in chronological
            order (TimeSeriesSplit splits by position) -- TODO confirm
            against caller.
        target_column: Name of the label column; values are cast to bool.
        num_periods: Number of rows in each test fold. ``None`` is treated
            as 1, which is what ``TimeSeriesSplit`` defaulted to for the
            split count used previously.

    Returns:
        A ``(df_results, model)`` tuple. ``df_results`` has the columns
        ``'True'``, ``'Predicted'`` and ``'CalibPredicted'`` and is indexed
        by the original index of the test rows. ``model`` is the classifier
        fit on the final split.

    Raises:
        ValueError: If ``num_periods`` is smaller than 1 or ``df`` is too
            short to produce at least two splits.
    """
    # Explicit copy: the dtype cast below must not write into a slice of
    # the caller's frame (avoids SettingWithCopyWarning).
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(bool)

    test_size = 1 if num_periods is None else num_periods
    if test_size < 1:
        raise ValueError(f"num_periods must be >= 1, got {num_periods!r}")

    # TimeSeriesSplit requires n_samples - test_size * n_splits > 0, so the
    # largest valid split count is (n_samples - 1) // test_size. For
    # test_size == 1 this equals the former len(df) - 1.
    n_splits = (len(df) - 1) // test_size
    if n_splits < 2:
        raise ValueError(
            f"Need at least {2 * test_size + 1} rows for walk-forward "
            f"validation with num_periods={test_size}, got {len(df)}"
        )
    tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=None, test_size=test_size)

    # Split features / target once instead of re-dropping the column per fold.
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    overall_results = []
    model = None
    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
        # A new model per split: each one only ever sees rows before its test fold.
        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(features.iloc[train_index], target.iloc[train_index])

        y_test = target.iloc[test_index]
        # Probability of the last class in model.classes_.
        predictions = model.predict_proba(features.iloc[test_index])[:, -1]

        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
        overall_results.append(result_df)

    df_results = pd.concat(overall_results)

    # Calibrate probabilities
    def get_quantiles(df, col_name, q):
        # Mean of 'True' inside each of q equal-width bins of col_name.
        # observed=False keeps empty bins (their mean is NaN), matching the
        # behaviour of the historical implicit default.
        return df.groupby(pd.cut(df[col_name], q), observed=False)['True'].mean()

    greenprobas = []
    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
        # Reset for every row so a prediction that falls in no bin is
        # recorded as missing instead of inheriting the previous row's value.
        p = None
        try:
            # Only rows strictly before i are used, so no look-ahead.
            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
        except ValueError:
            # pd.cut cannot bin an empty history (i == 0) or degenerate edges.
            df_q = None
        if df_q is not None:
            for interval, hit_rate in df_q.items():
                # Both bounds inclusive; on a shared edge the later
                # (higher) bin wins because the loop does not break.
                if interval.left <= pct <= interval.right:
                    p = hit_rate
        greenprobas.append(p)

    df_results['CalibPredicted'] = greenprobas
    return df_results, model
def seq_predict_proba(df, trained_clf_model):
    """Return the fitted classifier's last-class probability for each row.

    Args:
        df: DataFrame containing every column listed in ``model_cols``.
        trained_clf_model: Fitted estimator exposing ``predict_proba``.

    Returns:
        The last column of ``predict_proba``'s output, one value per row.
    """
    feature_frame = df[model_cols]
    class_probabilities = trained_clf_model.predict_proba(feature_frame)
    # The last column corresponds to the last entry of the model's classes.
    return class_probabilities[:, -1]