Spaces:

AlvaroMros
/

ufc-predictor

Running

AlvaroMros committed on 18 days ago

Commit

eb615ca

1 Parent(s): f972c61

Add k-fold cross-validation to prediction pipeline

Introduces a --kfold argument to main.py to enable 3-fold cross-validation. Implements run_kfold_cv in pipeline.py, using event-based splits and MLflow for experiment tracking and model registration. Refactors imports and typing for consistency, and moves configuration constants to config.py for better modularity.

Files changed (5) hide show

src/predict/main.py +13 -2
src/predict/models.py +3 -2
src/predict/pipeline.py +69 -0
src/predict/preprocess.py +8 -9
src/predict/utils.py +2 -8

src/predict/main.py CHANGED Viewed

@@ -55,6 +55,11 @@ def main():
         default=False,
         help="Force retrain all models even if no new data is available."
     )
     args = parser.parse_args()
     # Handle conflicting arguments
@@ -75,9 +80,15 @@ def main():
         use_existing_models=use_existing_models,
         force_retrain=force_retrain
     )
     try:
-        pipeline.run(detailed_report=(args.report == 'detailed'))
     except FileNotFoundError as e:
         print(f"Error: {e}")
         print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")

         default=False,
         help="Force retrain all models even if no new data is available."
     )
+    parser.add_argument(
+        '--kfold',
+        action='store_true',
+        help='Run 3-fold CV instead of standard split.'
+    )
     args = parser.parse_args()
     # Handle conflicting arguments
         use_existing_models=use_existing_models,
         force_retrain=force_retrain
     )
     try:
+        if args.kfold:
+            cv_results = pipeline.run_kfold_cv(k=3, holdout_events=1)
+            print(cv_results)
+        else:
+            pipeline.run(detailed_report=(args.report == 'detailed'))
     except FileNotFoundError as e:
         print(f"Error: {e}")
         print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
+if __name__ == '__main__':
+    main()

src/predict/models.py CHANGED Viewed

@@ -12,7 +12,8 @@ from lightgbm import LGBMClassifier
 from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 from ..config import FIGHTERS_CSV_PATH
 from .preprocess import preprocess_for_ml, _get_fighter_history_stats
-from .utils import calculate_age, prepare_fighters_data, DEFAULT_ELO
 class BaseModel(ABC):
     """
@@ -87,7 +88,7 @@ class BaseMLModel(BaseModel):
         self.fighters_df = None
         self.fighter_histories = {}
-    def train(self, train_fights: List[Dict[str, Any]]) -> None:
         """
         Trains the machine learning model. This involves loading fighter data,
         pre-calculating histories, and fitting the model on the preprocessed data.

 from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 from ..config import FIGHTERS_CSV_PATH
 from .preprocess import preprocess_for_ml, _get_fighter_history_stats
+from .utils import calculate_age, prepare_fighters_data
+from .config import DEFAULT_ELO
 class BaseModel(ABC):
     """
         self.fighters_df = None
         self.fighter_histories = {}
+    def train(self, train_fights: list[dict[str, any]]) -> None:
         """
         Trains the machine learning model. This involves loading fighter data,
         pre-calculating histories, and fitting the model on the preprocessed data.

src/predict/pipeline.py CHANGED Viewed

@@ -25,6 +25,9 @@ import json
 import joblib
 from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
 from .models import BaseModel
 class PredictionPipeline:
     """
@@ -248,6 +251,72 @@ class PredictionPipeline:
         if should_retrain:
             self._train_and_save_models()
     def update_models_if_new_data(self):
         """
         Checks for new data and retrains/saves all models on the full dataset if needed.

 import joblib
 from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
 from .models import BaseModel
+from sklearn.model_selection import KFold
+import mlflow
+import mlflow.sklearn
 class PredictionPipeline:
     """
         if should_retrain:
             self._train_and_save_models()
+    def run_kfold_cv(self, k: int = 3, holdout_events: int = 1):
+        """Run event-based k-fold cross-validation and log each fold to MLflow.
+
+        The unique events found in the loaded fights are split into ``k``
+        folds with ``KFold``. For every fold only the training partition is
+        used: its last ``holdout_events`` events (in the order the fights
+        were loaded) are held out for evaluation, and every model in
+        ``self.models`` is trained on the remaining events of that partition.
+
+        Args:
+            k: Number of folds passed to ``KFold``.
+            holdout_events: Number of trailing events of each fold's
+                training partition reserved for evaluation. Must be >= 1.
+
+        Returns:
+            A list with one dict per fold, mapping each model's class name
+            to the accuracy it reached on that fold's held-out fights.
+
+        Raises:
+            ValueError: If ``holdout_events`` is smaller than 1.
+        """
+        if holdout_events < 1:
+            # A slice of [-0:] would select *every* event, leaving nothing to
+            # train on, so reject it up front instead of producing bogus scores.
+            raise ValueError("holdout_events must be at least 1")
+        fights = self._load_fights()
+        # dict.fromkeys de-duplicates while keeping first-seen order.
+        event_list = list(dict.fromkeys(f['event_name'] for f in fights))
+        kf = KFold(n_splits=k, shuffle=True, random_state=42)
+        # The experiment is identical for every fold, so select it once.
+        mlflow.set_experiment("UFC_KFold_CV")
+        all_fold_metrics = []
+        # NOTE(review): the KFold test indices are ignored; evaluation fights
+        # are carved out of the training partition instead. Folds may therefore
+        # share the same held-out events - confirm this is intended.
+        for fold_idx, (train_event_idx, _) in enumerate(kf.split(event_list), start=1):
+            # Sets give O(1) membership checks in the comprehensions below.
+            fold_event_names = {event_list[i] for i in train_event_idx}
+            fold_fights = [f for f in fights if f['event_name'] in fold_event_names]
+            # Inside this fold, reserve the last `holdout_events` events for testing.
+            fold_events_ordered = list(dict.fromkeys(f['event_name'] for f in fold_fights))
+            test_events = set(fold_events_ordered[-holdout_events:])
+            train_set = [f for f in fold_fights if f['event_name'] not in test_events]
+            test_set = [f for f in fold_fights if f['event_name'] in test_events]
+            with mlflow.start_run(run_name=f"fold_{fold_idx}"):
+                # Log the event counts actually used, not the requested ones:
+                # the held-out events are excluded from the training count.
+                mlflow.log_param("fold", fold_idx)
+                mlflow.log_param("train_events", len(fold_events_ordered) - len(test_events))
+                mlflow.log_param("test_events", len(test_events))
+                fold_results = {}
+                for model in self.models:
+                    model_name = model.__class__.__name__
+                    model.train(train_set)
+                    # Fights without a decisive winner are not scored.
+                    scored_fights = [
+                        f for f in test_set if f['winner'] not in ("Draw", "NC", "")
+                    ]
+                    correct = sum(
+                        1 for f in scored_fights
+                        if model.predict(f).get('winner') == f['winner']
+                    )
+                    total_fights = len(scored_fights)
+                    acc = correct / total_fights if total_fights > 0 else 0.0
+                    fold_results[model_name] = acc
+                    mlflow.log_metric(f"accuracy_{model_name}", acc)
+                    mlflow.log_metric(f"total_fights_{model_name}", total_fights)
+                    # Register the model so it appears in the MLflow Models tab.
+                    # NOTE(review): assumes `model` can be serialized by the
+                    # mlflow.sklearn flavor - verify for non-sklearn wrappers.
+                    mlflow.sklearn.log_model(
+                        model,
+                        f"model_{model_name}",
+                        registered_model_name=f"{model_name}_UFC_Model"
+                    )
+                all_fold_metrics.append(fold_results)
+        return all_fold_metrics
     def update_models_if_new_data(self):
         """
         Checks for new data and retrains/saves all models on the full dataset if needed.

src/predict/preprocess.py CHANGED Viewed

@@ -1,22 +1,21 @@
 import pandas as pd
 import os
 from datetime import datetime
-from typing import Dict, List, Tuple, Any, Optional
-from ..config import FIGHTERS_CSV_PATH
 from .utils import (
     parse_round_time_to_seconds, parse_striking_stats, to_int_safe,
-    calculate_age, prepare_fighters_data, DEFAULT_ELO, N_FIGHTS_HISTORY
 )
 def _get_fighter_history_stats(
     fighter_name: str,
     current_fight_date: datetime,
-    fighter_history: List[Dict[str, Any]],
     fighters_df: pd.DataFrame,
     n: int = N_FIGHTS_HISTORY
-) -> Dict[str, float]:
     """
     Calculates performance statistics for a fighter based on their last n fights.
     """
@@ -82,9 +81,9 @@ def _get_fighter_history_stats(
     }
 def preprocess_for_ml(
-    fights_to_process: List[Dict[str, Any]],
     fighters_csv_path: str
-) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
     """
     Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
     suitable for a binary classification machine learning model.
@@ -135,8 +134,8 @@ def preprocess_for_ml(
         if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
         # Calculate ages for both fighters
-        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
-        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
         # Get historical stats for both fighters
         f1_hist_stats = _get_fighter_history_stats(f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared)

 import pandas as pd
 import os
 from datetime import datetime
 from .utils import (
     parse_round_time_to_seconds, parse_striking_stats, to_int_safe,
+    calculate_age, prepare_fighters_data
 )
+from .config import DEFAULT_ELO, N_FIGHTS_HISTORY
 def _get_fighter_history_stats(
     fighter_name: str,
     current_fight_date: datetime,
+    fighter_history: list[dict[str, any]],
     fighters_df: pd.DataFrame,
     n: int = N_FIGHTS_HISTORY
+) -> dict[str, float]:
     """
     Calculates performance statistics for a fighter based on their last n fights.
     """
     }
 def preprocess_for_ml(
+    fights_to_process: list[dict[str, any]],
     fighters_csv_path: str
+) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
     """
     Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
     suitable for a binary classification machine learning model.
         if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
         # Calculate ages for both fighters
+        f1_age = calculate_age(f1_stats.get('dob'), fight['event_date'])
+        f2_age = calculate_age(f2_stats.get('dob'), fight['event_date'])
         # Get historical stats for both fighters
         f1_hist_stats = _get_fighter_history_stats(f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared)

src/predict/utils.py CHANGED Viewed

@@ -1,14 +1,8 @@
 import pandas as pd
-import os
 from datetime import datetime
-from typing import Optional, Dict, Any
-# Constants
-DEFAULT_ELO = 1500
-DEFAULT_AGE = 0
-DEFAULT_FIGHT_TIME = 0
-DEFAULT_ROUNDS_DURATION = 5 * 60  # 5 minutes per round
-N_FIGHTS_HISTORY = 5
 def clean_numeric_column(series: pd.Series) -> pd.Series:
     """A helper to clean string columns into numbers, handling errors."""

 import pandas as pd
 from datetime import datetime
+from typing import Optional, Any
+from .config import DEFAULT_ROUNDS_DURATION
 def clean_numeric_column(series: pd.Series) -> pd.Series:
     """A helper to clean string columns into numbers, handling errors."""