AlvaroMros committed on
Commit
ffd453e
·
1 Parent(s): 7fcaffe

Revert "(CAREFUL!!!!!!) Refactor argument parsing and prediction pipeline"

Browse files

This reverts commit 7fcaffe1b17b52b1451fe966549d2708dc218ffc.

output/model_results.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40c2fb9010bdae4946c2b879d4014aa671a43b586aff7faa73ea4846585e589c
3
- size 11671
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8df1ba9e26fa98e34bfb1c773e66576cbf89152087c55b70921269c84f39d5
3
+ size 27286
output/models/EloBaselineModel.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40937e8b6fe9aaaa1ca92a84e3e67b5bdefcf2700d2cafb7830670a14f684858
3
- size 938435
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfdc684f791b598fbecfbfe9b14cca3b4d483b3d7368a16faecea31aace3be87
3
+ size 938419
output/models/LogisticRegressionModel.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51c11a689c50244a6084e642a1dc35a349d515f075b40515dbd4164e7831dfdb
3
- size 5518484
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a773552b7f1b166858ab1ff7bdf472e24b293279a8e24871de773b1a3de46e1
3
+ size 5517988
src/args.py DELETED
@@ -1,97 +0,0 @@
1
- import argparse
2
-
3
- def get_pipeline_args():
4
- """
5
- Parse command line arguments for the main UFC data pipeline.
6
-
7
- Returns:
8
- argparse.Namespace: Parsed command line arguments
9
- """
10
- parser = argparse.ArgumentParser(description="UFC Data Pipeline")
11
-
12
- # Pipeline selection
13
- parser.add_argument(
14
- '--pipeline',
15
- type=str,
16
- default='scrape',
17
- choices=['scrape', 'analysis', 'predict', 'update', 'all'],
18
- help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
19
- )
20
-
21
- # Scraping arguments
22
- scraping_group = parser.add_argument_group('Scraping options')
23
- scraping_group.add_argument(
24
- '--scrape-mode',
25
- type=str,
26
- default='full',
27
- choices=['full', 'update'],
28
- help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
29
- )
30
- scraping_group.add_argument(
31
- '--num-events',
32
- type=int,
33
- default=5,
34
- help="Number of latest events to scrape in update mode (default: 5)"
35
- )
36
-
37
- # Model management arguments
38
- model_group = parser.add_argument_group('Model management')
39
- model_group.add_argument(
40
- '--use-existing-models',
41
- action='store_true',
42
- default=True,
43
- help="Use existing saved models if available and no new data (default: True)"
44
- )
45
- model_group.add_argument(
46
- '--no-use-existing-models',
47
- action='store_true',
48
- default=False,
49
- help="Force retrain all models from scratch, ignoring existing saved models"
50
- )
51
- model_group.add_argument(
52
- '--force-retrain',
53
- action='store_true',
54
- default=False,
55
- help="Force retrain all models even if no new data is available"
56
- )
57
-
58
- return parser.parse_args()
59
-
60
- def get_prediction_args():
61
- """
62
- Parse command line arguments specific to the prediction pipeline.
63
-
64
- Returns:
65
- argparse.Namespace: Parsed command line arguments
66
- """
67
- parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
68
-
69
- parser.add_argument(
70
- '--report',
71
- type=str,
72
- default='detailed',
73
- choices=['detailed', 'summary'],
74
- help="Type of report to generate: 'detailed' (file) or 'summary' (console)"
75
- )
76
-
77
- model_group = parser.add_argument_group('Model management')
78
- model_group.add_argument(
79
- '--use-existing-models',
80
- action='store_true',
81
- default=True,
82
- help="Use existing saved models if available and no new data (default: True)"
83
- )
84
- model_group.add_argument(
85
- '--no-use-existing-models',
86
- action='store_true',
87
- default=False,
88
- help="Force retrain all models from scratch, ignoring existing saved models"
89
- )
90
- model_group.add_argument(
91
- '--force-retrain',
92
- action='store_true',
93
- default=False,
94
- help="Force retrain all models even if no new data is available"
95
- )
96
-
97
- return parser.parse_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/main.py CHANGED
@@ -1,79 +1,106 @@
 
1
  import sys
2
  import os
3
- from .args import get_pipeline_args
4
-
5
- def run_scraping_pipeline(args):
6
- """Execute the scraping pipeline with given arguments."""
7
- print("=== Running Scraping Pipeline ===")
8
- from .scrape.main import main as scrape_main
9
-
10
- # Pass arguments to scrape.main
11
- original_argv = sys.argv
12
- sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
13
- try:
14
- scrape_main()
15
- finally:
16
- sys.argv = original_argv
17
-
18
- def run_analysis_pipeline():
19
- """Execute the ELO analysis pipeline."""
20
- print("\n=== Running ELO Analysis ===")
21
- from .analysis.elo import main as elo_main
22
- elo_main()
23
-
24
- def run_prediction_pipeline(args):
25
- """Execute the prediction pipeline with given arguments."""
26
- print("\n=== Running Prediction Pipeline ===")
27
- from .predict.main import main as predict_main
28
-
29
- # Pass model management arguments to predict.main
30
- original_argv = sys.argv
31
- predict_args = ['predict_main']
32
-
33
- if args.no_use_existing_models:
34
- predict_args.append('--no-use-existing-models')
35
- elif args.use_existing_models:
36
- predict_args.append('--use-existing-models')
37
-
38
- if args.force_retrain:
39
- predict_args.append('--force-retrain')
40
-
41
- sys.argv = predict_args
42
- try:
43
- predict_main()
44
- finally:
45
- sys.argv = original_argv
46
-
47
- def run_model_update(args):
48
- """Execute the model update pipeline."""
49
- print("\n=== Running Model Update Pipeline ===")
50
- try:
51
- from .predict.main import MODELS_TO_RUN
52
- from .predict.pipeline import PredictionPipeline
53
- except ImportError:
54
- print("Fatal: Could not import prediction modules.")
55
- print("Please ensure your project structure and python path are correct.")
56
- return
57
-
58
- pipeline = PredictionPipeline(models=MODELS_TO_RUN)
59
- pipeline.update_models_if_new_data()
60
 
61
  def main():
62
- """Main entry point for the UFC data pipeline."""
63
- args = get_pipeline_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # Execute requested pipeline(s)
66
  if args.pipeline in ['scrape', 'all']:
67
- run_scraping_pipeline(args)
 
 
 
 
 
 
 
 
 
68
 
69
  if args.pipeline in ['analysis', 'all']:
70
- run_analysis_pipeline()
71
-
 
 
72
  if args.pipeline == 'update':
73
- run_model_update(args)
 
 
 
 
 
 
 
 
 
 
74
 
75
  if args.pipeline in ['predict', 'all']:
76
- run_prediction_pipeline(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  if __name__ == '__main__':
79
  main()
 
1
+ import argparse
2
  import sys
3
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def main():
6
+ """
7
+ Main entry point for the UFC data pipeline.
8
+ Supports scraping, analysis, and prediction workflows.
9
+ """
10
+ parser = argparse.ArgumentParser(description="UFC Data Pipeline")
11
+ parser.add_argument(
12
+ '--pipeline',
13
+ type=str,
14
+ default='scrape',
15
+ choices=['scrape', 'analysis', 'predict', 'update', 'all'],
16
+ help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
17
+ )
18
+ parser.add_argument(
19
+ '--scrape-mode',
20
+ type=str,
21
+ default='full',
22
+ choices=['full', 'update'],
23
+ help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
24
+ )
25
+ parser.add_argument(
26
+ '--num-events',
27
+ type=int,
28
+ default=5,
29
+ help="Number of latest events to scrape in update mode (default: 5)"
30
+ )
31
+ # Model management arguments for prediction pipeline
32
+ parser.add_argument(
33
+ '--use-existing-models',
34
+ action='store_true',
35
+ default=True,
36
+ help="Use existing saved models if available and no new data (default: True)."
37
+ )
38
+ parser.add_argument(
39
+ '--no-use-existing-models',
40
+ action='store_true',
41
+ default=False,
42
+ help="Force retrain all models from scratch, ignoring existing saved models."
43
+ )
44
+ parser.add_argument(
45
+ '--force-retrain',
46
+ action='store_true',
47
+ default=False,
48
+ help="Force retrain all models even if no new data is available."
49
+ )
50
+
51
+ args = parser.parse_args()
52
 
 
53
  if args.pipeline in ['scrape', 'all']:
54
+ print("=== Running Scraping Pipeline ===")
55
+ from src.scrape.main import main as scrape_main
56
+
57
+ # Override sys.argv to pass arguments to scrape.main
58
+ original_argv = sys.argv
59
+ sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
60
+ try:
61
+ scrape_main()
62
+ finally:
63
+ sys.argv = original_argv
64
 
65
  if args.pipeline in ['analysis', 'all']:
66
+ print("\n=== Running ELO Analysis ===")
67
+ from src.analysis.elo import main as elo_main
68
+ elo_main()
69
+
70
  if args.pipeline == 'update':
71
+ print("\n=== Running Model Update Pipeline ===")
72
+ try:
73
+ from src.predict.main import MODELS_TO_RUN
74
+ from src.predict.pipeline import PredictionPipeline
75
+ except ImportError:
76
+ print("Fatal: Could not import prediction modules.")
77
+ print("Please ensure your project structure and python path are correct.")
78
+ return
79
+
80
+ pipeline = PredictionPipeline(models=MODELS_TO_RUN)
81
+ pipeline.update_models_if_new_data()
82
 
83
  if args.pipeline in ['predict', 'all']:
84
+ print("\n=== Running Prediction Pipeline ===")
85
+ from src.predict.main import main as predict_main
86
+
87
+ # Override sys.argv to pass model management arguments to predict.main
88
+ original_argv = sys.argv
89
+ predict_args = ['predict_main']
90
+
91
+ if args.no_use_existing_models:
92
+ predict_args.append('--no-use-existing-models')
93
+ elif args.use_existing_models:
94
+ predict_args.append('--use-existing-models')
95
+
96
+ if args.force_retrain:
97
+ predict_args.append('--force-retrain')
98
+
99
+ sys.argv = predict_args
100
+ try:
101
+ predict_main()
102
+ finally:
103
+ sys.argv = original_argv
104
 
105
  if __name__ == '__main__':
106
  main()
src/predict/main.py CHANGED
@@ -1,4 +1,5 @@
1
- from ..args import get_prediction_args
 
2
  from .pipeline import PredictionPipeline
3
  from .models import (
4
  EloBaselineModel,
@@ -10,34 +11,56 @@ from .models import (
10
  LGBMModel
11
  )
12
 
13
- def get_available_models():
14
- """Get a list of all available prediction models.
15
-
16
- Returns:
17
- list: List of instantiated model objects
18
- """
19
- return [
20
- EloBaselineModel(),
21
- LogisticRegressionModel(),
22
- # XGBoostModel(),
23
- # SVCModel(),
24
- # RandomForestModel(),
25
- # BernoulliNBModel(),
26
- LGBMModel(),
27
- ]
28
 
29
  def main():
30
  """
31
  Main entry point to run the prediction pipeline.
32
  You can specify which models to run and the reporting format.
33
  """
34
- args = get_prediction_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Handle conflicting arguments
37
  use_existing_models = not args.no_use_existing_models and args.use_existing_models
38
  force_retrain = args.force_retrain
39
 
40
- # Log model management settings
41
  if args.no_use_existing_models:
42
  print("No-use-existing-models flag set: All models will be retrained from scratch.")
43
  elif force_retrain:
@@ -45,9 +68,21 @@ def main():
45
  elif use_existing_models:
46
  print("Using existing models if available and no new data detected.")
47
 
48
- # Initialize and run prediction pipeline
 
 
 
 
 
 
 
 
 
 
 
 
49
  pipeline = PredictionPipeline(
50
- models=get_available_models(),
51
  use_existing_models=use_existing_models,
52
  force_retrain=force_retrain
53
  )
@@ -57,6 +92,3 @@ def main():
57
  except FileNotFoundError as e:
58
  print(f"Error: {e}")
59
  print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
60
- except Exception as e:
61
- print(f"An unexpected error occurred: {e}")
62
- raise
 
1
+ import argparse
2
+
3
  from .pipeline import PredictionPipeline
4
  from .models import (
5
  EloBaselineModel,
 
11
  LGBMModel
12
  )
13
 
14
+ # --- Define Models to Run ---
15
+ # Instantiate all the models you want to evaluate here.
16
+ MODELS_TO_RUN = [
17
+ EloBaselineModel(),
18
+ LogisticRegressionModel(),
19
+ XGBoostModel(),
20
+ SVCModel(),
21
+ RandomForestModel(),
22
+ BernoulliNBModel(),
23
+ LGBMModel(),
24
+ ]
25
+ # --- End of Model Definition ---
 
 
 
26
 
27
  def main():
28
  """
29
  Main entry point to run the prediction pipeline.
30
  You can specify which models to run and the reporting format.
31
  """
32
+ parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
33
+ parser.add_argument(
34
+ '--report',
35
+ type=str,
36
+ default='detailed',
37
+ choices=['detailed', 'summary'],
38
+ help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
39
+ )
40
+ parser.add_argument(
41
+ '--use-existing-models',
42
+ action='store_true',
43
+ default=True,
44
+ help="Use existing saved models if available and no new data (default: True)."
45
+ )
46
+ parser.add_argument(
47
+ '--no-use-existing-models',
48
+ action='store_true',
49
+ default=False,
50
+ help="Force retrain all models from scratch, ignoring existing saved models."
51
+ )
52
+ parser.add_argument(
53
+ '--force-retrain',
54
+ action='store_true',
55
+ default=False,
56
+ help="Force retrain all models even if no new data is available."
57
+ )
58
+ args = parser.parse_args()
59
 
60
  # Handle conflicting arguments
61
  use_existing_models = not args.no_use_existing_models and args.use_existing_models
62
  force_retrain = args.force_retrain
63
 
 
64
  if args.no_use_existing_models:
65
  print("No-use-existing-models flag set: All models will be retrained from scratch.")
66
  elif force_retrain:
 
68
  elif use_existing_models:
69
  print("Using existing models if available and no new data detected.")
70
 
71
+ # --- Define Models to Run ---
72
+ # Instantiate all the models you want to evaluate here.
73
+ models_to_run = [
74
+ EloBaselineModel(),
75
+ LogisticRegressionModel(),
76
+ XGBoostModel(),
77
+ SVCModel(),
78
+ RandomForestModel(),
79
+ BernoulliNBModel(),
80
+ LGBMModel(),
81
+ ]
82
+ # --- End of Model Definition ---
83
+
84
  pipeline = PredictionPipeline(
85
+ models=MODELS_TO_RUN,
86
  use_existing_models=use_existing_models,
87
  force_retrain=force_retrain
88
  )
 
92
  except FileNotFoundError as e:
93
  print(f"Error: {e}")
94
  print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
 
 
 
src/predict/models.py CHANGED
@@ -1,4 +1,6 @@
1
  from abc import ABC, abstractmethod
 
 
2
  import pandas as pd
3
  from sklearn.linear_model import LogisticRegression
4
  from sklearn.svm import SVC
@@ -8,128 +10,188 @@ from xgboost import XGBClassifier
8
  from lightgbm import LGBMClassifier
9
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
10
  from ..config import FIGHTERS_CSV_PATH
11
- from .preprocess import preprocess_for_ml
12
 
13
  class BaseModel(ABC):
14
- """Abstract base class for all prediction models."""
15
-
16
- def __init__(self):
17
- self.model_name = self.__class__.__name__
18
-
19
  @abstractmethod
20
  def train(self, train_fights):
21
- """Train the model using historical fight data."""
 
 
 
 
22
  pass
23
 
24
  @abstractmethod
25
  def predict(self, fight):
26
- """Predict the winner of a single fight."""
27
- pass
28
 
29
- def _format_prediction(self, winner, probability):
30
- """Format prediction results consistently."""
31
- return {'winner': winner, 'probability': probability}
 
32
 
33
  class EloBaselineModel(BaseModel):
34
- """Simple ELO-based prediction model."""
35
-
 
 
 
 
36
  def train(self, train_fights):
37
- """Process historical fights to calculate current ELO ratings."""
38
- print(f"--- Training {self.model_name} ---")
39
-
40
- # Load and prepare fighter data
 
41
  self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
42
  self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
43
  self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
44
-
45
- # Calculate ELO ratings
46
- elo_ratings = process_fights_for_elo(train_fights)
47
- self.fighters_df['elo'] = pd.Series(elo_ratings)
48
- self.fighters_df['elo'] = self.fighters_df['elo'].fillna(INITIAL_ELO)
49
-
50
- print("ELO ratings calculated for all fighters.")
51
 
52
  def predict(self, fight):
53
- """Predict winner based on current ELO ratings."""
54
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
55
 
56
  try:
57
  f1_elo = self.fighters_df.loc[f1_name, 'elo']
58
  f2_elo = self.fighters_df.loc[f2_name, 'elo']
59
 
60
- # Calculate win probability using ELO formula
61
  prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
62
-
63
- winner = f1_name if prob_f1_wins >= 0.5 else f2_name
64
- probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
65
-
66
- return self._format_prediction(winner, probability)
67
-
68
  except KeyError as e:
69
  print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
70
- return self._format_prediction(None, None)
71
 
72
  class BaseMLModel(BaseModel):
73
- """Base class for all machine learning models."""
74
-
 
 
75
  def __init__(self, model):
76
- super().__init__()
77
  if model is None:
78
  raise ValueError("A model must be provided.")
79
  self.model = model
 
 
80
 
81
  def train(self, train_fights):
82
- """Train the ML model on preprocessed fight data."""
83
- print(f"--- Training {self.model_name} ---")
 
 
 
84
 
85
- # Preprocess data and fit model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
87
  print(f"Fitting model on {X_train.shape[0]} samples...")
88
  self.model.fit(X_train, y_train)
89
  print("Model training complete.")
90
 
91
  def predict(self, fight):
92
- """Predict fight outcome using the trained ML model."""
93
- # Preprocess single fight for prediction
94
- X_pred, _, metadata = preprocess_for_ml([fight], FIGHTERS_CSV_PATH)
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- if X_pred.empty:
97
- print(f"Warning: Could not process fight data for {fight['fighter_1']} vs {fight['fighter_2']}")
98
- return self._format_prediction(None, None)
 
99
 
100
- # Make prediction
101
- try:
102
- prob_f1_wins = self.model.predict_proba(X_pred)[0][1]
103
- winner = fight['fighter_1'] if prob_f1_wins >= 0.5 else fight['fighter_2']
104
- probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
105
-
106
- return self._format_prediction(winner, probability)
107
-
108
- except Exception as e:
109
- print(f"Error making prediction: {e}")
110
- return self._format_prediction(None, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- # Concrete ML model implementations
113
  class LogisticRegressionModel(BaseMLModel):
 
114
  def __init__(self):
115
- super().__init__(LogisticRegression(random_state=42))
 
 
 
 
 
 
116
 
117
  class SVCModel(BaseMLModel):
 
118
  def __init__(self):
119
- super().__init__(SVC(probability=True, random_state=42))
 
120
 
121
  class RandomForestModel(BaseMLModel):
 
122
  def __init__(self):
123
- super().__init__(RandomForestClassifier(n_estimators=100, random_state=42))
124
 
125
  class BernoulliNBModel(BaseMLModel):
 
126
  def __init__(self):
127
- super().__init__(BernoulliNB())
128
-
129
- class XGBoostModel(BaseMLModel):
130
- def __init__(self):
131
- super().__init__(XGBClassifier(random_state=42))
132
 
133
  class LGBMModel(BaseMLModel):
 
134
  def __init__(self):
135
- super().__init__(LGBMClassifier(random_state=42))
 
1
  from abc import ABC, abstractmethod
2
+ import sys
3
+ import os
4
  import pandas as pd
5
  from sklearn.linear_model import LogisticRegression
6
  from sklearn.svm import SVC
 
10
  from lightgbm import LGBMClassifier
11
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
12
  from ..config import FIGHTERS_CSV_PATH
13
+ from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
14
 
15
  class BaseModel(ABC):
16
+ """
17
+ Abstract base class for all prediction models.
18
+ Ensures that every model has a standard interface for training and prediction.
19
+ """
 
20
  @abstractmethod
21
  def train(self, train_fights):
22
+ """
23
+ Trains or prepares the model using historical fight data.
24
+
25
+ :param train_fights: A list of historical fight data dictionaries.
26
+ """
27
  pass
28
 
29
  @abstractmethod
30
  def predict(self, fight):
31
+ """
32
+ Predicts the winner of a single fight.
33
 
34
+ :param fight: A dictionary representing a single fight.
35
+ :return: The name of the predicted winning fighter.
36
+ """
37
+ pass
38
 
39
  class EloBaselineModel(BaseModel):
40
+ """
41
+ A baseline prediction model that predicts the winner based on the higher ELO rating.
42
+ """
43
+ def __init__(self):
44
+ self.fighters_df = None
45
+
46
  def train(self, train_fights):
47
+ """
48
+ For the ELO baseline, 'training' simply consists of loading the fighter data
49
+ to access their ELO scores during prediction.
50
+ """
51
+ print("Training EloBaselineModel: Loading fighter ELO data...")
52
  self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
53
  self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
54
  self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
 
 
 
 
 
 
 
55
 
56
  def predict(self, fight):
57
+ """Predicts the winner based on ELO and calculates win probability."""
58
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
59
 
60
  try:
61
  f1_elo = self.fighters_df.loc[f1_name, 'elo']
62
  f2_elo = self.fighters_df.loc[f2_name, 'elo']
63
 
64
+ # Calculate win probability for fighter 1 using the ELO formula
65
  prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
66
+
67
+ if prob_f1_wins >= 0.5:
68
+ return {'winner': f1_name, 'probability': prob_f1_wins}
69
+ else:
70
+ return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
71
+
72
  except KeyError as e:
73
  print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
74
+ return {'winner': None, 'probability': None}
75
 
76
  class BaseMLModel(BaseModel):
77
+ """
78
+ An abstract base class for machine learning models that handles all common
79
+ data preparation, training, and prediction logic.
80
+ """
81
  def __init__(self, model):
 
82
  if model is None:
83
  raise ValueError("A model must be provided.")
84
  self.model = model
85
+ self.fighters_df = None
86
+ self.fighter_histories = {}
87
 
88
  def train(self, train_fights):
89
+ """
90
+ Trains the machine learning model. This involves loading fighter data,
91
+ pre-calculating histories, and fitting the model on the preprocessed data.
92
+ """
93
+ print(f"--- Training {self.model.__class__.__name__} ---")
94
 
95
+ # 1. Prepare data for prediction-time feature generation
96
+ self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
97
+ self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
98
+ self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
99
+ for col in ['height_cm', 'reach_in', 'elo']:
100
+ if col in self.fighters_df.columns:
101
+ self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
102
+
103
+ # 2. Pre-calculate fighter histories
104
+ train_fights_with_dates = []
105
+ for fight in train_fights:
106
+ fight['date_obj'] = pd.to_datetime(fight['event_date'])
107
+ train_fights_with_dates.append(fight)
108
+ for fighter_name in self.fighters_df.index:
109
+ history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
110
+ self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
111
+
112
+ # 3. Preprocess and fit
113
  X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
114
  print(f"Fitting model on {X_train.shape[0]} samples...")
115
  self.model.fit(X_train, y_train)
116
  print("Model training complete.")
117
 
118
  def predict(self, fight):
119
+ """
120
+ Predicts the outcome of a single fight, returning the winner and probability.
121
+ """
122
+ f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
123
+ fight_date = pd.to_datetime(fight['event_date'])
124
+
125
+ if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
126
+ print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
127
+ return {'winner': None, 'probability': None}
128
+
129
+ f1_stats = self.fighters_df.loc[f1_name]
130
+ f2_stats = self.fighters_df.loc[f2_name]
131
+ if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
132
+ if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
133
 
134
+ f1_hist = self.fighter_histories.get(f1_name, [])
135
+ f2_hist = self.fighter_histories.get(f2_name, [])
136
+ f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
137
+ f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
138
 
139
+ f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
140
+ f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
141
+
142
+ features = {
143
+ 'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
144
+ 'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
145
+ 'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
146
+ 'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
147
+ 'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
148
+ 'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
149
+ 'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
150
+ 'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
151
+ 'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
152
+ 'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
153
+ 'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
154
+ }
155
+
156
+ feature_vector = pd.DataFrame([features]).fillna(0)
157
+
158
+ # Use predict_proba to get probabilities for each class
159
+ probabilities = self.model.predict_proba(feature_vector)[0]
160
+ prob_f1_wins = probabilities[1] # Probability of class '1' (fighter 1 wins)
161
+
162
+ if prob_f1_wins >= 0.5:
163
+ return {'winner': f1_name, 'probability': prob_f1_wins}
164
+ else:
165
+ return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
166
 
 
167
  class LogisticRegressionModel(BaseMLModel):
168
+ """A thin wrapper for scikit-learn's LogisticRegression."""
169
  def __init__(self):
170
+ super().__init__(model=LogisticRegression())
171
+
172
+ class XGBoostModel(BaseMLModel):
173
+ """A thin wrapper for XGBoost's XGBClassifier."""
174
+ def __init__(self):
175
+ model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
176
+ super().__init__(model=model)
177
 
178
  class SVCModel(BaseMLModel):
179
+ """A thin wrapper for scikit-learn's Support Vector Classifier."""
180
  def __init__(self):
181
+ # Probability=True is needed for some reports, though it slows down training
182
+ super().__init__(model=SVC(probability=True, random_state=42))
183
 
184
  class RandomForestModel(BaseMLModel):
185
+ """A thin wrapper for scikit-learn's RandomForestClassifier."""
186
  def __init__(self):
187
+ super().__init__(model=RandomForestClassifier(random_state=42))
188
 
189
  class BernoulliNBModel(BaseMLModel):
190
+ """A thin wrapper for scikit-learn's Bernoulli Naive Bayes classifier."""
191
  def __init__(self):
192
+ super().__init__(model=BernoulliNB())
 
 
 
 
193
 
194
  class LGBMModel(BaseMLModel):
195
+ """A thin wrapper for LightGBM's LGBMClassifier."""
196
  def __init__(self):
197
+ super().__init__(model=LGBMClassifier(random_state=42))
src/predict/preprocess.py CHANGED
@@ -1,14 +1,15 @@
1
  import pandas as pd
2
  import os
3
  from datetime import datetime
 
4
 
5
  def _clean_numeric_column(series):
6
- """Clean string columns into numbers, handling errors."""
7
  series_str = series.astype(str)
8
  return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
9
 
10
  def _calculate_age(dob_str, fight_date_str):
11
- """Calculate age in years from date of birth and fight date strings."""
12
  if pd.isna(dob_str) or not dob_str:
13
  return None
14
  try:
@@ -18,235 +19,213 @@ def _calculate_age(dob_str, fight_date_str):
18
  except (ValueError, TypeError):
19
  return None
20
 
21
- def _get_days_since_last_fight(current_date, past_fights):
22
- """Calculate days since a fighter's last fight."""
23
- if not past_fights:
24
- return None
25
- last_fight_date = past_fights[-1]['date_obj']
26
- return (current_date - last_fight_date).days
27
-
28
- def _get_win_streak(fighter_name, current_date, past_fights):
29
- """Calculate current win streak before a given date."""
30
- streak = 0
31
- for fight in reversed(past_fights):
32
- if fight['date_obj'] >= current_date:
33
- continue
34
- if fight['winner'] == fighter_name:
35
- streak += 1
36
- else:
37
- break
38
- return streak
39
 
40
- def _to_int_safe(value):
41
- """Safely convert a value to integer, returning 0 for invalid values."""
 
 
42
  try:
43
- return int(float(value)) if value and not pd.isna(value) else 0
 
44
  except (ValueError, TypeError):
45
  return 0
46
 
47
- def _get_fighter_history_stats(fighter_name, current_fight_date, past_fights, fighters_df, n_fights=5):
48
- """Calculate historical performance statistics for a fighter."""
49
- # Sort fights by date and get last N fights before current fight
50
- past_fights = [f for f in past_fights if f['date_obj'] < current_fight_date]
51
- past_fights = sorted(past_fights, key=lambda x: x['date_obj'])
52
- last_n_fights = past_fights[-n_fights:] if past_fights else []
53
-
 
 
 
 
 
 
 
 
 
 
 
54
  stats = {
55
- 'wins_last_n': 0,
56
- 'ko_wins': 0,
57
- 'total_finishes': 0,
58
- 'first_round_finishes': 0,
59
- 'knockdowns_scored': 0,
60
- 'knockdowns_absorbed': 0,
61
- 'sig_str_landed': 0,
62
- 'sig_str_attempted': 0,
63
- 'takedowns_landed': 0,
64
- 'takedowns_attempted': 0,
65
- 'sub_attempts': 0,
66
- 'ctrl_time_sec': 0,
67
- 'total_fight_time_sec': 0,
68
- 'fights_in_last_year': 0,
69
- 'avg_opp_elo_last_n': 0
70
  }
71
-
72
- # Calculate fights in last year
73
- one_year_ago = current_fight_date - pd.Timedelta(days=365)
74
- stats['fights_in_last_year'] = len([f for f in past_fights if f['date_obj'] >= one_year_ago])
75
-
76
- # Process each fight
77
- total_opp_elo = 0
78
  for fight in last_n_fights:
79
  is_fighter_1 = (fight['fighter_1'] == fighter_name)
80
- f_prefix = 'f1' if is_fighter_1 else 'f2'
81
- opp_prefix = 'f2' if is_fighter_1 else 'f1'
82
  opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
83
 
84
- # Win/Loss and Finishes
 
85
  if fight['winner'] == fighter_name:
86
- stats['wins_last_n'] += 1
87
- if fight['method'] != 'Decision':
88
- stats['total_finishes'] += 1
89
- if fight['round'] == '1':
90
- stats['first_round_finishes'] += 1
91
  if 'KO' in fight['method']:
92
  stats['ko_wins'] += 1
 
 
 
 
93
 
94
- # Striking and Grappling Stats
95
- stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{f_prefix}_kd'))
96
- stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp_prefix}_kd'))
97
- stats['sig_str_landed'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_landed'))
98
- stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_attempted'))
99
- stats['takedowns_landed'] += _to_int_safe(fight.get(f'{f_prefix}_td_landed'))
100
- stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_td_attempted'))
101
- stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_attempts'))
102
-
103
- # Control Time
104
- ctrl_time = fight.get(f'{f_prefix}_ctrl_time', '0:00')
105
- if isinstance(ctrl_time, str) and ':' in ctrl_time:
106
- mins, secs = map(int, ctrl_time.split(':'))
107
- stats['ctrl_time_sec'] += mins * 60 + secs
108
 
109
- # Fight Duration
110
- round_num = _to_int_safe(fight['round'])
111
- round_time = fight.get('round_time', '0:00')
112
- if isinstance(round_time, str) and ':' in round_time:
113
- mins, secs = map(int, round_time.split(':'))
114
- stats['total_fight_time_sec'] += (round_num - 1) * 300 + mins * 60 + secs
 
 
115
 
116
- # Opponent ELO
117
- if opponent_name in fighters_df.index:
118
- opp_elo = fighters_df.loc[opponent_name, 'elo']
119
- if not pd.isna(opp_elo):
120
- total_opp_elo += opp_elo
121
-
122
- # Calculate averages and rates
123
- n_actual_fights = len(last_n_fights)
124
-
125
- # Always provide all required keys with default values
126
- stats['finish_rate_last_n'] = stats['total_finishes'] / n_actual_fights if n_actual_fights > 0 else 0.0
127
- stats['first_round_finish_rate_last_n'] = stats['first_round_finishes'] / n_actual_fights if n_actual_fights > 0 else 0.0
128
- stats['ko_percent_last_n'] = stats['ko_wins'] / n_actual_fights if n_actual_fights > 0 else 0.0
129
- stats['avg_knockdowns_per_fight_last_n'] = stats['knockdowns_scored'] / n_actual_fights if n_actual_fights > 0 else 0.0
130
- stats['knockdowns_absorbed_per_fight_last_n'] = stats['knockdowns_absorbed'] / n_actual_fights if n_actual_fights > 0 else 0.0
131
- stats['avg_opp_elo_last_n'] = total_opp_elo / n_actual_fights if n_actual_fights > 0 else 1500.0
132
-
133
- # Per-minute stats
134
- total_mins = stats['total_fight_time_sec'] / 60
135
- stats['sig_str_landed_per_min_last_n'] = stats['sig_str_landed'] / total_mins if total_mins > 0 else 0.0
136
- stats['sig_str_absorbed_per_min_last_n'] = stats['sig_str_attempted'] / total_mins if total_mins > 0 else 0.0
137
- stats['sub_attempts_per_min_last_n'] = stats['sub_attempts'] / total_mins if total_mins > 0 else 0.0
138
- stats['avg_ctrl_time_sec_per_min_last_n'] = stats['ctrl_time_sec'] / total_mins if total_mins > 0 else 0.0
139
-
140
- # Accuracy stats
141
- stats['sig_str_defense_last_n'] = stats['sig_str_landed'] / stats['sig_str_attempted'] if stats['sig_str_attempted'] > 0 else 0.5
142
- stats['takedown_accuracy_last_n'] = stats['takedowns_landed'] / stats['takedowns_attempted'] if stats['takedowns_attempted'] > 0 else 0.5
143
- stats['takedown_defense_last_n'] = 1 - (stats['takedowns_landed'] / stats['takedowns_attempted']) if stats['takedowns_attempted'] > 0 else 0.5
144
 
145
- return stats
 
 
 
 
 
 
 
146
 
147
  def preprocess_for_ml(fights_to_process, fighters_csv_path):
148
- """Transform fight data into ML-ready features."""
 
 
 
 
 
 
 
 
 
 
 
 
149
  if not os.path.exists(fighters_csv_path):
150
  raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
151
 
152
- # Load and prepare fighter data
153
  fighters_df = pd.read_csv(fighters_csv_path)
154
- fighters_df['full_name'] = fighters_df['first_name'] + ' ' + fighters_df['last_name']
155
- fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
156
 
 
 
 
 
 
 
 
 
157
  for col in ['height_cm', 'reach_in', 'elo']:
158
- if col in fighters_df.columns:
159
- fighters_df[col] = _clean_numeric_column(fighters_df[col])
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Process fights and calculate features
162
- processed_fights = []
 
 
 
 
 
 
 
 
163
  for fight in fights_to_process:
 
164
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
165
-
166
- # Skip if either fighter is missing
167
- if f1_name not in fighters_df.index or f2_name not in fighters_df.index:
168
  continue
169
-
170
- # Get fighter stats
171
- f1_stats = fighters_df.loc[f1_name]
172
- f2_stats = fighters_df.loc[f2_name]
173
-
174
- # Calculate fight date and ensure date_obj is available
175
- fight_date = pd.to_datetime(fight['event_date'])
176
- fight['date_obj'] = fight_date
177
-
178
- # Get fighter histories and ensure date_obj is available for all fights
179
- f1_hist = [f for f in fights_to_process if f1_name in (f['fighter_1'], f['fighter_2'])]
180
- f2_hist = [f for f in fights_to_process if f2_name in (f['fighter_1'], f['fighter_2'])]
181
-
182
- # Ensure date_obj is available for all historical fights
183
- for hist_fight in f1_hist + f2_hist:
184
- if 'date_obj' not in hist_fight:
185
- hist_fight['date_obj'] = pd.to_datetime(hist_fight['event_date'])
186
-
187
- # Calculate historical stats
188
- f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, fighters_df)
189
- f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, fighters_df)
190
 
191
- # Calculate ages
 
 
 
192
  f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
193
  f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
 
 
 
 
194
 
195
- # Calculate days since last fight
196
- f1_days_since_last = _get_days_since_last_fight(fight_date, f1_hist) or 547 # ~1.5 years if no previous fights
197
- f2_days_since_last = _get_days_since_last_fight(fight_date, f2_hist) or 547
198
-
199
- # Calculate win streaks
200
- f1_win_streak = _get_win_streak(f1_name, fight_date, f1_hist)
201
- f2_win_streak = _get_win_streak(f2_name, fight_date, f2_hist)
202
-
203
- # Compile all features
204
- feature_dict = {
205
- 'winner': 1 if fight.get('winner') == f1_name else 0,
206
- 'date': fight['event_date'],
207
- 'fighter_1': f1_name,
208
- 'fighter_2': f2_name,
209
-
210
- # Physical differences
211
- 'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
212
- 'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
213
- 'age_diff': (f1_age or 0) - (f2_age or 0),
214
  'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
215
-
216
- # Career momentum
217
- 'days_since_last_fight_diff': f1_days_since_last - f2_days_since_last,
218
- 'win_streak_diff': f1_win_streak - f2_win_streak,
219
- 'fights_last_year_diff': f1_hist_stats['fights_in_last_year'] - f2_hist_stats['fights_in_last_year'],
220
-
221
- # Performance differences
222
- 'finish_rate_diff': f1_hist_stats['finish_rate_last_n'] - f2_hist_stats['finish_rate_last_n'],
223
- 'ko_rate_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
224
- 'sig_str_per_min_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
225
- 'td_accuracy_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
226
- 'sub_attempts_per_min_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
227
- 'control_time_diff': f1_hist_stats['avg_ctrl_time_sec_per_min_last_n'] - f2_hist_stats['avg_ctrl_time_sec_per_min_last_n'],
228
-
229
- # Defense differences
230
- 'sig_str_defense_diff': f1_hist_stats['sig_str_defense_last_n'] - f2_hist_stats['sig_str_defense_last_n'],
231
- 'td_defense_diff': f1_hist_stats['takedown_defense_last_n'] - f2_hist_stats['takedown_defense_last_n'],
232
- 'knockdowns_absorbed_diff': f1_hist_stats['knockdowns_absorbed_per_fight_last_n'] - f2_hist_stats['knockdowns_absorbed_per_fight_last_n']
233
  }
 
 
 
 
 
 
 
 
234
 
235
- processed_fights.append(feature_dict)
236
-
237
- if not processed_fights:
238
- return pd.DataFrame(), pd.Series(), pd.DataFrame()
239
-
240
- # Create final dataframes
241
- df = pd.DataFrame(processed_fights)
242
- metadata = df[['date', 'fighter_1', 'fighter_2', 'winner']]
243
-
244
- # Prepare X and y
245
- y = df['winner']
246
- X = df.drop(columns=['winner', 'date', 'fighter_1', 'fighter_2'])
247
- X = X.reindex(sorted(X.columns), axis=1) # Ensure consistent column order
248
-
249
- # Handle missing values by filling NaNs with 0
250
- X = X.fillna(0)
251
-
252
  return X, y, metadata
 
1
  import pandas as pd
2
  import os
3
  from datetime import datetime
4
+ from ..config import FIGHTERS_CSV_PATH
5
 
6
  def _clean_numeric_column(series):
7
+ """A helper to clean string columns into numbers, handling errors."""
8
  series_str = series.astype(str)
9
  return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
10
 
11
  def _calculate_age(dob_str, fight_date_str):
12
+ """Calculates age in years from a date of birth string and fight date string."""
13
  if pd.isna(dob_str) or not dob_str:
14
  return None
15
  try:
 
19
  except (ValueError, TypeError):
20
  return None
21
 
22
+ def _parse_round_time_to_seconds(round_str, time_str):
23
+ """Converts fight duration from round and time to total seconds."""
24
+ try:
25
+ rounds = int(round_str)
26
+ minutes, seconds = map(int, time_str.split(':'))
27
+ # Assuming 5-minute rounds for calculation simplicity
28
+ return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
29
+ except (ValueError, TypeError, AttributeError):
30
+ return 0
31
+
32
+ def _parse_striking_stats(stat_str):
33
+ """Parses striking stats string like '10 of 20' into (landed, attempted)."""
34
+ try:
35
+ landed, attempted = map(int, stat_str.split(' of '))
36
+ return landed, attempted
37
+ except (ValueError, TypeError, AttributeError):
38
+ return 0, 0
 
39
 
40
+ def _to_int_safe(val):
41
+ """Safely converts a value to an integer, returning 0 if it's invalid or empty."""
42
+ if pd.isna(val):
43
+ return 0
44
  try:
45
+ # handle strings with whitespace or empty strings
46
+ return int(str(val).strip() or 0)
47
  except (ValueError, TypeError):
48
  return 0
49
 
50
def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
    """
    Summarise a fighter's form over their most recent fights before a given date.

    Only fights whose 'date_obj' is strictly earlier than `current_fight_date`
    are considered, and of those the last `n` entries are used, so
    `fighter_history` is expected to be in chronological order.

    Args:
        fighter_name (str): Fighter whose history is being summarised.
        current_fight_date: Cut-off date; later or same-day fights are ignored.
        fighter_history (list of dict): Fights involving the fighter.
        fighters_df (pd.DataFrame): Fighter table indexed by name with an 'elo' column.
        n (int): Number of most recent fights to aggregate.

    Returns:
        dict: Win count, average opponent ELO, KO share of wins, significant
        strikes landed per minute, takedown accuracy and submission attempts
        per minute over the selected fights.
    """
    earlier_bouts = [bout for bout in fighter_history if bout['date_obj'] < current_fight_date]
    recent_bouts = earlier_bouts[-n:]

    if not recent_bouts:
        # No prior fights: neutral defaults with the same keys as the normal result.
        return {
            'wins_last_n': 0,
            'avg_opp_elo_last_n': 1500,  # Assume average ELO for first opponent
            'ko_percent_last_n': 0,
            'sig_str_landed_per_min_last_n': 0,
            'takedown_accuracy_last_n': 0,
            'sub_attempts_per_min_last_n': 0,
        }

    wins = 0
    ko_wins = 0
    seconds_fought = 0
    strikes_landed = 0
    takedowns_landed = 0
    takedowns_attempted = 0
    submission_attempts = 0
    opponent_elos = []

    for bout in recent_bouts:
        # Work out which corner the fighter occupied to pick the right stat columns.
        if bout['fighter_1'] == fighter_name:
            opponent = bout['fighter_2']
            corner = 'f1'
        else:
            opponent = bout['fighter_1']
            corner = 'f2'

        if bout['winner'] == fighter_name:
            wins += 1
            if 'KO' in bout['method']:
                ko_wins += 1

        if opponent in fighters_df.index:
            elo = fighters_df.loc[opponent, 'elo']
            # A missing ELO counts as the 1500 baseline.
            opponent_elos.append(elo if pd.notna(elo) else 1500)

        seconds_fought += _parse_round_time_to_seconds(bout['round'], bout['time'])

        landed, _ = _parse_striking_stats(bout.get(f'{corner}_sig_str', '0 of 0'))
        strikes_landed += landed

        # Takedown stats share the 'X of Y' format, so the same parser applies.
        td_hits, td_tries = _parse_striking_stats(bout.get(f'{corner}_td', '0 of 0'))
        takedowns_landed += td_hits
        takedowns_attempted += td_tries

        submission_attempts += _to_int_safe(bout.get(f'{corner}_sub_att'))

    # Final calculations
    if opponent_elos:
        avg_opp_elo = sum(opponent_elos) / len(opponent_elos)
    else:
        avg_opp_elo = 1500
    total_minutes = seconds_fought / 60 if seconds_fought > 0 else 0

    return {
        'wins_last_n': wins,
        'avg_opp_elo_last_n': avg_opp_elo,
        'ko_percent_last_n': (ko_wins / wins) if wins > 0 else 0,
        'sig_str_landed_per_min_last_n': (strikes_landed / total_minutes) if total_minutes > 0 else 0,
        'takedown_accuracy_last_n': (takedowns_landed / takedowns_attempted) if takedowns_attempted > 0 else 0,
        'sub_attempts_per_min_last_n': (submission_attempts / total_minutes) if total_minutes > 0 else 0,
    }
114
 
115
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """
    Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
    suitable for a binary classification machine learning model.

    Each usable fight yields two samples: the fighter_1-minus-fighter_2
    feature differences labelled 1, and the negated differences labelled 0.
    Fights where either fighter is missing from the fighters CSV are skipped.

    Note: every dict in `fights_to_process` is mutated in place; a 'date_obj'
    key holding the parsed event date is added to it.

    Args:
        fights_to_process (list of dict): The list of fights to process.
        fighters_csv_path (str): Path to the CSV file with all fighter stats.

    Returns:
        pd.DataFrame: Feature matrix X.
        pd.Series: Target vector y.
        pd.DataFrame: Metadata DataFrame.

    Raises:
        FileNotFoundError: If `fighters_csv_path` does not exist.
    """
    # Local import so this block stays self-contained.
    from collections import defaultdict

    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    fighters_df = pd.read_csv(fighters_csv_path)

    # 1. Prepare fighters data for lookups by full name
    fighters_prepared = fighters_df.copy()
    fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']

    # Handle duplicate fighter names by keeping the first entry; this also
    # guarantees a unique index, so .loc[name] below always yields one row.
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_prepared.columns:
            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])

    # 2. Convert date strings to datetime objects once
    for fight in fights_to_process:
        try:
            # This will work if event_date is a string
            fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
        except TypeError:
            # This will be triggered if it's already a date-like object (e.g., Timestamp)
            fight['date_obj'] = fight['event_date']

    # Group fights by fighter in a single pass over the fights instead of
    # rescanning the whole fight list once per fighter in the CSV.
    known_fighters = set(fighters_prepared.index)
    fighter_histories = defaultdict(list)
    for fight in fights_to_process:
        # A set avoids adding the fight twice if both corners share a name.
        for name in {fight['fighter_1'], fight['fighter_2']}:
            if name in known_fighters:
                fighter_histories[name].append(fight)
    for history in fighter_histories.values():
        # Stable sort: same-day fights keep their input order.
        history.sort(key=lambda bout: bout['date_obj'])

    # 3. Process fights to create features and targets
    feature_list = []
    target_list = []
    metadata_list = []

    for fight in fights_to_process:
        # Per the dataset's design, fighter_1 is always the winner.
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        if f1_name not in fighters_prepared.index or f2_name not in fighters_prepared.index:
            continue

        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]

        # Calculate ages for both fighters
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Get historical stats for both fighters
        f1_hist_stats = _get_fighter_history_stats(
            f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared
        )
        f2_hist_stats = _get_fighter_history_stats(
            f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared
        )

        # --- Create two training examples from each fight for a balanced dataset ---

        # 1. The "Win" case: (fighter_1 - fighter_2)
        features_win = {
            # Original diffs
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            # New historical diffs
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            # Grappling features
            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
        }
        feature_list.append(features_win)
        target_list.append(1)  # 1 represents a win

        # 2. The "Loss" case: (fighter_2 - fighter_1)
        # We invert the differences for the losing case.
        features_loss = {key: -value for key, value in features_win.items()}
        # Stance difference is symmetric; it doesn't get inverted.
        features_loss['stance_is_different'] = features_win['stance_is_different']

        feature_list.append(features_loss)
        target_list.append(0)  # 0 represents a loss

        # Add metadata for both generated samples.
        # The 'winner' and 'loser' are consistent with the original data structure.
        metadata_list.append({
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        })
        metadata_list.append({
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        })

    X = pd.DataFrame(feature_list).fillna(0)
    y = pd.Series(target_list, name='winner')
    metadata = pd.DataFrame(metadata_list)

    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
    return X, y, metadata