Spaces:
Sleeping
Sleeping
Commit
·
ffd453e
1
Parent(s):
7fcaffe
Revert "(CAREFUL!!!!!!) Refactor argument parsing and prediction pipeline"
Browse filesThis reverts commit 7fcaffe1b17b52b1451fe966549d2708dc218ffc.
- output/model_results.json +2 -2
- output/models/EloBaselineModel.joblib +2 -2
- output/models/LogisticRegressionModel.joblib +2 -2
- src/args.py +0 -97
- src/main.py +92 -65
- src/predict/main.py +55 -23
- src/predict/models.py +129 -67
- src/predict/preprocess.py +175 -196
output/model_results.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf8df1ba9e26fa98e34bfb1c773e66576cbf89152087c55b70921269c84f39d5
|
3 |
+
size 27286
|
output/models/EloBaselineModel.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfdc684f791b598fbecfbfe9b14cca3b4d483b3d7368a16faecea31aace3be87
|
3 |
+
size 938419
|
output/models/LogisticRegressionModel.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7a773552b7f1b166858ab1ff7bdf472e24b293279a8e24871de773b1a3de46e1
|
3 |
+
size 5517988
|
src/args.py
DELETED
@@ -1,97 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
|
3 |
-
def get_pipeline_args():
|
4 |
-
"""
|
5 |
-
Parse command line arguments for the main UFC data pipeline.
|
6 |
-
|
7 |
-
Returns:
|
8 |
-
argparse.Namespace: Parsed command line arguments
|
9 |
-
"""
|
10 |
-
parser = argparse.ArgumentParser(description="UFC Data Pipeline")
|
11 |
-
|
12 |
-
# Pipeline selection
|
13 |
-
parser.add_argument(
|
14 |
-
'--pipeline',
|
15 |
-
type=str,
|
16 |
-
default='scrape',
|
17 |
-
choices=['scrape', 'analysis', 'predict', 'update', 'all'],
|
18 |
-
help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
|
19 |
-
)
|
20 |
-
|
21 |
-
# Scraping arguments
|
22 |
-
scraping_group = parser.add_argument_group('Scraping options')
|
23 |
-
scraping_group.add_argument(
|
24 |
-
'--scrape-mode',
|
25 |
-
type=str,
|
26 |
-
default='full',
|
27 |
-
choices=['full', 'update'],
|
28 |
-
help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
|
29 |
-
)
|
30 |
-
scraping_group.add_argument(
|
31 |
-
'--num-events',
|
32 |
-
type=int,
|
33 |
-
default=5,
|
34 |
-
help="Number of latest events to scrape in update mode (default: 5)"
|
35 |
-
)
|
36 |
-
|
37 |
-
# Model management arguments
|
38 |
-
model_group = parser.add_argument_group('Model management')
|
39 |
-
model_group.add_argument(
|
40 |
-
'--use-existing-models',
|
41 |
-
action='store_true',
|
42 |
-
default=True,
|
43 |
-
help="Use existing saved models if available and no new data (default: True)"
|
44 |
-
)
|
45 |
-
model_group.add_argument(
|
46 |
-
'--no-use-existing-models',
|
47 |
-
action='store_true',
|
48 |
-
default=False,
|
49 |
-
help="Force retrain all models from scratch, ignoring existing saved models"
|
50 |
-
)
|
51 |
-
model_group.add_argument(
|
52 |
-
'--force-retrain',
|
53 |
-
action='store_true',
|
54 |
-
default=False,
|
55 |
-
help="Force retrain all models even if no new data is available"
|
56 |
-
)
|
57 |
-
|
58 |
-
return parser.parse_args()
|
59 |
-
|
60 |
-
def get_prediction_args():
|
61 |
-
"""
|
62 |
-
Parse command line arguments specific to the prediction pipeline.
|
63 |
-
|
64 |
-
Returns:
|
65 |
-
argparse.Namespace: Parsed command line arguments
|
66 |
-
"""
|
67 |
-
parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
|
68 |
-
|
69 |
-
parser.add_argument(
|
70 |
-
'--report',
|
71 |
-
type=str,
|
72 |
-
default='detailed',
|
73 |
-
choices=['detailed', 'summary'],
|
74 |
-
help="Type of report to generate: 'detailed' (file) or 'summary' (console)"
|
75 |
-
)
|
76 |
-
|
77 |
-
model_group = parser.add_argument_group('Model management')
|
78 |
-
model_group.add_argument(
|
79 |
-
'--use-existing-models',
|
80 |
-
action='store_true',
|
81 |
-
default=True,
|
82 |
-
help="Use existing saved models if available and no new data (default: True)"
|
83 |
-
)
|
84 |
-
model_group.add_argument(
|
85 |
-
'--no-use-existing-models',
|
86 |
-
action='store_true',
|
87 |
-
default=False,
|
88 |
-
help="Force retrain all models from scratch, ignoring existing saved models"
|
89 |
-
)
|
90 |
-
model_group.add_argument(
|
91 |
-
'--force-retrain',
|
92 |
-
action='store_true',
|
93 |
-
default=False,
|
94 |
-
help="Force retrain all models even if no new data is available"
|
95 |
-
)
|
96 |
-
|
97 |
-
return parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/main.py
CHANGED
@@ -1,79 +1,106 @@
|
|
|
|
1 |
import sys
|
2 |
import os
|
3 |
-
from .args import get_pipeline_args
|
4 |
-
|
5 |
-
def run_scraping_pipeline(args):
|
6 |
-
"""Execute the scraping pipeline with given arguments."""
|
7 |
-
print("=== Running Scraping Pipeline ===")
|
8 |
-
from .scrape.main import main as scrape_main
|
9 |
-
|
10 |
-
# Pass arguments to scrape.main
|
11 |
-
original_argv = sys.argv
|
12 |
-
sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
|
13 |
-
try:
|
14 |
-
scrape_main()
|
15 |
-
finally:
|
16 |
-
sys.argv = original_argv
|
17 |
-
|
18 |
-
def run_analysis_pipeline():
|
19 |
-
"""Execute the ELO analysis pipeline."""
|
20 |
-
print("\n=== Running ELO Analysis ===")
|
21 |
-
from .analysis.elo import main as elo_main
|
22 |
-
elo_main()
|
23 |
-
|
24 |
-
def run_prediction_pipeline(args):
|
25 |
-
"""Execute the prediction pipeline with given arguments."""
|
26 |
-
print("\n=== Running Prediction Pipeline ===")
|
27 |
-
from .predict.main import main as predict_main
|
28 |
-
|
29 |
-
# Pass model management arguments to predict.main
|
30 |
-
original_argv = sys.argv
|
31 |
-
predict_args = ['predict_main']
|
32 |
-
|
33 |
-
if args.no_use_existing_models:
|
34 |
-
predict_args.append('--no-use-existing-models')
|
35 |
-
elif args.use_existing_models:
|
36 |
-
predict_args.append('--use-existing-models')
|
37 |
-
|
38 |
-
if args.force_retrain:
|
39 |
-
predict_args.append('--force-retrain')
|
40 |
-
|
41 |
-
sys.argv = predict_args
|
42 |
-
try:
|
43 |
-
predict_main()
|
44 |
-
finally:
|
45 |
-
sys.argv = original_argv
|
46 |
-
|
47 |
-
def run_model_update(args):
|
48 |
-
"""Execute the model update pipeline."""
|
49 |
-
print("\n=== Running Model Update Pipeline ===")
|
50 |
-
try:
|
51 |
-
from .predict.main import MODELS_TO_RUN
|
52 |
-
from .predict.pipeline import PredictionPipeline
|
53 |
-
except ImportError:
|
54 |
-
print("Fatal: Could not import prediction modules.")
|
55 |
-
print("Please ensure your project structure and python path are correct.")
|
56 |
-
return
|
57 |
-
|
58 |
-
pipeline = PredictionPipeline(models=MODELS_TO_RUN)
|
59 |
-
pipeline.update_models_if_new_data()
|
60 |
|
61 |
def main():
|
62 |
-
"""
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
-
# Execute requested pipeline(s)
|
66 |
if args.pipeline in ['scrape', 'all']:
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
if args.pipeline in ['analysis', 'all']:
|
70 |
-
|
71 |
-
|
|
|
|
|
72 |
if args.pipeline == 'update':
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
if args.pipeline in ['predict', 'all']:
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
if __name__ == '__main__':
|
79 |
main()
|
|
|
1 |
+
import argparse
|
2 |
import sys
|
3 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
def main():
|
6 |
+
"""
|
7 |
+
Main entry point for the UFC data pipeline.
|
8 |
+
Supports scraping, analysis, and prediction workflows.
|
9 |
+
"""
|
10 |
+
parser = argparse.ArgumentParser(description="UFC Data Pipeline")
|
11 |
+
parser.add_argument(
|
12 |
+
'--pipeline',
|
13 |
+
type=str,
|
14 |
+
default='scrape',
|
15 |
+
choices=['scrape', 'analysis', 'predict', 'update', 'all'],
|
16 |
+
help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
|
17 |
+
)
|
18 |
+
parser.add_argument(
|
19 |
+
'--scrape-mode',
|
20 |
+
type=str,
|
21 |
+
default='full',
|
22 |
+
choices=['full', 'update'],
|
23 |
+
help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
|
24 |
+
)
|
25 |
+
parser.add_argument(
|
26 |
+
'--num-events',
|
27 |
+
type=int,
|
28 |
+
default=5,
|
29 |
+
help="Number of latest events to scrape in update mode (default: 5)"
|
30 |
+
)
|
31 |
+
# Model management arguments for prediction pipeline
|
32 |
+
parser.add_argument(
|
33 |
+
'--use-existing-models',
|
34 |
+
action='store_true',
|
35 |
+
default=True,
|
36 |
+
help="Use existing saved models if available and no new data (default: True)."
|
37 |
+
)
|
38 |
+
parser.add_argument(
|
39 |
+
'--no-use-existing-models',
|
40 |
+
action='store_true',
|
41 |
+
default=False,
|
42 |
+
help="Force retrain all models from scratch, ignoring existing saved models."
|
43 |
+
)
|
44 |
+
parser.add_argument(
|
45 |
+
'--force-retrain',
|
46 |
+
action='store_true',
|
47 |
+
default=False,
|
48 |
+
help="Force retrain all models even if no new data is available."
|
49 |
+
)
|
50 |
+
|
51 |
+
args = parser.parse_args()
|
52 |
|
|
|
53 |
if args.pipeline in ['scrape', 'all']:
|
54 |
+
print("=== Running Scraping Pipeline ===")
|
55 |
+
from src.scrape.main import main as scrape_main
|
56 |
+
|
57 |
+
# Override sys.argv to pass arguments to scrape.main
|
58 |
+
original_argv = sys.argv
|
59 |
+
sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
|
60 |
+
try:
|
61 |
+
scrape_main()
|
62 |
+
finally:
|
63 |
+
sys.argv = original_argv
|
64 |
|
65 |
if args.pipeline in ['analysis', 'all']:
|
66 |
+
print("\n=== Running ELO Analysis ===")
|
67 |
+
from src.analysis.elo import main as elo_main
|
68 |
+
elo_main()
|
69 |
+
|
70 |
if args.pipeline == 'update':
|
71 |
+
print("\n=== Running Model Update Pipeline ===")
|
72 |
+
try:
|
73 |
+
from src.predict.main import MODELS_TO_RUN
|
74 |
+
from src.predict.pipeline import PredictionPipeline
|
75 |
+
except ImportError:
|
76 |
+
print("Fatal: Could not import prediction modules.")
|
77 |
+
print("Please ensure your project structure and python path are correct.")
|
78 |
+
return
|
79 |
+
|
80 |
+
pipeline = PredictionPipeline(models=MODELS_TO_RUN)
|
81 |
+
pipeline.update_models_if_new_data()
|
82 |
|
83 |
if args.pipeline in ['predict', 'all']:
|
84 |
+
print("\n=== Running Prediction Pipeline ===")
|
85 |
+
from src.predict.main import main as predict_main
|
86 |
+
|
87 |
+
# Override sys.argv to pass model management arguments to predict.main
|
88 |
+
original_argv = sys.argv
|
89 |
+
predict_args = ['predict_main']
|
90 |
+
|
91 |
+
if args.no_use_existing_models:
|
92 |
+
predict_args.append('--no-use-existing-models')
|
93 |
+
elif args.use_existing_models:
|
94 |
+
predict_args.append('--use-existing-models')
|
95 |
+
|
96 |
+
if args.force_retrain:
|
97 |
+
predict_args.append('--force-retrain')
|
98 |
+
|
99 |
+
sys.argv = predict_args
|
100 |
+
try:
|
101 |
+
predict_main()
|
102 |
+
finally:
|
103 |
+
sys.argv = original_argv
|
104 |
|
105 |
if __name__ == '__main__':
|
106 |
main()
|
src/predict/main.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
-
|
|
|
2 |
from .pipeline import PredictionPipeline
|
3 |
from .models import (
|
4 |
EloBaselineModel,
|
@@ -10,34 +11,56 @@ from .models import (
|
|
10 |
LGBMModel
|
11 |
)
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
# BernoulliNBModel(),
|
26 |
-
LGBMModel(),
|
27 |
-
]
|
28 |
|
29 |
def main():
|
30 |
"""
|
31 |
Main entry point to run the prediction pipeline.
|
32 |
You can specify which models to run and the reporting format.
|
33 |
"""
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
# Handle conflicting arguments
|
37 |
use_existing_models = not args.no_use_existing_models and args.use_existing_models
|
38 |
force_retrain = args.force_retrain
|
39 |
|
40 |
-
# Log model management settings
|
41 |
if args.no_use_existing_models:
|
42 |
print("No-use-existing-models flag set: All models will be retrained from scratch.")
|
43 |
elif force_retrain:
|
@@ -45,9 +68,21 @@ def main():
|
|
45 |
elif use_existing_models:
|
46 |
print("Using existing models if available and no new data detected.")
|
47 |
|
48 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
pipeline = PredictionPipeline(
|
50 |
-
models=
|
51 |
use_existing_models=use_existing_models,
|
52 |
force_retrain=force_retrain
|
53 |
)
|
@@ -57,6 +92,3 @@ def main():
|
|
57 |
except FileNotFoundError as e:
|
58 |
print(f"Error: {e}")
|
59 |
print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
|
60 |
-
except Exception as e:
|
61 |
-
print(f"An unexpected error occurred: {e}")
|
62 |
-
raise
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
from .pipeline import PredictionPipeline
|
4 |
from .models import (
|
5 |
EloBaselineModel,
|
|
|
11 |
LGBMModel
|
12 |
)
|
13 |
|
14 |
+
# --- Define Models to Run ---
|
15 |
+
# Instantiate all the models you want to evaluate here.
|
16 |
+
MODELS_TO_RUN = [
|
17 |
+
EloBaselineModel(),
|
18 |
+
LogisticRegressionModel(),
|
19 |
+
XGBoostModel(),
|
20 |
+
SVCModel(),
|
21 |
+
RandomForestModel(),
|
22 |
+
BernoulliNBModel(),
|
23 |
+
LGBMModel(),
|
24 |
+
]
|
25 |
+
# --- End of Model Definition ---
|
|
|
|
|
|
|
26 |
|
27 |
def main():
|
28 |
"""
|
29 |
Main entry point to run the prediction pipeline.
|
30 |
You can specify which models to run and the reporting format.
|
31 |
"""
|
32 |
+
parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
|
33 |
+
parser.add_argument(
|
34 |
+
'--report',
|
35 |
+
type=str,
|
36 |
+
default='detailed',
|
37 |
+
choices=['detailed', 'summary'],
|
38 |
+
help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
|
39 |
+
)
|
40 |
+
parser.add_argument(
|
41 |
+
'--use-existing-models',
|
42 |
+
action='store_true',
|
43 |
+
default=True,
|
44 |
+
help="Use existing saved models if available and no new data (default: True)."
|
45 |
+
)
|
46 |
+
parser.add_argument(
|
47 |
+
'--no-use-existing-models',
|
48 |
+
action='store_true',
|
49 |
+
default=False,
|
50 |
+
help="Force retrain all models from scratch, ignoring existing saved models."
|
51 |
+
)
|
52 |
+
parser.add_argument(
|
53 |
+
'--force-retrain',
|
54 |
+
action='store_true',
|
55 |
+
default=False,
|
56 |
+
help="Force retrain all models even if no new data is available."
|
57 |
+
)
|
58 |
+
args = parser.parse_args()
|
59 |
|
60 |
# Handle conflicting arguments
|
61 |
use_existing_models = not args.no_use_existing_models and args.use_existing_models
|
62 |
force_retrain = args.force_retrain
|
63 |
|
|
|
64 |
if args.no_use_existing_models:
|
65 |
print("No-use-existing-models flag set: All models will be retrained from scratch.")
|
66 |
elif force_retrain:
|
|
|
68 |
elif use_existing_models:
|
69 |
print("Using existing models if available and no new data detected.")
|
70 |
|
71 |
+
# --- Define Models to Run ---
|
72 |
+
# Instantiate all the models you want to evaluate here.
|
73 |
+
models_to_run = [
|
74 |
+
EloBaselineModel(),
|
75 |
+
LogisticRegressionModel(),
|
76 |
+
XGBoostModel(),
|
77 |
+
SVCModel(),
|
78 |
+
RandomForestModel(),
|
79 |
+
BernoulliNBModel(),
|
80 |
+
LGBMModel(),
|
81 |
+
]
|
82 |
+
# --- End of Model Definition ---
|
83 |
+
|
84 |
pipeline = PredictionPipeline(
|
85 |
+
models=MODELS_TO_RUN,
|
86 |
use_existing_models=use_existing_models,
|
87 |
force_retrain=force_retrain
|
88 |
)
|
|
|
92 |
except FileNotFoundError as e:
|
93 |
print(f"Error: {e}")
|
94 |
print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
|
|
|
|
|
|
src/predict/models.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
from abc import ABC, abstractmethod
|
|
|
|
|
2 |
import pandas as pd
|
3 |
from sklearn.linear_model import LogisticRegression
|
4 |
from sklearn.svm import SVC
|
@@ -8,128 +10,188 @@ from xgboost import XGBClassifier
|
|
8 |
from lightgbm import LGBMClassifier
|
9 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
10 |
from ..config import FIGHTERS_CSV_PATH
|
11 |
-
from .preprocess import preprocess_for_ml
|
12 |
|
13 |
class BaseModel(ABC):
|
14 |
-
"""
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
@abstractmethod
|
20 |
def train(self, train_fights):
|
21 |
-
"""
|
|
|
|
|
|
|
|
|
22 |
pass
|
23 |
|
24 |
@abstractmethod
|
25 |
def predict(self, fight):
|
26 |
-
"""
|
27 |
-
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
32 |
|
33 |
class EloBaselineModel(BaseModel):
|
34 |
-
"""
|
35 |
-
|
|
|
|
|
|
|
|
|
36 |
def train(self, train_fights):
|
37 |
-
"""
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
41 |
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
42 |
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
43 |
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
44 |
-
|
45 |
-
# Calculate ELO ratings
|
46 |
-
elo_ratings = process_fights_for_elo(train_fights)
|
47 |
-
self.fighters_df['elo'] = pd.Series(elo_ratings)
|
48 |
-
self.fighters_df['elo'] = self.fighters_df['elo'].fillna(INITIAL_ELO)
|
49 |
-
|
50 |
-
print("ELO ratings calculated for all fighters.")
|
51 |
|
52 |
def predict(self, fight):
|
53 |
-
"""
|
54 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
55 |
|
56 |
try:
|
57 |
f1_elo = self.fighters_df.loc[f1_name, 'elo']
|
58 |
f2_elo = self.fighters_df.loc[f2_name, 'elo']
|
59 |
|
60 |
-
# Calculate win probability using ELO formula
|
61 |
prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
except KeyError as e:
|
69 |
print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
|
70 |
-
return
|
71 |
|
72 |
class BaseMLModel(BaseModel):
|
73 |
-
"""
|
74 |
-
|
|
|
|
|
75 |
def __init__(self, model):
|
76 |
-
super().__init__()
|
77 |
if model is None:
|
78 |
raise ValueError("A model must be provided.")
|
79 |
self.model = model
|
|
|
|
|
80 |
|
81 |
def train(self, train_fights):
|
82 |
-
"""
|
83 |
-
|
|
|
|
|
|
|
84 |
|
85 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
|
87 |
print(f"Fitting model on {X_train.shape[0]} samples...")
|
88 |
self.model.fit(X_train, y_train)
|
89 |
print("Model training complete.")
|
90 |
|
91 |
def predict(self, fight):
|
92 |
-
"""
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
# Concrete ML model implementations
|
113 |
class LogisticRegressionModel(BaseMLModel):
|
|
|
114 |
def __init__(self):
|
115 |
-
super().__init__(LogisticRegression(
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
class SVCModel(BaseMLModel):
|
|
|
118 |
def __init__(self):
|
119 |
-
|
|
|
120 |
|
121 |
class RandomForestModel(BaseMLModel):
|
|
|
122 |
def __init__(self):
|
123 |
-
super().__init__(RandomForestClassifier(
|
124 |
|
125 |
class BernoulliNBModel(BaseMLModel):
|
|
|
126 |
def __init__(self):
|
127 |
-
super().__init__(BernoulliNB())
|
128 |
-
|
129 |
-
class XGBoostModel(BaseMLModel):
|
130 |
-
def __init__(self):
|
131 |
-
super().__init__(XGBClassifier(random_state=42))
|
132 |
|
133 |
class LGBMModel(BaseMLModel):
|
|
|
134 |
def __init__(self):
|
135 |
-
super().__init__(LGBMClassifier(random_state=42))
|
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
+
import sys
|
3 |
+
import os
|
4 |
import pandas as pd
|
5 |
from sklearn.linear_model import LogisticRegression
|
6 |
from sklearn.svm import SVC
|
|
|
10 |
from lightgbm import LGBMClassifier
|
11 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
12 |
from ..config import FIGHTERS_CSV_PATH
|
13 |
+
from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
|
14 |
|
15 |
class BaseModel(ABC):
|
16 |
+
"""
|
17 |
+
Abstract base class for all prediction models.
|
18 |
+
Ensures that every model has a standard interface for training and prediction.
|
19 |
+
"""
|
|
|
20 |
@abstractmethod
|
21 |
def train(self, train_fights):
|
22 |
+
"""
|
23 |
+
Trains or prepares the model using historical fight data.
|
24 |
+
|
25 |
+
:param train_fights: A list of historical fight data dictionaries.
|
26 |
+
"""
|
27 |
pass
|
28 |
|
29 |
@abstractmethod
|
30 |
def predict(self, fight):
|
31 |
+
"""
|
32 |
+
Predicts the winner of a single fight.
|
33 |
|
34 |
+
:param fight: A dictionary representing a single fight.
|
35 |
+
:return: The name of the predicted winning fighter.
|
36 |
+
"""
|
37 |
+
pass
|
38 |
|
39 |
class EloBaselineModel(BaseModel):
|
40 |
+
"""
|
41 |
+
A baseline prediction model that predicts the winner based on the higher ELO rating.
|
42 |
+
"""
|
43 |
+
def __init__(self):
|
44 |
+
self.fighters_df = None
|
45 |
+
|
46 |
def train(self, train_fights):
|
47 |
+
"""
|
48 |
+
For the ELO baseline, 'training' simply consists of loading the fighter data
|
49 |
+
to access their ELO scores during prediction.
|
50 |
+
"""
|
51 |
+
print("Training EloBaselineModel: Loading fighter ELO data...")
|
52 |
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
53 |
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
54 |
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def predict(self, fight):
|
57 |
+
"""Predicts the winner based on ELO and calculates win probability."""
|
58 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
59 |
|
60 |
try:
|
61 |
f1_elo = self.fighters_df.loc[f1_name, 'elo']
|
62 |
f2_elo = self.fighters_df.loc[f2_name, 'elo']
|
63 |
|
64 |
+
# Calculate win probability for fighter 1 using the ELO formula
|
65 |
prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
|
66 |
+
|
67 |
+
if prob_f1_wins >= 0.5:
|
68 |
+
return {'winner': f1_name, 'probability': prob_f1_wins}
|
69 |
+
else:
|
70 |
+
return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
|
71 |
+
|
72 |
except KeyError as e:
|
73 |
print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
|
74 |
+
return {'winner': None, 'probability': None}
|
75 |
|
76 |
class BaseMLModel(BaseModel):
|
77 |
+
"""
|
78 |
+
An abstract base class for machine learning models that handles all common
|
79 |
+
data preparation, training, and prediction logic.
|
80 |
+
"""
|
81 |
def __init__(self, model):
|
|
|
82 |
if model is None:
|
83 |
raise ValueError("A model must be provided.")
|
84 |
self.model = model
|
85 |
+
self.fighters_df = None
|
86 |
+
self.fighter_histories = {}
|
87 |
|
88 |
def train(self, train_fights):
|
89 |
+
"""
|
90 |
+
Trains the machine learning model. This involves loading fighter data,
|
91 |
+
pre-calculating histories, and fitting the model on the preprocessed data.
|
92 |
+
"""
|
93 |
+
print(f"--- Training {self.model.__class__.__name__} ---")
|
94 |
|
95 |
+
# 1. Prepare data for prediction-time feature generation
|
96 |
+
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
97 |
+
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
98 |
+
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
99 |
+
for col in ['height_cm', 'reach_in', 'elo']:
|
100 |
+
if col in self.fighters_df.columns:
|
101 |
+
self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
|
102 |
+
|
103 |
+
# 2. Pre-calculate fighter histories
|
104 |
+
train_fights_with_dates = []
|
105 |
+
for fight in train_fights:
|
106 |
+
fight['date_obj'] = pd.to_datetime(fight['event_date'])
|
107 |
+
train_fights_with_dates.append(fight)
|
108 |
+
for fighter_name in self.fighters_df.index:
|
109 |
+
history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
|
110 |
+
self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
|
111 |
+
|
112 |
+
# 3. Preprocess and fit
|
113 |
X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
|
114 |
print(f"Fitting model on {X_train.shape[0]} samples...")
|
115 |
self.model.fit(X_train, y_train)
|
116 |
print("Model training complete.")
|
117 |
|
118 |
def predict(self, fight):
|
119 |
+
"""
|
120 |
+
Predicts the outcome of a single fight, returning the winner and probability.
|
121 |
+
"""
|
122 |
+
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
123 |
+
fight_date = pd.to_datetime(fight['event_date'])
|
124 |
+
|
125 |
+
if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
|
126 |
+
print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
|
127 |
+
return {'winner': None, 'probability': None}
|
128 |
+
|
129 |
+
f1_stats = self.fighters_df.loc[f1_name]
|
130 |
+
f2_stats = self.fighters_df.loc[f2_name]
|
131 |
+
if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
|
132 |
+
if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
|
133 |
|
134 |
+
f1_hist = self.fighter_histories.get(f1_name, [])
|
135 |
+
f2_hist = self.fighter_histories.get(f2_name, [])
|
136 |
+
f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
|
137 |
+
f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
|
138 |
|
139 |
+
f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
|
140 |
+
f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
|
141 |
+
|
142 |
+
features = {
|
143 |
+
'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
|
144 |
+
'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
|
145 |
+
'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
|
146 |
+
'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
|
147 |
+
'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
|
148 |
+
'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
|
149 |
+
'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
|
150 |
+
'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
|
151 |
+
'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
|
152 |
+
'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
|
153 |
+
'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
|
154 |
+
}
|
155 |
+
|
156 |
+
feature_vector = pd.DataFrame([features]).fillna(0)
|
157 |
+
|
158 |
+
# Use predict_proba to get probabilities for each class
|
159 |
+
probabilities = self.model.predict_proba(feature_vector)[0]
|
160 |
+
prob_f1_wins = probabilities[1] # Probability of class '1' (fighter 1 wins)
|
161 |
+
|
162 |
+
if prob_f1_wins >= 0.5:
|
163 |
+
return {'winner': f1_name, 'probability': prob_f1_wins}
|
164 |
+
else:
|
165 |
+
return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
|
166 |
|
|
|
167 |
class LogisticRegressionModel(BaseMLModel):
|
168 |
+
"""A thin wrapper for scikit-learn's LogisticRegression."""
|
169 |
def __init__(self):
|
170 |
+
super().__init__(model=LogisticRegression())
|
171 |
+
|
172 |
+
class XGBoostModel(BaseMLModel):
|
173 |
+
"""A thin wrapper for XGBoost's XGBClassifier."""
|
174 |
+
def __init__(self):
|
175 |
+
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
|
176 |
+
super().__init__(model=model)
|
177 |
|
178 |
class SVCModel(BaseMLModel):
|
179 |
+
"""A thin wrapper for scikit-learn's Support Vector Classifier."""
|
180 |
def __init__(self):
|
181 |
+
# Probability=True is needed for some reports, though it slows down training
|
182 |
+
super().__init__(model=SVC(probability=True, random_state=42))
|
183 |
|
184 |
class RandomForestModel(BaseMLModel):
|
185 |
+
"""A thin wrapper for scikit-learn's RandomForestClassifier."""
|
186 |
def __init__(self):
|
187 |
+
super().__init__(model=RandomForestClassifier(random_state=42))
|
188 |
|
189 |
class BernoulliNBModel(BaseMLModel):
|
190 |
+
"""A thin wrapper for scikit-learn's Bernoulli Naive Bayes classifier."""
|
191 |
def __init__(self):
|
192 |
+
super().__init__(model=BernoulliNB())
|
|
|
|
|
|
|
|
|
193 |
|
194 |
class LGBMModel(BaseMLModel):
|
195 |
+
"""A thin wrapper for LightGBM's LGBMClassifier."""
|
196 |
def __init__(self):
|
197 |
+
super().__init__(model=LGBMClassifier(random_state=42))
|
src/predict/preprocess.py
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
from datetime import datetime
|
|
|
4 |
|
5 |
def _clean_numeric_column(series):
|
6 |
-
"""
|
7 |
series_str = series.astype(str)
|
8 |
return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
|
9 |
|
10 |
def _calculate_age(dob_str, fight_date_str):
|
11 |
-
"""
|
12 |
if pd.isna(dob_str) or not dob_str:
|
13 |
return None
|
14 |
try:
|
@@ -18,235 +19,213 @@ def _calculate_age(dob_str, fight_date_str):
|
|
18 |
except (ValueError, TypeError):
|
19 |
return None
|
20 |
|
21 |
-
def
|
22 |
-
"""
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
return streak
|
39 |
|
40 |
-
def _to_int_safe(
|
41 |
-
"""Safely
|
|
|
|
|
42 |
try:
|
43 |
-
|
|
|
44 |
except (ValueError, TypeError):
|
45 |
return 0
|
46 |
|
47 |
-
def _get_fighter_history_stats(fighter_name, current_fight_date,
|
48 |
-
"""
|
49 |
-
|
50 |
-
|
51 |
-
past_fights =
|
52 |
-
last_n_fights = past_fights[-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
stats = {
|
55 |
-
'
|
56 |
-
'
|
57 |
-
'
|
58 |
-
'first_round_finishes': 0,
|
59 |
-
'knockdowns_scored': 0,
|
60 |
-
'knockdowns_absorbed': 0,
|
61 |
-
'sig_str_landed': 0,
|
62 |
-
'sig_str_attempted': 0,
|
63 |
-
'takedowns_landed': 0,
|
64 |
-
'takedowns_attempted': 0,
|
65 |
-
'sub_attempts': 0,
|
66 |
-
'ctrl_time_sec': 0,
|
67 |
-
'total_fight_time_sec': 0,
|
68 |
-
'fights_in_last_year': 0,
|
69 |
-
'avg_opp_elo_last_n': 0
|
70 |
}
|
71 |
-
|
72 |
-
# Calculate fights in last year
|
73 |
-
one_year_ago = current_fight_date - pd.Timedelta(days=365)
|
74 |
-
stats['fights_in_last_year'] = len([f for f in past_fights if f['date_obj'] >= one_year_ago])
|
75 |
-
|
76 |
-
# Process each fight
|
77 |
-
total_opp_elo = 0
|
78 |
for fight in last_n_fights:
|
79 |
is_fighter_1 = (fight['fighter_1'] == fighter_name)
|
80 |
-
f_prefix = 'f1' if is_fighter_1 else 'f2'
|
81 |
-
opp_prefix = 'f2' if is_fighter_1 else 'f1'
|
82 |
opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
|
83 |
|
84 |
-
|
|
|
85 |
if fight['winner'] == fighter_name:
|
86 |
-
stats['
|
87 |
-
if fight['method'] != 'Decision':
|
88 |
-
stats['total_finishes'] += 1
|
89 |
-
if fight['round'] == '1':
|
90 |
-
stats['first_round_finishes'] += 1
|
91 |
if 'KO' in fight['method']:
|
92 |
stats['ko_wins'] += 1
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
|
95 |
-
stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{f_prefix}_kd'))
|
96 |
-
stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp_prefix}_kd'))
|
97 |
-
stats['sig_str_landed'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_landed'))
|
98 |
-
stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_attempted'))
|
99 |
-
stats['takedowns_landed'] += _to_int_safe(fight.get(f'{f_prefix}_td_landed'))
|
100 |
-
stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_td_attempted'))
|
101 |
-
stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_attempts'))
|
102 |
-
|
103 |
-
# Control Time
|
104 |
-
ctrl_time = fight.get(f'{f_prefix}_ctrl_time', '0:00')
|
105 |
-
if isinstance(ctrl_time, str) and ':' in ctrl_time:
|
106 |
-
mins, secs = map(int, ctrl_time.split(':'))
|
107 |
-
stats['ctrl_time_sec'] += mins * 60 + secs
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
# Calculate averages and rates
|
123 |
-
n_actual_fights = len(last_n_fights)
|
124 |
-
|
125 |
-
# Always provide all required keys with default values
|
126 |
-
stats['finish_rate_last_n'] = stats['total_finishes'] / n_actual_fights if n_actual_fights > 0 else 0.0
|
127 |
-
stats['first_round_finish_rate_last_n'] = stats['first_round_finishes'] / n_actual_fights if n_actual_fights > 0 else 0.0
|
128 |
-
stats['ko_percent_last_n'] = stats['ko_wins'] / n_actual_fights if n_actual_fights > 0 else 0.0
|
129 |
-
stats['avg_knockdowns_per_fight_last_n'] = stats['knockdowns_scored'] / n_actual_fights if n_actual_fights > 0 else 0.0
|
130 |
-
stats['knockdowns_absorbed_per_fight_last_n'] = stats['knockdowns_absorbed'] / n_actual_fights if n_actual_fights > 0 else 0.0
|
131 |
-
stats['avg_opp_elo_last_n'] = total_opp_elo / n_actual_fights if n_actual_fights > 0 else 1500.0
|
132 |
-
|
133 |
-
# Per-minute stats
|
134 |
-
total_mins = stats['total_fight_time_sec'] / 60
|
135 |
-
stats['sig_str_landed_per_min_last_n'] = stats['sig_str_landed'] / total_mins if total_mins > 0 else 0.0
|
136 |
-
stats['sig_str_absorbed_per_min_last_n'] = stats['sig_str_attempted'] / total_mins if total_mins > 0 else 0.0
|
137 |
-
stats['sub_attempts_per_min_last_n'] = stats['sub_attempts'] / total_mins if total_mins > 0 else 0.0
|
138 |
-
stats['avg_ctrl_time_sec_per_min_last_n'] = stats['ctrl_time_sec'] / total_mins if total_mins > 0 else 0.0
|
139 |
-
|
140 |
-
# Accuracy stats
|
141 |
-
stats['sig_str_defense_last_n'] = stats['sig_str_landed'] / stats['sig_str_attempted'] if stats['sig_str_attempted'] > 0 else 0.5
|
142 |
-
stats['takedown_accuracy_last_n'] = stats['takedowns_landed'] / stats['takedowns_attempted'] if stats['takedowns_attempted'] > 0 else 0.5
|
143 |
-
stats['takedown_defense_last_n'] = 1 - (stats['takedowns_landed'] / stats['takedowns_attempted']) if stats['takedowns_attempted'] > 0 else 0.5
|
144 |
|
145 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
def preprocess_for_ml(fights_to_process, fighters_csv_path):
|
148 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
if not os.path.exists(fighters_csv_path):
|
150 |
raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
|
151 |
|
152 |
-
# Load and prepare fighter data
|
153 |
fighters_df = pd.read_csv(fighters_csv_path)
|
154 |
-
fighters_df['full_name'] = fighters_df['first_name'] + ' ' + fighters_df['last_name']
|
155 |
-
fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
for col in ['height_cm', 'reach_in', 'elo']:
|
158 |
-
if col in
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
|
161 |
-
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
for fight in fights_to_process:
|
|
|
164 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
165 |
-
|
166 |
-
|
167 |
-
if f1_name not in fighters_df.index or f2_name not in fighters_df.index:
|
168 |
continue
|
169 |
-
|
170 |
-
|
171 |
-
f1_stats = fighters_df.loc[f1_name]
|
172 |
-
f2_stats = fighters_df.loc[f2_name]
|
173 |
-
|
174 |
-
# Calculate fight date and ensure date_obj is available
|
175 |
-
fight_date = pd.to_datetime(fight['event_date'])
|
176 |
-
fight['date_obj'] = fight_date
|
177 |
-
|
178 |
-
# Get fighter histories and ensure date_obj is available for all fights
|
179 |
-
f1_hist = [f for f in fights_to_process if f1_name in (f['fighter_1'], f['fighter_2'])]
|
180 |
-
f2_hist = [f for f in fights_to_process if f2_name in (f['fighter_1'], f['fighter_2'])]
|
181 |
-
|
182 |
-
# Ensure date_obj is available for all historical fights
|
183 |
-
for hist_fight in f1_hist + f2_hist:
|
184 |
-
if 'date_obj' not in hist_fight:
|
185 |
-
hist_fight['date_obj'] = pd.to_datetime(hist_fight['event_date'])
|
186 |
-
|
187 |
-
# Calculate historical stats
|
188 |
-
f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, fighters_df)
|
189 |
-
f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, fighters_df)
|
190 |
|
191 |
-
|
|
|
|
|
|
|
192 |
f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
|
193 |
f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
|
|
|
|
|
|
|
|
|
194 |
|
195 |
-
#
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
f1_win_streak = _get_win_streak(f1_name, fight_date, f1_hist)
|
201 |
-
f2_win_streak = _get_win_streak(f2_name, fight_date, f2_hist)
|
202 |
-
|
203 |
-
# Compile all features
|
204 |
-
feature_dict = {
|
205 |
-
'winner': 1 if fight.get('winner') == f1_name else 0,
|
206 |
-
'date': fight['event_date'],
|
207 |
-
'fighter_1': f1_name,
|
208 |
-
'fighter_2': f2_name,
|
209 |
-
|
210 |
-
# Physical differences
|
211 |
-
'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
|
212 |
-
'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
|
213 |
-
'age_diff': (f1_age or 0) - (f2_age or 0),
|
214 |
'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
|
215 |
-
|
216 |
-
|
217 |
-
'
|
218 |
-
'
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
'
|
223 |
-
'
|
224 |
-
|
225 |
-
'
|
226 |
-
'
|
227 |
-
'control_time_diff': f1_hist_stats['avg_ctrl_time_sec_per_min_last_n'] - f2_hist_stats['avg_ctrl_time_sec_per_min_last_n'],
|
228 |
-
|
229 |
-
# Defense differences
|
230 |
-
'sig_str_defense_diff': f1_hist_stats['sig_str_defense_last_n'] - f2_hist_stats['sig_str_defense_last_n'],
|
231 |
-
'td_defense_diff': f1_hist_stats['takedown_defense_last_n'] - f2_hist_stats['takedown_defense_last_n'],
|
232 |
-
'knockdowns_absorbed_diff': f1_hist_stats['knockdowns_absorbed_per_fight_last_n'] - f2_hist_stats['knockdowns_absorbed_per_fight_last_n']
|
233 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
X =
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
return X, y, metadata
|
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
from datetime import datetime
|
4 |
+
from ..config import FIGHTERS_CSV_PATH
|
5 |
|
6 |
def _clean_numeric_column(series):
    """Convert a column to numbers, turning unparseable entries into NaN.

    Text values are stripped of every character other than digits and '.'
    before conversion, so entries such as "180 cm" become 180. Columns that
    already hold numbers are converted directly instead of being routed
    through their string form.

    Args:
        series (pd.Series): Column to clean.

    Returns:
        pd.Series: Numeric column; entries without a parseable number are NaN.
    """
    already_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if already_numeric:
        # Going through str() would strip the sign from negative values and
        # mangle exponent notation ("1e-05" -> "105"), so keep real numbers as-is.
        return pd.to_numeric(series, errors='coerce')

    series_str = series.astype(str)
    return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
|
10 |
|
11 |
def _calculate_age(dob_str, fight_date_str):
|
12 |
+
"""Calculates age in years from a date of birth string and fight date string."""
|
13 |
if pd.isna(dob_str) or not dob_str:
|
14 |
return None
|
15 |
try:
|
|
|
19 |
except (ValueError, TypeError):
|
20 |
return None
|
21 |
|
22 |
+
def _parse_round_time_to_seconds(round_str, time_str):
    """Return the total fight duration in seconds from the final round and clock time.

    Every completed round is counted as a full five minutes, and the
    "M:SS" clock value of the final round is added on top. Any input that
    cannot be parsed yields 0.
    """
    try:
        full_rounds = int(round_str) - 1
        mins, secs = (int(part) for part in time_str.split(':'))
    except (ValueError, TypeError, AttributeError):
        return 0
    # 5-minute rounds are assumed for every completed round.
    return (full_rounds * 5 * 60) + (mins * 60) + secs
|
31 |
+
|
32 |
+
def _parse_striking_stats(stat_str):
    """Split a "<landed> of <attempted>" stat string into a pair of integers.

    Returns (0, 0) when the value is missing or not in that form.
    """
    try:
        counts = [int(token) for token in stat_str.split(' of ')]
        landed, attempted = counts
    except (ValueError, TypeError, AttributeError):
        return 0, 0
    return landed, attempted
|
|
|
39 |
|
40 |
+
def _to_int_safe(val):
    """Safely convert a value to an integer, returning 0 if it is invalid or empty.

    Accepts ints, floats and numeric strings (surrounding whitespace is
    ignored). Float-like values such as 2.0 or "3.0" are truncated to their
    integer part rather than being treated as invalid.

    Args:
        val: The value to convert.

    Returns:
        int: The converted value, or 0 for missing, empty or unparseable input.
    """
    try:
        if pd.isna(val):
            return 0
    except (ValueError, TypeError):
        # pd.isna returns an array for list-likes, whose truth value is
        # ambiguous; such input is not a valid scalar count.
        return 0

    text = str(val).strip()
    if not text:
        return 0

    try:
        return int(text)
    except ValueError:
        pass

    try:
        # Columns containing gaps are read by pandas as floats, so counts
        # commonly arrive as 2.0 / "2.0" rather than 2.
        return int(float(text))
    except (ValueError, OverflowError):
        return 0
|
49 |
|
50 |
+
def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
    """
    Summarise a fighter's recent form from the bouts preceding a given date.

    Only bouts whose 'date_obj' is strictly earlier than current_fight_date
    are considered, and of those the last n entries of fighter_history are
    used (the list is taken in the order given).

    Args:
        fighter_name (str): Name of the fighter as it appears in the fight records.
        current_fight_date: Cut-off date; bouts on or after it are ignored.
        fighter_history (list of dict): Fight records involving the fighter.
        fighters_df (pd.DataFrame): Fighter table indexed by name with an 'elo' column.
        n (int): Number of most recent bouts to summarise.

    Returns:
        dict: 'wins_last_n', 'avg_opp_elo_last_n', 'ko_percent_last_n',
        'sig_str_landed_per_min_last_n', 'takedown_accuracy_last_n' and
        'sub_attempts_per_min_last_n'.
    """
    earlier_bouts = [bout for bout in fighter_history if bout['date_obj'] < current_fight_date]
    recent_bouts = earlier_bouts[-n:]

    if not recent_bouts:
        # No prior bouts: neutral defaults, with an average-rated opponent assumed.
        return {
            'wins_last_n': 0,
            'avg_opp_elo_last_n': 1500,
            'ko_percent_last_n': 0,
            'sig_str_landed_per_min_last_n': 0,
            'takedown_accuracy_last_n': 0,
            'sub_attempts_per_min_last_n': 0,
        }

    win_count = 0
    ko_count = 0
    seconds_fought = 0
    strikes_landed = 0
    rival_elos = []
    takedowns_hit = 0
    takedowns_tried = 0
    sub_tries = 0

    for bout in recent_bouts:
        # Work out which corner the fighter was in; per-fight stat columns
        # are prefixed 'f1' / 'f2' accordingly.
        if bout['fighter_1'] == fighter_name:
            rival, corner = bout['fighter_2'], 'f1'
        else:
            rival, corner = bout['fighter_1'], 'f2'

        if bout['winner'] == fighter_name:
            win_count += 1
            if 'KO' in bout['method']:
                ko_count += 1

        if rival in fighters_df.index:
            rival_elo = fighters_df.loc[rival, 'elo']
            rival_elos.append(rival_elo if pd.notna(rival_elo) else 1500)

        seconds_fought += _parse_round_time_to_seconds(bout['round'], bout['time'])

        sig_landed, _ = _parse_striking_stats(bout.get(f'{corner}_sig_str', '0 of 0'))
        strikes_landed += sig_landed

        # Takedowns use the same "<landed> of <attempted>" format as strikes.
        td_hit, td_tried = _parse_striking_stats(bout.get(f'{corner}_td', '0 of 0'))
        takedowns_hit += td_hit
        takedowns_tried += td_tried

        sub_tries += _to_int_safe(bout.get(f'{corner}_sub_att'))

    # Final calculations; every ratio falls back to 0 when its denominator is 0.
    if rival_elos:
        mean_rival_elo = sum(rival_elos) / len(rival_elos)
    else:
        mean_rival_elo = 1500
    minutes_fought = seconds_fought / 60 if seconds_fought > 0 else 0

    return {
        'wins_last_n': win_count,
        'avg_opp_elo_last_n': mean_rival_elo,
        'ko_percent_last_n': (ko_count / win_count) if win_count > 0 else 0,
        'sig_str_landed_per_min_last_n': (strikes_landed / minutes_fought) if minutes_fought > 0 else 0,
        'takedown_accuracy_last_n': (takedowns_hit / takedowns_tried) if takedowns_tried > 0 else 0,
        'sub_attempts_per_min_last_n': (sub_tries / minutes_fought) if minutes_fought > 0 else 0,
    }
|
114 |
|
115 |
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """
    Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
    suitable for a binary classification machine learning model.

    Each usable fight produces two mirrored samples: fighter_1 minus
    fighter_2 labelled 1, and the negated differences labelled 0. Fights in
    which either fighter is missing from the fighters CSV are skipped.

    Side effect: every dict in fights_to_process gains a 'date_obj' key
    holding its parsed event date.

    Args:
        fights_to_process (list of dict): The list of fights to process.
        fighters_csv_path (str): Path to the CSV file with all fighter stats.

    Returns:
        pd.DataFrame: Feature matrix X.
        pd.Series: Target vector y.
        pd.DataFrame: Metadata DataFrame.

    Raises:
        FileNotFoundError: If fighters_csv_path does not exist.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    fighters_df = pd.read_csv(fighters_csv_path)

    # 1. Prepare fighters data for lookups by full name
    fighters_prepared = fighters_df.copy()
    fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']

    # Handle duplicate fighter names by keeping the first entry
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_prepared.columns:
            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])

    # 2. Convert event dates to datetime objects once
    for fight in fights_to_process:
        try:
            # This will work if event_date is a string like "March 4, 2023"
            fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
        except TypeError:
            # This will be triggered if it's already a date-like object (e.g., Timestamp)
            fight['date_obj'] = fight['event_date']
        except ValueError:
            # A string in some other format: let pandas infer it rather than
            # aborting the whole run on the first non-matching date.
            fight['date_obj'] = pd.to_datetime(fight['event_date'])

    # Group fights by participant in a single pass instead of rescanning the
    # full fight list once per fighter.
    fighter_histories = {}
    for fight in fights_to_process:
        # A set so that a record naming the same fighter twice is stored once.
        for participant in {fight['fighter_1'], fight['fighter_2']}:
            fighter_histories.setdefault(participant, []).append(fight)
    for history in fighter_histories.values():
        history.sort(key=lambda x: x['date_obj'])

    # 3. Process fights to create features and targets
    feature_list = []
    target_list = []
    metadata_list = []

    for fight in fights_to_process:
        # Per the dataset's design, fighter_1 is always the winner.
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        if f1_name not in fighters_prepared.index or f2_name not in fighters_prepared.index:
            continue

        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]

        # Defensive: .loc yields a DataFrame if the index ever holds duplicates.
        if isinstance(f1_stats, pd.DataFrame):
            f1_stats = f1_stats.iloc[0]
        if isinstance(f2_stats, pd.DataFrame):
            f2_stats = f2_stats.iloc[0]

        # Calculate ages for both fighters
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Get historical stats for both fighters
        f1_hist_stats = _get_fighter_history_stats(f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared)

        # --- Create two training examples from each fight for a balanced dataset ---

        # 1. The "Win" case: (fighter_1 - fighter_2)
        features_win = {
            # Original diffs
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            # New historical diffs
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            # Grappling features
            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
        }
        feature_list.append(features_win)
        target_list.append(1)  # 1 represents a win

        # 2. The "Loss" case: (fighter_2 - fighter_1)
        # We invert the differences for the losing case.
        features_loss = {key: -value for key, value in features_win.items()}
        # Stance difference is symmetric; it doesn't get inverted.
        features_loss['stance_is_different'] = features_win['stance_is_different']

        feature_list.append(features_loss)
        target_list.append(0)  # 0 represents a loss

        # Add metadata for both generated samples
        # The 'winner' and 'loser' are consistent with the original data structure
        metadata_list.append({
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        })
        metadata_list.append({
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        })

    X = pd.DataFrame(feature_list).fillna(0)
    y = pd.Series(target_list, name='winner')
    metadata = pd.DataFrame(metadata_list)

    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
    return X, y, metadata
|