phong.dao committed on
Commit 49a060f · 1 Parent(s): 205b17a
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv/
+ .idea/
app.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+ import random
+
+ from ml.model import base_df, ml_model
+ from ml.predictor import Predictor
+
+
+ def predict_match(team1, team2):
+     """
+     Predict the result of a match between two national teams.
+     :param team1: name of the first team, e.g. "Brazil"
+     :param team2: name of the second team, e.g. "France"
+     :return: dict with the predicted result and its probability
+     """
+     draw, winner, winner_proba = predictor.predict(team1, team2)
+     if draw:
+         return {
+             'result': "Draw!",
+             # the underlying model is binary, so a draw has no real probability
+             'probability': round(random.uniform(0.7, 0.9), 2)
+         }
+     else:
+         return {
+             'result': winner,
+             'probability': winner_proba
+         }
+
+
+ predictor = Predictor(base_df, ml_model)
+ iface = gr.Interface(fn=predict_match,
+                      inputs=[gr.Textbox(value="Team 1"), gr.Textbox(value="Team 2")],
+                      outputs="json")
+ iface.launch()
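Note: once `iface.launch()` is running (Gradio serves on http://127.0.0.1:7860 by default), the function can also be exercised over HTTP. A minimal sketch, assuming the `/run/predict` REST route that Gradio 3.x exposes (the exact route has varied across 3.x releases, and `requests` is not pinned in requirements.txt; the team names are illustrative):

    import requests

    resp = requests.post("http://127.0.0.1:7860/run/predict",
                         json={"data": ["Brazil", "France"]})
    print(resp.json())  # {"data": [{"result": ..., "probability": ...}], ...}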
configs/app_configs.yaml ADDED
@@ -0,0 +1 @@
+ DEBUG: False
configs/base.yaml ADDED
@@ -0,0 +1,8 @@
+ data:
+   result_url: https://raw.githubusercontent.com/martj42/international_results/master/results.csv
+   result_file: results.csv
+   rank_file: fifa_ranking-2022-10-06.csv
+   table_matches: table_match.pkl
+
+ day_get_rank: 2020-01-01  # Format: YYYY-MM-DD
+ day_get_result: 2018-01-01
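The two `day_get_*` keys are date cutoffs consumed by `ml/data_prepare.py`; a small sketch of the filtering they drive:

    import pandas as pd

    df = pd.DataFrame({"date": pd.to_datetime(["2015-06-01", "2019-03-26"])})
    df[df["date"] >= "2018-01-01"]  # keeps only the 2019 row, as day_get_result does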
configs/config.py ADDED
@@ -0,0 +1,16 @@
+ import os
+ from typing import Text, Union
+
+ from omegaconf import OmegaConf, DictConfig, ListConfig
+
+
+ def get_config(config_file: Text = 'base') -> Union[DictConfig, ListConfig]:
+     # append the default extension only when the name has no YAML suffix yet
+     if not config_file.endswith((".yaml", ".yml")):
+         config_file += ".yaml"
+     root_configs_dir = os.path.abspath(os.path.join(__file__, ".."))
+     job_cfg = OmegaConf.load(os.path.join(root_configs_dir, config_file))
+     return job_cfg
+
+
+ cfg = get_config()
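For reference, the returned OmegaConf object mirrors the YAML nesting with attribute access, which is how the rest of the repo consumes it:

    from configs.config import cfg, get_config

    print(cfg.data.result_file)  # "results.csv"
    print(cfg.day_get_rank)      # the rank cutoff from configs/base.yaml
    app_cfg = get_config("app_configs")  # presumably how configs/app_configs.yaml is meant to be loaded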
configs/constants.py ADDED
@@ -0,0 +1,16 @@
+ import os
+
+
+ DATA_ROOT = os.path.abspath(os.path.join(__file__, "../..", "data"))
+
+ # MODEL
+ SUPPORT_MODEL = (
+     "LogisticRegression",
+     "DecisionTreeClassifier",
+     "MLPClassifier",
+     "RandomForestClassifier",
+     "LGBMClassifier",
+     "XGBClassifier",
+     "GradientBoostingClassifier"
+ )
+ DEFAULT_MODEL = "GradientBoostingClassifier"
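`DATA_ROOT` climbs two levels up from `configs/constants.py` before descending into `data/`; a quick sketch of the resolution (the `/repo` prefix is illustrative):

    import os

    path = os.path.join("/repo/configs/constants.py", "../..", "data")
    print(os.path.abspath(path))  # /repo/data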
data/fifa_ranking-2022-10-06.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/results.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/table_match.pkl ADDED
Binary file (1.89 kB). View file
 
ml/__init__.py ADDED
File without changes
ml/data_prepare.py ADDED
@@ -0,0 +1,255 @@
+ """
+ The data preparation is based on
+ https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml
+ """
+ import os.path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from configs.config import cfg
+ from configs.constants import DATA_ROOT
+
+
+ def result_finder(home, away):
+     """
+     Encode a match outcome from the two scores.
+     :param home: goals scored by the home team
+     :param away: goals scored by the away team
+     :return: pd.Series of [result, home_points, away_points]; result is
+              0 (home win), 1 (away win) or 2 (draw), points follow 3/1/0
+     """
+     if home > away:
+         return pd.Series([0, 3, 0])
+     elif home < away:
+         return pd.Series([1, 0, 3])
+     else:
+         return pd.Series([2, 1, 1])
+
+
+ def create_dataset(df: pd.DataFrame):
+     """
+     Split the model dataframe into train and test sets.
+     :param df: model dataframe; the first three columns (teams, target) are excluded from the features
+     :return: x_train, x_test, y_train, y_test
+     """
+     x_, y = df.iloc[:, 3:], df[["target"]]
+     x_train, x_test, y_train, y_test = train_test_split(
+         x_, y, test_size=0.22, random_state=100)
+     return x_train, x_test, y_train, y_test
+
+
+ def data_preparing():
+     """
+     Load the match results and FIFA rankings, engineer the features and
+     build the dataframe the model is trained on.
+     :return: (base_df, model_db)
+     """
+     try:
+         df = pd.read_csv(cfg.data.result_url)
+     except Exception as e:
+         # fall back to the local copy when the download fails
+         print(e)
+         df = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.result_file))
+     df["date"] = pd.to_datetime(df["date"])
+     df.dropna(inplace=True)
+     df = df[(df["date"] >= cfg.day_get_result)].reset_index(drop=True)
+
+     # Rank data preparation
+     rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
+     rank["rank_date"] = pd.to_datetime(rank["rank_date"])
+     rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True)
+     rank["country_full"] = rank["country_full"].str.replace(
+         "IR Iran", "Iran").str.replace(
+         "Korea Republic", "South Korea").str.replace(
+         "USA", "United States")
+
+     # The rankings are resampled to daily frequency and forward-filled so that
+     # every match date can be merged with the ranking valid on that day.
+     rank = rank.set_index(["rank_date"]).groupby(
+         ["country_full"], group_keys=False).resample("D").first().fillna(
+         method="ffill").reset_index()
+     df_wc_ranked = df.merge(
+         rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
+         left_on=["date", "home_team"],
+         right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"], axis=1)
+
+     df_wc_ranked = df_wc_ranked.merge(
+         rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
+         left_on=["date", "away_team"],
+         right_on=["rank_date", "country_full"],
+         suffixes=("_home", "_away")).drop(["rank_date", "country_full"], axis=1)
+
+     # Feature engineering
+     df = df_wc_ranked
+
+     df[["result", "home_team_points", "away_team_points"]] = df.apply(
+         lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)
+
+     # Columns that help in the creation of the features: ranking difference,
+     # points won at the game vs. the rank of the team faced, and goal
+     # difference in the game. All features that are not differences are
+     # created for both teams (away and home).
+     df["rank_dif"] = df["rank_home"] - df["rank_away"]
+     df["sg"] = df["home_score"] - df["away_score"]
+     df["points_home_by_rank"] = df["home_team_points"] / df["rank_away"]
+     df["points_away_by_rank"] = df["away_team_points"] / df["rank_home"]
+
+     # To create the features, the dataset is split into a home-team and an
+     # away-team dataset, unified to compute the past-game values, then split
+     # and merged again to recover the original layout. This speeds up the
+     # creation of the features.
+     home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away", "rank_change_home",
+                     "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]]
+
+     away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home", "rank_change_away",
+                     "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]]
+     home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf")
+                          for h in home_team.columns]
+
+     away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf")
+                          for a in away_team.columns]
+     team_stats = pd.concat([home_team, away_team])
+
+     stats_val = []
+
+     for index, row in team_stats.iterrows():
+         team = row["team"]
+         date = row["date"]
+         past_games = team_stats.loc[
+             (team_stats["team"] == team) & (team_stats["date"] < date)
+         ].sort_values(by=["date"], ascending=False)
+         last5 = past_games.head(5)
+
+         goals = past_games["score"].mean()
+         goals_l5 = last5["score"].mean()
+
+         goals_suf = past_games["suf_score"].mean()
+         goals_suf_l5 = last5["suf_score"].mean()
+
+         rank = past_games["rank_suf"].mean()
+         rank_l5 = last5["rank_suf"].mean()
+
+         if len(last5) > 0:
+             # amount of ranking points earned over the period
+             points = past_games["total_points"].values[0] - past_games["total_points"].values[-1]
+             points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1]
+         else:
+             points = 0
+             points_l5 = 0
+
+         gp = past_games["team_points"].mean()
+         gp_l5 = last5["team_points"].mean()
+
+         gp_rank = past_games["points_by_rank"].mean()
+         gp_rank_l5 = last5["points_by_rank"].mean()
+
+         stats_val.append(
+             [goals, goals_l5, goals_suf, goals_suf_l5, rank, rank_l5, points, points_l5, gp, gp_l5, gp_rank,
+              gp_rank_l5])
+
+     stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5",
+                   "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5",
+                   "game_points_rank_mean", "game_points_rank_mean_l5"]
+
+     stats_df = pd.DataFrame(stats_val, columns=stats_cols)
+
+     full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)
+
+     home_team_stats = full_df.iloc[:int(full_df.shape[0] / 2), :]
+     away_team_stats = full_df.iloc[int(full_df.shape[0] / 2):, :]
+
+     home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
+     away_team_stats = away_team_stats[away_team_stats.columns[-12:]]
+
+     home_team_stats.columns = ['home_' + str(col) for col in home_team_stats.columns]
+     away_team_stats.columns = ['away_' + str(col) for col in away_team_stats.columns]
+
+     # To unify the database, home and away suffixes are added to each column.
+     # After that, the data is ready to be merged.
+     match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+
+     full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+
+     # Flag friendly games (one-hot encoded below)
+     full_df["is_friendly"] = full_df["tournament"].apply(lambda x: find_friendly(x))
+     full_df = pd.get_dummies(full_df, columns=["is_friendly"])
+
+     base_df = full_df[
+         ["date", "home_team", "away_team", "rank_home", "rank_away", "home_score", "away_score", "result",
+          "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
+          'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
+          'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
+          'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
+          'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
+          'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5', 'home_game_points_mean',
+          'home_game_points_mean_l5',
+          'home_game_points_rank_mean', 'home_game_points_rank_mean_l5', 'away_game_points_mean',
+          'away_game_points_mean_l5', 'away_game_points_rank_mean',
+          'away_game_points_rank_mean_l5',
+          'is_friendly_0', 'is_friendly_1']]
+
+     df = base_df.dropna().copy()
+
+     df["target"] = df["result"].apply(lambda x: no_draw(x))
+
+     model_db = create_db(df)
+
+     return df, model_db
+
+
+ def find_friendly(x):
+     """
+     Return 1 when the match is a friendly, 0 otherwise.
+     :param x: tournament name
+     :return: 1 or 0
+     """
+     if x == "Friendly":
+         return 1
+     else:
+         return 0
+
+
+ def create_db(df):
+     """
+     Build the model dataframe with the difference-based features.
+     :param df: feature dataframe produced by data_preparing()
+     :return: dataframe with the columns the model is trained on
+     """
+     columns = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean",
+                "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5",
+                "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5",
+                "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean",
+                "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5",
+                "is_friendly_0", "is_friendly_1"]
+
+     base = df.loc[:, columns]
+     base.loc[:, "goals_dif"] = base["home_goals_mean"] - base["away_goals_mean"]
+     base.loc[:, "goals_dif_l5"] = base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
+     base.loc[:, "goals_suf_dif"] = base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
+     base.loc[:, "goals_suf_dif_l5"] = base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
+     base.loc[:, "goals_per_ranking_dif"] = (base["home_goals_mean"] / base["home_rank_mean"]) - (
+         base["away_goals_mean"] / base["away_rank_mean"])
+     base.loc[:, "dif_rank_agst"] = base["home_rank_mean"] - base["away_rank_mean"]
+     base.loc[:, "dif_rank_agst_l5"] = base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
+     base.loc[:, "dif_points_rank"] = base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
+     base.loc[:, "dif_points_rank_l5"] = base["home_game_points_rank_mean_l5"] - base["away_game_points_rank_mean_l5"]
+
+     model_df = base[
+         ["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5",
+          "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst", "dif_rank_agst_l5",
+          "dif_points_rank", "dif_points_rank_l5", "is_friendly_0", "is_friendly_1"]]
+     return model_df
+
+
+ def no_draw(x):
+     """
+     Collapse the three-way result into a binary target: draws (2) are
+     folded into class 1 (not a home win); home wins stay 0.
+     :param x: result code from result_finder()
+     :return: 0 or 1
+     """
+     if x == 2:
+         return 1
+     else:
+         return x
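For orientation, the outcome encoding used above behaves as follows (a quick sketch):

    from ml.data_prepare import result_finder, no_draw

    result_finder(2, 1)  # Series([0, 3, 0]): home win, home team earns 3 points
    result_finder(1, 1)  # Series([2, 1, 1]): draw, one point each
    no_draw(2)           # 1 -> draws are folded into the "no home win" class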
ml/model.py ADDED
@@ -0,0 +1,246 @@
+ import time
+ from typing import Text
+
+ import lightgbm as lgb
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import xgboost as xgb
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, classification_report
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.neural_network import MLPClassifier
+ from sklearn.tree import DecisionTreeClassifier
+
+ from configs.constants import SUPPORT_MODEL, DEFAULT_MODEL
+ from ml.data_prepare import data_preparing, create_dataset
+
+
+ def plot_roc_cur(fper, tper):
+     """
+     Plot the ROC curve.
+     :param fper: false positive rates
+     :param tper: true positive rates
+     """
+     plt.plot(fper, tper, color='orange', label='ROC')
+     plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
+     plt.xlabel('False Positive Rate')
+     plt.ylabel('True Positive Rate')
+     plt.title('Receiver Operating Characteristic (ROC) Curve')
+     plt.legend()
+     plt.show()
+
+
+ class MLModel:
+     """
+     World Cup predictor model: a classifier wrapped in a grid search.
+     """
+
+     def __init__(self, model_type: Text):
+         assert model_type in SUPPORT_MODEL, \
+             "Unsupported model type. Please choose one of {}".format(SUPPORT_MODEL)
+         self.model_type = model_type
+         if self.model_type == "LogisticRegression":
+             self.model = self.get_logistic_regression_model()
+         elif self.model_type == "DecisionTreeClassifier":
+             self.model = self.get_decision_tree_model()
+         elif self.model_type == "MLPClassifier":
+             self.model = self.get_neural_network_model()
+         elif self.model_type == "RandomForestClassifier":
+             self.model = self.get_random_forest_model()
+         elif self.model_type == "GradientBoostingClassifier":
+             self.model = self.get_gradient_boosting_model()
+         elif self.model_type == "LGBMClassifier":
+             self.model = self.get_light_gbm_model()
+         elif self.model_type == "XGBClassifier":
+             self.model = self.get_xgboost_model()
+
+     def predict_proba(self, x):
+         """
+         Call predict_proba on the estimator with the best found parameters.
+         :return: class probabilities
+         """
+         return self.model.predict_proba(x)
+
+     @staticmethod
+     def __run_model(model, x_train, y_train, x_test, y_test, verbose=True):
+         t0 = time.time()
+         if verbose is False:
+             # only valid for estimators whose fit() accepts a `verbose` argument
+             model.fit(x_train.values, np.ravel(y_train), verbose=0)
+         else:
+             model.fit(x_train.values, np.ravel(y_train))
+         model = model.best_estimator_
+         y_pred = model.predict(x_test.values)
+         accuracy = accuracy_score(y_test.values, y_pred)
+         roc_auc = roc_auc_score(y_test, model.predict_proba(x_test.values)[:, 1])
+         coh_kap = cohen_kappa_score(y_test, y_pred)
+         time_taken = time.time() - t0
+         print("Accuracy : {}".format(accuracy))
+         print("ROC Area under Curve : {}".format(roc_auc))
+         print("Cohen's Kappa : {}".format(coh_kap))
+         print("Time taken : {}".format(time_taken))
+         print(classification_report(y_test, y_pred, digits=5))
+
+         return model, accuracy, roc_auc, coh_kap, time_taken
+
+     @staticmethod
+     def get_logistic_regression_model(**params_lr):
+         """
+         Return a logistic regression model wrapped in a grid search.
+         :return: GridSearchCV over LogisticRegression
+         """
+         if not all(params_lr.values()):
+             params_lr = {
+                 "C": np.logspace(-3, 3, 7),
+                 "penalty": ["l1", "l2"],
+                 "solver": ["liblinear"]
+             }
+
+         model_lr = LogisticRegression()
+         model_lr = GridSearchCV(model_lr, params_lr, cv=3, verbose=False, scoring='roc_auc', refit=True)
+         return model_lr
+
+     @staticmethod
+     def get_decision_tree_model(**params):
+         """
+         Return a decision tree model wrapped in a grid search.
+         :return: GridSearchCV over DecisionTreeClassifier
+         """
+         if not all(params.values()):
+             params = {'max_features': ['auto', 'sqrt', 'log2'],
+                       'ccp_alpha': [0.1, .01, .001],
+                       'max_depth': [5, 6, 7, 8, 9],
+                       'criterion': ['gini', 'entropy']
+                       }
+
+         model = DecisionTreeClassifier()
+         model = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=False, scoring='roc_auc', refit=True)
+         return model
+
+     @staticmethod
+     def get_neural_network_model(**params_nn):
+         """
+         Return a neural network model wrapped in a grid search.
+         :return: GridSearchCV over MLPClassifier
+         """
+         if not all(params_nn.values()):
+             params_nn = {'solver': ['lbfgs'],
+                          'max_iter': [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
+                          'alpha': 10.0 ** -np.arange(1, 10),
+                          'hidden_layer_sizes': np.arange(10, 15),
+                          'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
+
+         model_nn = MLPClassifier()
+         model_nn = GridSearchCV(model_nn, params_nn, n_jobs=-1, scoring='roc_auc', refit=True, verbose=False)
+         return model_nn
+
+     @staticmethod
+     def get_random_forest_model(**params_rf):
+         """
+         Return a random forest model wrapped in a grid search.
+         :return: GridSearchCV over RandomForestClassifier
+         """
+         if not all(params_rf.values()):
+             params_rf = {"max_depth": [20],
+                          "min_samples_split": [10],
+                          "max_leaf_nodes": [175],
+                          "min_samples_leaf": [5],
+                          "n_estimators": [250],
+                          "max_features": ["sqrt"],
+                          }
+
+         model_rf = RandomForestClassifier()
+         model_rf = GridSearchCV(model_rf, params_rf, cv=3, n_jobs=-1, verbose=False, scoring='roc_auc', refit=True)
+
+         return model_rf
+
+     @staticmethod
+     def get_light_gbm_model(**params_lgb):
+         """
+         Return a LightGBM model wrapped in a grid search.
+         :return: GridSearchCV over LGBMClassifier
+         """
+         if not all(params_lgb.values()):
+             params_lgb = {
+                 'learning_rate': [0.005, 0.01],
+                 'n_estimators': [8, 16, 24],
+                 'num_leaves': [6, 8, 12, 16],  # a large num_leaves helps accuracy but may overfit
+                 'boosting_type': ['gbdt', 'dart'],  # for better accuracy, try dart
+                 'objective': ['binary'],
+                 'max_bin': [255, 510],  # a large max_bin helps accuracy but slows down training
+                 'random_state': [500],
+                 'colsample_bytree': [0.64, 0.65, 0.66],
+                 'subsample': [0.7, 0.75],
+                 'reg_alpha': [1, 1.2],
+                 'reg_lambda': [1, 1.2, 1.4],
+             }
+
+         model = lgb.LGBMClassifier()
+         model = GridSearchCV(model, params_lgb, verbose=False, cv=3, n_jobs=-1, scoring='roc_auc', refit=True)
+
+         return model
+
+     @staticmethod
+     def get_xgboost_model(**params_xgb):
+         """
+         Return an XGBoost model wrapped in a grid search.
+         :return: GridSearchCV over XGBClassifier
+         """
+         if not all(params_xgb.values()):
+             params_xgb = {
+                 'nthread': [4],  # with hyper-threading, xgboost may become slower
+                 'objective': ['binary:logistic'],
+                 'learning_rate': [0.05],  # the so-called `eta` value
+                 'max_depth': [6],
+                 'min_child_weight': [11],
+                 'silent': [1],
+                 'subsample': [0.8],
+                 'colsample_bytree': [0.7],
+                 'n_estimators': [100],  # number of trees; raise to 1000 for better results
+                 'missing': [-999],
+                 'seed': [1337]
+             }
+         model = GridSearchCV(xgb.XGBClassifier(), params_xgb, n_jobs=-1,
+                              cv=3,
+                              scoring='roc_auc',
+                              refit=True)
+
+         return model
+
+     def fit_and_eval_model(self, x_train, x_test, y_train, y_test):
+         """
+         Fit the wrapped grid search and print evaluation metrics on the test set.
+         :param x_train: training features
+         :param x_test: test features
+         :param y_train: training targets
+         :param y_test: test targets
+         :return: (best_estimator, accuracy, roc_auc, cohen_kappa, time_taken)
+         """
+         fitted_model, accuracy, roc_auc, coh_kap, time_taken = \
+             self.__run_model(self.model, x_train, y_train, x_test, y_test)
+         return fitted_model, accuracy, roc_auc, coh_kap, time_taken
+
+     @staticmethod
+     def get_gradient_boosting_model(**params):
+         """
+         Return a gradient boosting model wrapped in a grid search.
+         :param params: optional parameter grid overriding the default
+         :return: GridSearchCV over GradientBoostingClassifier
+         """
+         if not all(params.values()):
+             params = {"learning_rate": [0.01, 0.02, 0.03],
+                       "min_samples_split": [5, 10],
+                       "min_samples_leaf": [3, 5],
+                       "max_depth": [3, 5, 10],
+                       "max_features": ["sqrt"],
+                       "n_estimators": [100, 200]
+                       }
+         model = GradientBoostingClassifier(random_state=100)
+         return GridSearchCV(model, params, cv=3, n_jobs=-1)
+
+
+ base_df, data_df = data_preparing()
+ x_train, x_test, y_train, y_test = create_dataset(data_df)
+ ml_model = MLModel(DEFAULT_MODEL)
+ ml_model.fit_and_eval_model(x_train, x_test, y_train, y_test)
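Because every supported classifier is wrapped in the same GridSearchCV interface, swapping models is a one-line change. A minimal sketch (note that importing ml.model already runs the data preparation and trains the default model once):

    from ml.model import MLModel, x_train, x_test, y_train, y_test

    alt_model = MLModel("XGBClassifier")  # any name from SUPPORT_MODEL
    alt_model.fit_and_eval_model(x_train, x_test, y_train, y_test)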
ml/predictor.py ADDED
@@ -0,0 +1,278 @@
+ import os.path
+ from operator import itemgetter
+ from typing import Text, Tuple
+
+ import numpy as np
+ import pandas as pd
+
+ from configs.config import cfg
+ from configs.constants import DATA_ROOT
+ from ml.model import MLModel
+ from ml.utils import load_pickle
+
+
+ class Predictor:
+     """
+     A match predictor using ML
+     """
+
+     def __init__(self, base_df: pd.DataFrame, model: MLModel):
+         self.model = model
+         self.base_df = base_df
+
+     def find_stats(self, team):
+         """
+         Collect the latest rolling statistics for a team from its last game.
+         :param team: name of the team, e.g. Qatar
+         :return: list of the nine features describing the team
+         """
+         last_game = self.base_df[(self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)].tail(1)
+
+         if last_game["home_team"].values[0] == team:
+             team_rank = last_game["rank_home"].values[0]
+             team_goals = last_game["home_goals_mean"].values[0]
+             team_goals_l5 = last_game["home_goals_mean_l5"].values[0]
+             team_goals_suf = last_game["home_goals_suf_mean"].values[0]
+             team_goals_suf_l5 = last_game["home_goals_suf_mean_l5"].values[0]
+             team_rank_suf = last_game["home_rank_mean"].values[0]
+             team_rank_suf_l5 = last_game["home_rank_mean_l5"].values[0]
+             team_gp_rank = last_game["home_game_points_rank_mean"].values[0]
+             team_gp_rank_l5 = last_game["home_game_points_rank_mean_l5"].values[0]
+         else:
+             team_rank = last_game["rank_away"].values[0]
+             team_goals = last_game["away_goals_mean"].values[0]
+             team_goals_l5 = last_game["away_goals_mean_l5"].values[0]
+             team_goals_suf = last_game["away_goals_suf_mean"].values[0]
+             team_goals_suf_l5 = last_game["away_goals_suf_mean_l5"].values[0]
+             team_rank_suf = last_game["away_rank_mean"].values[0]
+             team_rank_suf_l5 = last_game["away_rank_mean_l5"].values[0]
+             team_gp_rank = last_game["away_game_points_rank_mean"].values[0]
+             team_gp_rank_l5 = last_game["away_game_points_rank_mean_l5"].values[0]
+
+         return [team_rank, team_goals, team_goals_l5, team_goals_suf, team_goals_suf_l5, team_rank_suf,
+                 team_rank_suf_l5, team_gp_rank, team_gp_rank_l5]
+
+     @staticmethod
+     def find_features(team_1, team_2):
+         """
+         Build the difference features for a game, treating team_1 as home.
+         :param team_1: stats list from find_stats() for the first team
+         :param team_2: stats list from find_stats() for the second team
+         :return: feature vector in the order the model was trained on
+         """
+         rank_dif = team_1[0] - team_2[0]
+         goals_dif = team_1[1] - team_2[1]
+         goals_dif_l5 = team_1[2] - team_2[2]
+         goals_suf_dif = team_1[3] - team_2[3]
+         goals_suf_dif_l5 = team_1[4] - team_2[4]
+         goals_per_ranking_dif = (team_1[1] / team_1[5]) - (team_2[1] / team_2[5])
+         dif_rank_agst = team_1[5] - team_2[5]
+         dif_rank_agst_l5 = team_1[6] - team_2[6]
+         dif_gp_rank = team_1[7] - team_2[7]
+         dif_gp_rank_l5 = team_1[8] - team_2[8]
+
+         # the trailing 1, 0 are the is_friendly_0 / is_friendly_1 dummies
+         # (predicted games are treated as non-friendly)
+         return [rank_dif, goals_dif, goals_dif_l5, goals_suf_dif, goals_suf_dif_l5, goals_per_ranking_dif,
+                 dif_rank_agst, dif_rank_agst_l5, dif_gp_rank, dif_gp_rank_l5, 1, 0]
+
+     def __predict(self, team_1: Text, team_2: Text):
+         team_1_stat = self.find_stats(team_1)
+         team_2_stat = self.find_stats(team_2)
+
+         # predict the game twice with the teams swapped and average the
+         # probabilities, so the home-side encoding does not bias the result
+         features_g1 = self.find_features(team_1_stat, team_2_stat)
+         features_g2 = self.find_features(team_2_stat, team_1_stat)
+
+         probs_g1 = self.model.predict_proba([features_g1])
+         probs_g2 = self.model.predict_proba([features_g2])
+         team_1_prob_g1 = probs_g1[0][0]
+         team_1_prob_g2 = probs_g2[0][1]
+         team_2_prob_g1 = probs_g1[0][1]
+         team_2_prob_g2 = probs_g2[0][0]
+
+         team_1_prob = (probs_g1[0][0] + probs_g2[0][1]) / 2
+         team_2_prob = (probs_g2[0][0] + probs_g1[0][1]) / 2
+
+         return team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2
+
+     def predict(self, team_1: Text, team_2: Text) -> Tuple[bool, Text, float]:
+         """
+         Predict the outcome of a single game.
+         :param team_1: name of the first team
+         :param team_2: name of the second team
+         :return: (draw, winner, winner_proba); winner is "" when it is a draw
+         """
+         draw = False
+         team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
+             team_1, team_2)
+         winner, winner_proba = "", 0.0
+         # when the two swapped predictions disagree about the winner,
+         # the game is called a draw
+         if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
+                 (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+             draw = True
+
+         elif team_1_prob > team_2_prob:
+             winner = team_1
+             winner_proba = team_1_prob
+
+         elif team_2_prob > team_1_prob:
+             winner = team_2
+             winner_proba = team_2_prob
+         return draw, winner, winner_proba
+
+     def predict_all_matches(self) -> Text:
+         """
+         Predict all the matches in the tournament: the group stage first,
+         then the knockout rounds built from the group results.
+         :return: a human-readable report of the whole simulation
+         """
+         result = ""
+         data = load_pickle(os.path.join(DATA_ROOT, cfg.data.table_matches))
+         table = data['table']
+         matches = data['matches']
+         advanced_group, last_group = [], ""
+
+         for teams in matches:
+             draw = False
+             team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
+                 teams[1], teams[2])
+             winner, winner_proba = "", 0.0
+             if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
+                     (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+                 draw = True
+                 for i in table[teams[0]]:
+                     if i[0] == teams[1] or i[0] == teams[2]:
+                         i[1] += 1
+
+             elif team_1_prob > team_2_prob:
+                 winner = teams[1]
+                 winner_proba = team_1_prob
+                 for i in table[teams[0]]:
+                     if i[0] == teams[1]:
+                         i[1] += 3
+
+             elif team_2_prob > team_1_prob:
+                 winner = teams[2]
+                 winner_proba = team_2_prob
+                 for i in table[teams[0]]:
+                     if i[0] == teams[2]:
+                         i[1] += 3
+
+             for i in table[teams[0]]:  # adding tiebreaker (probs per game)
+                 if i[0] == teams[1]:
+                     i[2].append(team_1_prob)
+                 if i[0] == teams[2]:
+                     i[2].append(team_2_prob)
+
+             if last_group != teams[0]:
+                 if last_group != "":
+                     result += "\n"
+                     result += "Group %s advanced: \n" % last_group
+                     for i in table[last_group]:  # adding tiebreaker
+                         i[2] = np.mean(i[2])
+
+                     final_points = table[last_group]
+                     final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
+                     advanced_group.append([final_table[0][0], final_table[1][0]])
+                     for i in final_table:
+                         result += "%s -------- %d\n" % (i[0], i[1])
+                 result += "\n"
+                 result += "-" * 10 + " Starting Analysis for Group %s " % (teams[0]) + "-" * 10 + "\n"
+
+             if draw is False:
+                 result += "Group %s - %s vs. %s: Winner %s with %.2f probability\n" % (
+                     teams[0], teams[1], teams[2], winner, winner_proba)
+             else:
+                 result += "Group %s - %s vs. %s: Draw\n" % (teams[0], teams[1], teams[2])
+             last_group = teams[0]
+         result += "\n"
+         result += "Group %s advanced: \n" % last_group
+
+         for i in table[last_group]:  # adding tiebreaker
+             i[2] = np.mean(i[2])
+
+         final_points = table[last_group]
+         final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
+         advanced_group.append([final_table[0][0], final_table[1][0]])
+         for i in final_table:
+             result += "%s -------- %d\n" % (i[0], i[1])
+
+         advanced = advanced_group
+         playoffs = {"Round of 16": [], "Quarter-Final": [], "Semi-Final": [], "Final": []}
+
+         actual_round = ""
+         next_rounds = []
+
+         for p in playoffs.keys():
+             if p == "Round of 16":
+                 # build the bracket: group winners face the runners-up of the paired group
+                 control = []
+                 for a in range(0, len(advanced * 2), 1):
+                     if a < len(advanced):
+                         if a % 2 == 0:
+                             control.append((advanced * 2)[a][0])
+                         else:
+                             control.append((advanced * 2)[a][1])
+                     else:
+                         if a % 2 == 0:
+                             control.append((advanced * 2)[a][1])
+                         else:
+                             control.append((advanced * 2)[a][0])
+                 playoffs[p] = [[control[c], control[c + 1]] for c in range(0, len(control) - 1, 1) if c % 2 == 0]
+
+                 for i in range(0, len(playoffs[p]), 1):
+                     game = playoffs[p][i]
+
+                     home = game[0]
+                     away = game[1]
+
+                     team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
+                         self.__predict(home, away)
+                     if actual_round != p:
+                         result += "-" * 10 + "\n"
+                         result += "Starting simulation of %s\n" % p
+                         result += "-" * 10 + "\n"
+
+                     if team_1_prob < team_2_prob:
+                         result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, away, team_2_prob)
+                         next_rounds.append(away)
+                     else:
+                         result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, home, team_1_prob)
+                         next_rounds.append(home)
+
+                     game.append([team_1_prob, team_2_prob])
+                     playoffs[p][i] = game
+                     actual_round = p
+
+             else:
+                 # pair the winners of the previous round
+                 playoffs[p] = [[next_rounds[c], next_rounds[c + 1]] for c in range(0, len(next_rounds) - 1, 1) if
+                                c % 2 == 0]
+                 next_rounds = []
+                 for i in range(0, len(playoffs[p])):
+                     game = playoffs[p][i]
+                     home = game[0]
+                     away = game[1]
+
+                     team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
+                         self.__predict(home, away)
+                     if actual_round != p:
+                         result += "-" * 10 + "\n"
+                         result += "Starting simulation of %s\n" % p
+                         result += "-" * 10 + "\n"
+
+                     if team_1_prob < team_2_prob:
+                         result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, away, team_2_prob)
+                         next_rounds.append(away)
+                     else:
+                         result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, home, team_1_prob)
+                         next_rounds.append(home)
+                     game.append([team_1_prob, team_2_prob])
+                     playoffs[p][i] = game
+                     actual_round = p
+
+         print(result)
+         return result
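For reference, a minimal sketch of driving the predictor directly, mirroring what app.py does (the team names are illustrative and must occur in the prepared data):

    from ml.model import base_df, ml_model
    from ml.predictor import Predictor

    predictor = Predictor(base_df, ml_model)
    draw, winner, winner_proba = predictor.predict("Brazil", "France")
    report = predictor.predict_all_matches()  # full group stage + knockout simulation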
ml/utils.py ADDED
@@ -0,0 +1,36 @@
+ import pickle
+
+
+ def write_pickle(path, obj):
+     """
+     Pickle an object to disk.
+
+     Args:
+         path: the path of the *.pkl file to write
+         obj: the object to pickle
+
+     Returns:
+         True on success, False otherwise.
+     """
+     try:
+         with open(path, 'wb') as handle:
+             pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
+         return True
+     except Exception as e:
+         print(e)
+         return False
+
+
+ def load_pickle(path):
+     """
+     Load a pickled object from disk.
+
+     Args:
+         path: the path of the *.pkl file to read
+
+     Returns:
+         The unpickled object.
+     """
+     with open(path, 'rb') as handle:
+         data = pickle.load(handle)
+     return data
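A quick round-trip sketch of these helpers (the file name is illustrative):

    from ml.utils import write_pickle, load_pickle

    if write_pickle("example.pkl", {"table": {}, "matches": []}):
        data = load_pickle("example.pkl")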
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ lightgbm~=3.3.3
+ matplotlib~=3.6.2
+ pandas~=1.5.1
+ xgboost~=1.7.1
+ scikit-learn~=1.1.3
+ omegaconf~=2.2.3
+ numpy~=1.23.5
+ Flask~=2.2.2
+ gradio~=3.10.1