Spaces:
Build error
Build error
phong.dao
committed on
Commit
·
49a060f
1
Parent(s):
205b17a
init app
Browse files- .gitignore +2 -0
- app.py +32 -0
- configs/app_configs.yaml +1 -0
- configs/base.yaml +8 -0
- configs/config.py +15 -0
- configs/constants.py +16 -0
- data/fifa_ranking-2022-10-06.csv +0 -0
- data/results.csv +0 -0
- data/table_match.pkl +0 -0
- ml/__init__.py +0 -0
- ml/data_prepare.py +246 -0
- ml/model.py +247 -0
- ml/predictor.py +274 -0
- ml/utils.py +34 -0
- requirements.txt +10 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
venv/
|
2 |
+
.idea/
|
app.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import random
|
3 |
+
|
4 |
+
from ml.model import base_df, ml_model
|
5 |
+
from ml.predictor import Predictor
|
6 |
+
|
7 |
+
|
8 |
+
def function(team1, team2):
    """Predict the outcome of a match between two national teams.

    :param team1: name of the first team
    :param team2: name of the second team
    :return: dict with the predicted result and its probability
    """
    draw, winner, winner_proba = predictor.predict(team1, team2)
    if draw:
        # NOTE(review): the draw "probability" is synthesized at random,
        # not produced by the model — confirm this is intentional UI filler.
        return {
            'result': "Draw!",
            'probability': round(random.uniform(0.7, 0.9), 10)
        }
    return {
        'result': winner,
        'probability': winner_proba
    }
|
26 |
+
|
27 |
+
|
28 |
+
# Wire the trained model into the predictor and expose it via Gradio.
predictor = Predictor(base_df, ml_model)
iface = gr.Interface(
    fn=function,
    inputs=[gr.Textbox(value="Team 1"), gr.Textbox(value="Team 2")],
    outputs="json",
)
iface.launch()
|
configs/app_configs.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
DEBUG: False
|
configs/base.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
result_url: https://raw.githubusercontent.com/martj42/international_results/master/results.csv
|
3 |
+
result_file: results.csv
|
4 |
+
rank_file: fifa_ranking-2022-10-06.csv
|
5 |
+
table_matches: table_match.pkl
|
6 |
+
|
7 |
+
day_get_rank: 2020-1-1 # Format: YYYY-MM-DD
|
8 |
+
day_get_result: 2018-1-1
|
configs/config.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Text, Union
|
3 |
+
|
4 |
+
from omegaconf import OmegaConf, DictConfig, ListConfig
|
5 |
+
|
6 |
+
|
7 |
+
def get_config(config_file: Text = 'base') -> Union[DictConfig, ListConfig]:
    """Load an OmegaConf configuration located next to this module.

    :param config_file: config name or file name; ".yaml" is appended only
        when neither a ".yaml" nor a ".yml" suffix is already present.
    :return: the parsed OmegaConf configuration object.
    """
    # BUG FIX: the original used `or` between the two negated endswith()
    # checks, which is always True (no string ends with both suffixes),
    # so "base.yml" was mangled into "base.yml.yaml".
    if not config_file.endswith((".yaml", ".yml")):
        config_file += ".yaml"
    root_configs_dir = os.path.abspath(os.path.join(__file__, ".."))
    job_cfg = OmegaConf.load(os.path.join(root_configs_dir, config_file))
    return job_cfg


# Module-level default configuration, loaded from base.yaml at import time.
cfg = get_config()
|
configs/constants.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
|
4 |
+
# Absolute path to the repository's data/ directory (configs/../data).
DATA_ROOT = os.path.abspath(os.path.join(__file__, "../..", "data"))

# MODEL
# Classifier names accepted by ml.model.MLModel.
SUPPORT_MODEL = (
    "LogisticRegression",
    "DecisionTreeClassifier",
    "MLPClassifier",
    "RandomForestClassifier",
    "LGBMClassifier",
    "XGBClassifier",
    "GradientBoostingClassifier"
)
# Model used when no explicit choice is made (see ml/model.py).
DEFAULT_MODEL = "GradientBoostingClassifier"
|
data/fifa_ranking-2022-10-06.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/results.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/table_match.pkl
ADDED
Binary file (1.89 kB). View file
|
|
ml/__init__.py
ADDED
File without changes
|
ml/data_prepare.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The data process is base on https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml
|
3 |
+
"""
|
4 |
+
import os.path
|
5 |
+
|
6 |
+
import pandas as pd
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
|
9 |
+
from configs.config import cfg
|
10 |
+
from configs.constants import DATA_ROOT
|
11 |
+
|
12 |
+
|
13 |
+
def result_finder(home, away):
    """Encode a final score as [result_code, home_points, away_points].

    result_code: 0 = home win, 1 = away win, 2 = draw; the points follow
    the usual 3/1/0 football scheme.

    :param home: goals scored by the home team
    :param away: goals scored by the away team
    :return: pd.Series of [result_code, home_points, away_points]
    """
    if home > away:
        encoded = [0, 3, 0]
    elif home < away:
        encoded = [1, 0, 3]
    else:
        encoded = [2, 1, 1]
    return pd.Series(encoded)
|
26 |
+
|
27 |
+
|
28 |
+
def create_dataset(df: pd.DataFrame):
    """Split the model frame into train/test features and targets.

    Features are every column from index 3 onward; the label is the
    "target" column.

    :param df: model dataframe produced by create_db
    :return: x_train, x_test, y_train, y_test
    """
    features = df.iloc[:, 3:]
    labels = df[["target"]]
    return train_test_split(features, labels, test_size=0.22, random_state=100)
|
38 |
+
|
39 |
+
|
40 |
+
def data_preparing():
    """Prepare match/ranking data and engineer the model features.

    Loads the results CSV (remote URL with a fallback to the bundled copy),
    joins the FIFA ranking table, derives per-team historical statistics and
    returns both the full feature frame and the model-ready frame.

    :return: (feature_df, model_db) tuple of dataframes
    """
    try:
        df = pd.read_csv(cfg.data.result_url)
    except Exception as e:
        # Best-effort download: fall back to the local copy on any failure.
        print(e)
        df = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.result_file))
    df["date"] = pd.to_datetime(df["date"])
    df.dropna(inplace=True)
    df = df[(df["date"] >= cfg.day_get_result)].reset_index(drop=True)

    # RANK data prepare
    rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
    rank["rank_date"] = pd.to_datetime(rank["rank_date"])
    rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True)
    rank["country_full"] = rank["country_full"].str.replace("IR Iran", "Iran").str.replace(
        "Korea Republic", "South Korea").str.replace("USA", "United States")

    # The merge is made in order to get a dataset FIFA games and its rankings.
    # FIX: fillna(method='ffill') is deprecated in modern pandas -> .ffill().
    rank = rank.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample(
        'D').first().ffill().reset_index()
    df_wc_ranked = df.merge(
        rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
        left_on=["date", "home_team"], right_on=["rank_date", "country_full"]).drop(
        ["rank_date", "country_full"], axis=1)

    df_wc_ranked = df_wc_ranked.merge(
        rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
        left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away")).drop(
        ["rank_date", "country_full"], axis=1)

    # Featuring
    df = df_wc_ranked

    df[["result", "home_team_points", "away_team_points"]] = df.apply(
        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)

    # Helper columns for the features: ranking difference, points won vs.
    # faced team's rank, and goal difference in the game.
    df["rank_dif"] = df["rank_home"] - df["rank_away"]
    df["sg"] = df["home_score"] - df["away_score"]
    df["points_home_by_rank"] = df["home_team_points"] / df["rank_away"]
    df["points_away_by_rank"] = df["away_team_points"] / df["rank_home"]

    # Split home/away views, unify them, compute the past-game values, then
    # separate and merge back — one pass for both sides' statistics.
    # FIX: take explicit copies so the column renames below never hit a view.
    home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away", "rank_change_home",
                    "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]].copy()

    away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home", "rank_change_away",
                    "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]].copy()
    home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf")
                         for h in home_team.columns]

    away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf")
                         for a in away_team.columns]
    # FIX: DataFrame.append was removed in pandas 2.0 -> use pd.concat.
    team_stats = pd.concat([home_team, away_team])

    stats_val = []

    for index, row in team_stats.iterrows():
        team = row["team"]
        date = row["date"]
        past_games = team_stats.loc[
            (team_stats["team"] == team) & (team_stats["date"] < date)
        ].sort_values(by=['date'], ascending=False)
        last5 = past_games.head(5)

        goals = past_games["score"].mean()
        goals_l5 = last5["score"].mean()

        goals_suf = past_games["suf_score"].mean()
        goals_suf_l5 = last5["suf_score"].mean()

        rank = past_games["rank_suf"].mean()
        rank_l5 = last5["rank_suf"].mean()

        if len(last5) > 0:
            points = past_games["total_points"].values[0] - past_games["total_points"].values[
                -1]  # amount of points earned
            points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1]
        else:
            points = 0
            points_l5 = 0

        gp = past_games["team_points"].mean()
        gp_l5 = last5["team_points"].mean()

        gp_rank = past_games["points_by_rank"].mean()
        gp_rank_l5 = last5["points_by_rank"].mean()

        stats_val.append(
            [goals, goals_l5, goals_suf, goals_suf_l5, rank, rank_l5, points, points_l5, gp, gp_l5, gp_rank,
             gp_rank_l5])

    stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5",
                  "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5",
                  "game_points_rank_mean", "game_points_rank_mean_l5"]

    stats_df = pd.DataFrame(stats_val, columns=stats_cols)

    full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)

    # First half of the unified frame is the home view, second half the away view.
    home_team_stats = full_df.iloc[:int(full_df.shape[0] / 2), :]
    away_team_stats = full_df.iloc[int(full_df.shape[0] / 2):, :]

    home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
    away_team_stats = away_team_stats[away_team_stats.columns[-12:]]

    home_team_stats.columns = ['home_' + str(col) for col in home_team_stats.columns]
    away_team_stats.columns = ['away_' + str(col) for col in away_team_stats.columns]

    # Re-attach home/away stats side by side and merge into the match frame.
    match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)

    full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)

    # Flag friendly games and one-hot encode the flag.
    full_df["is_friendly"] = full_df["tournament"].apply(find_friendly)
    full_df = pd.get_dummies(full_df, columns=["is_friendly"])

    base_df = full_df[
        ["date", "home_team", "away_team", "rank_home", "rank_away", "home_score", "away_score", "result",
         "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
         'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
         'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
         'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
         'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
         'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5', 'home_game_points_mean',
         'home_game_points_mean_l5',
         'home_game_points_rank_mean', 'home_game_points_rank_mean_l5', 'away_game_points_mean',
         'away_game_points_mean_l5', 'away_game_points_rank_mean',
         'away_game_points_rank_mean_l5',
         'is_friendly_0', 'is_friendly_1']]

    # FIX: .copy() so the target assignment below does not trigger
    # SettingWithCopyWarning on a slice of base_df.
    df = base_df.dropna().copy()

    df["target"] = df["result"].apply(no_draw)

    model_db = create_db(df)

    return df, model_db
|
190 |
+
|
191 |
+
|
192 |
+
def find_friendly(x):
    """Return 1 when the tournament name marks a friendly match, else 0.

    :param x: tournament name
    :return: 1 for "Friendly", 0 otherwise
    """
    return 1 if x == "Friendly" else 0
|
202 |
+
|
203 |
+
|
204 |
+
def create_db(df):
    """Derive home-vs-away difference features and select the model columns.

    :param df: full feature dataframe from data_preparing
    :return: dataframe with team names, target and the difference features
    """
    columns = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean",
               "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5",
               "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5",
               "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean",
               "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5",
               "is_friendly_0", "is_friendly_1"]

    base = df.loc[:, columns]

    def _dif(home_col, away_col):
        # Difference of one paired home/away statistic.
        return base[home_col] - base[away_col]

    base.loc[:, "goals_dif"] = _dif("home_goals_mean", "away_goals_mean")
    base.loc[:, "goals_dif_l5"] = _dif("home_goals_mean_l5", "away_goals_mean_l5")
    base.loc[:, "goals_suf_dif"] = _dif("home_goals_suf_mean", "away_goals_suf_mean")
    base.loc[:, "goals_suf_dif_l5"] = _dif("home_goals_suf_mean_l5", "away_goals_suf_mean_l5")
    base.loc[:, "goals_per_ranking_dif"] = (base["home_goals_mean"] / base["home_rank_mean"]) - (
        base["away_goals_mean"] / base["away_rank_mean"])
    base.loc[:, "dif_rank_agst"] = _dif("home_rank_mean", "away_rank_mean")
    base.loc[:, "dif_rank_agst_l5"] = _dif("home_rank_mean_l5", "away_rank_mean_l5")
    base.loc[:, "dif_points_rank"] = _dif("home_game_points_rank_mean", "away_game_points_rank_mean")
    base.loc[:, "dif_points_rank_l5"] = _dif("home_game_points_rank_mean_l5", "away_game_points_rank_mean_l5")

    return base[
        ["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5",
         "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst", "dif_rank_agst_l5",
         "dif_points_rank", "dif_points_rank_l5", "is_friendly_0", "is_friendly_1"]]
|
235 |
+
|
236 |
+
|
237 |
+
def no_draw(x):
    """Collapse the draw code (2) into 1 so the target becomes binary.

    :param x: result code (0 home win, 1 away win, 2 draw)
    :return: 1 for a draw, otherwise the code unchanged
    """
    return 1 if x == 2 else x
|
ml/model.py
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from typing import Text
|
3 |
+
|
4 |
+
import lightgbm as lgb
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import numpy as np
|
7 |
+
import xgboost as xgb
|
8 |
+
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
9 |
+
from sklearn.linear_model import LogisticRegression
|
10 |
+
from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, plot_confusion_matrix, roc_curve, \
|
11 |
+
classification_report
|
12 |
+
from sklearn.model_selection import GridSearchCV
|
13 |
+
from sklearn.neural_network import MLPClassifier
|
14 |
+
from sklearn.tree import DecisionTreeClassifier
|
15 |
+
|
16 |
+
from configs.constants import SUPPORT_MODEL, DEFAULT_MODEL
|
17 |
+
from ml.data_prepare import data_preparing, create_dataset
|
18 |
+
|
19 |
+
|
20 |
+
def plot_roc_cur(fper, tper):
    """Plot a ROC curve together with the diagonal no-skill reference line.

    :param fper: false positive rates
    :param tper: true positive rates
    """
    plt.plot(fper, tper, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
|
33 |
+
|
34 |
+
|
35 |
+
class MLModel:
    """
    WC predictor model: one of the supported classifiers wrapped in a
    GridSearchCV hyper-parameter search.
    """

    def __init__(self, model_type: Text):
        """
        :param model_type: one of configs.constants.SUPPORT_MODEL
        :raises AssertionError: for an unsupported model type
        """
        assert model_type in SUPPORT_MODEL, \
            "Not support the kind of model. Please choose one of {}".format(SUPPORT_MODEL)
        self.model_type = model_type
        # Dispatch table instead of the original if/elif chain.
        factories = {
            "LogisticRegression": self.get_logistic_regression_model,
            "DecisionTreeClassifier": self.get_decision_tree_model,
            "MLPClassifier": self.get_neural_network_model,
            "RandomForestClassifier": self.get_random_forest_model,
            "RandomForest": self.get_random_forest_model,
            "GradientBoostingClassifier": self.get_gradient_boosting_model,
            "LGBMClassifier": self.get_light_gbm_model,
            "XGBClassifier": self.get_xgboost_model,
        }
        self.model = factories[model_type]()

    def predict_proba(self, x):
        """
        Call predict_proba on the estimator with the best found parameters.

        :param x: feature matrix
        :return: per-class probabilities
        """
        return self.model.predict_proba(x)

    @staticmethod
    def __run_model(model, x_train, y_train, x_test, y_test, verbose=True):
        """
        Fit the grid search, evaluate on the test split and print metrics.

        :return: (best_estimator, accuracy, roc_auc, cohen_kappa, seconds)
        """
        t0 = time.time()
        if verbose is False:
            # NOTE(review): verbose=0 is forwarded to the estimator's fit();
            # only some estimators (e.g. LightGBM) accept it — confirm before
            # calling with verbose=False on a plain sklearn estimator.
            model.fit(x_train.values, np.ravel(y_train), verbose=0)
        else:
            model.fit(x_train.values, np.ravel(y_train))
        model = model.best_estimator_
        # FIX: predict on .values like fit() did, so the estimator never sees
        # a mix of named and unnamed feature input.
        y_pred = model.predict(x_test.values)
        accuracy = accuracy_score(y_test.values, y_pred)
        roc_auc = roc_auc_score(y_test, model.predict_proba(x_test.values)[:, 1])
        coh_kap = cohen_kappa_score(y_test, y_pred)
        time_taken = time.time() - t0
        print("Accuracy : {}".format(accuracy))
        print("ROC Area under Curve : {}".format(roc_auc))
        print("Cohen's Kappa : {}".format(coh_kap))
        print("Time taken : {}".format(time_taken))
        print(classification_report(y_test, y_pred, digits=5))

        return model, accuracy, roc_auc, coh_kap, time_taken

    @staticmethod
    def get_logistic_regression_model(**params_lr):
        """
        Return a logistic regression grid search.

        :return: GridSearchCV over LogisticRegression
        """
        # BUG FIX: `not all({}.values())` is False, so calling with NO kwargs
        # (as __init__ does) skipped the default grid entirely; also every
        # grid value must be a list — the bare string 'liblinear' would be
        # iterated character by character by GridSearchCV.
        if not params_lr or not all(params_lr.values()):
            params_lr = {
                "C": np.logspace(-3, 3, 7),
                "penalty": ["l1", "l2"],
                'solver': ['liblinear']  # liblinear supports both l1 and l2
            }

        model_lr = LogisticRegression()
        model_lr = GridSearchCV(model_lr, params_lr, cv=3, verbose=False, scoring='roc_auc', refit=True)
        return model_lr

    @staticmethod
    def get_decision_tree_model(**params):
        """
        Return a decision tree grid search.

        :return: GridSearchCV over DecisionTreeClassifier
        """
        if not params or not all(params.values()):
            # FIX: 'auto' was removed from max_features in scikit-learn 1.3.
            params = {'max_features': ['sqrt', 'log2'],
                      'ccp_alpha': [0.1, .01, .001],
                      'max_depth': [5, 6, 7, 8, 9],
                      'criterion': ['gini', 'entropy']
                      }

        model = DecisionTreeClassifier()
        model = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=False, scoring='roc_auc', refit=True)
        return model

    @staticmethod
    def get_neural_network_model(**params_nn):
        """
        Return a neural network grid search.

        :return: GridSearchCV over MLPClassifier
        """
        if not params_nn or not all(params_nn.values()):
            params_nn = {'solver': ['lbfgs'],
                         'max_iter': [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
                         'alpha': 10.0 ** -np.arange(1, 10),
                         'hidden_layer_sizes': np.arange(10, 15),
                         'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}

        model_nn = MLPClassifier()
        model_nn = GridSearchCV(model_nn, params_nn, n_jobs=-1, scoring='roc_auc', refit=True, verbose=False)
        return model_nn

    @staticmethod
    def get_random_forest_model(**params_rf):
        """
        Return a random forest grid search.

        :return: GridSearchCV over RandomForestClassifier
        """
        if not params_rf or not all(params_rf.values()):
            params_rf = {"max_depth": [20],
                         "min_samples_split": [10],
                         "max_leaf_nodes": [175],
                         "min_samples_leaf": [5],
                         "n_estimators": [250],
                         "max_features": ["sqrt"],
                         }

        model_rf = RandomForestClassifier()
        model_rf = GridSearchCV(model_rf, params_rf, cv=3, n_jobs=-1, verbose=False, scoring='roc_auc', refit=True)

        return model_rf

    @staticmethod
    def get_light_gbm_model(**params_lgb):
        """
        Return a LightGBM grid search.

        :return: GridSearchCV over LGBMClassifier
        """
        if not params_lgb or not all(params_lgb.values()):
            params_lgb = {
                'learning_rate': [0.005, 0.01],
                'n_estimators': [8, 16, 24],
                'num_leaves': [6, 8, 12, 16],  # large num_leaves helps improve accuracy but might lead to over-fitting
                'boosting_type': ['gbdt', 'dart'],  # for better accuracy -> try dart
                'objective': ['binary'],
                'max_bin': [255, 510],  # large max_bin helps improve accuracy but might slow down training progress
                'random_state': [500],
                'colsample_bytree': [0.64, 0.65, 0.66],
                'subsample': [0.7, 0.75],
                'reg_alpha': [1, 1.2],
                'reg_lambda': [1, 1.2, 1.4],
            }

        model = lgb.LGBMClassifier()
        model = GridSearchCV(model, params_lgb, verbose=False, cv=3, n_jobs=-1, scoring='roc_auc', refit=True)

        return model

    @staticmethod
    def get_xgboost_model(**params_xgb):
        """
        Return an XGBoost grid search.

        :return: GridSearchCV over XGBClassifier
        """
        if not params_xgb or not all(params_xgb.values()):
            params_xgb = {
                'nthread': [4],  # when use hyper thread, xgboost may become slower
                'objective': ['binary:logistic'],
                'learning_rate': [0.05],  # so called `eta` value
                'max_depth': [6],
                'min_child_weight': [11],
                'silent': [1],
                'subsample': [0.8],
                'colsample_bytree': [0.7],
                'n_estimators': [100],  # number of trees, change it to 1000 for better results
                'missing': [-999],
                'seed': [1337]
            }
        model = GridSearchCV(xgb.XGBClassifier(), params_xgb, n_jobs=-1,
                             cv=3,
                             scoring='roc_auc',
                             refit=True)

        return model

    def fit_and_eval_model(self, x_train, x_test, y_train, y_test):
        """
        Fit the wrapped model on the train split and evaluate on the test split.

        :param x_train: training features
        :param x_test: test features
        :param y_train: training targets
        :param y_test: test targets
        :return: (best_estimator, accuracy, roc_auc, cohen_kappa, seconds)
        """
        return self.__run_model(self.model, x_train, y_train, x_test, y_test)

    @staticmethod
    def get_gradient_boosting_model(**params):
        """
        Return a gradient boosting grid search.

        :param params: optional explicit parameter grid (values must be lists)
        :return: GridSearchCV over GradientBoostingClassifier
        """
        if not params or not all(params.values()):
            params = {"learning_rate": [0.01, 0.02, 0.03],
                      "min_samples_split": [5, 10],
                      "min_samples_leaf": [3, 5],
                      "max_depth": [3, 5, 10],
                      "max_features": ["sqrt"],
                      "n_estimators": [100, 200]
                      }
        model = GradientBoostingClassifier(random_state=100)
        return GridSearchCV(model, params, cv=3, n_jobs=-1)
|
242 |
+
|
243 |
+
|
244 |
+
# NOTE(review): everything below runs at import time (ml.model is imported
# by app.py), so importing this module triggers data download/feature
# engineering and a full grid-search fit.
base_df, data_df = data_preparing()
x_train, x_test, y_train, y_test = create_dataset(data_df)
ml_model = MLModel(DEFAULT_MODEL)
ml_model.fit_and_eval_model(x_train, x_test, y_train, y_test)
|
ml/predictor.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os.path
|
2 |
+
from operator import itemgetter
|
3 |
+
from typing import Text, Tuple
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
|
8 |
+
from configs.config import cfg
|
9 |
+
from configs.constants import DATA_ROOT
|
10 |
+
from ml.model import MLModel
|
11 |
+
from ml.utils import load_pickle
|
12 |
+
|
13 |
+
|
14 |
+
class Predictor:
|
15 |
+
"""
|
16 |
+
A match predictor using ML
|
17 |
+
"""
|
18 |
+
|
19 |
+
def __init__(self, base_df: pd.DataFrame, model: MLModel):
|
20 |
+
self.model = model
|
21 |
+
self.base_df = base_df
|
22 |
+
|
23 |
+
def find_stats(self, team):
|
24 |
+
"""
|
25 |
+
|
26 |
+
:param team: Name of the team, eg: Qatar, etc.
|
27 |
+
:return:
|
28 |
+
"""
|
29 |
+
|
30 |
+
last_game = self.base_df[(self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)].tail(1)
|
31 |
+
|
32 |
+
if last_game["home_team"].values[0] == team:
|
33 |
+
team_rank = last_game["rank_home"].values[0]
|
34 |
+
team_goals = last_game["home_goals_mean"].values[0]
|
35 |
+
team_goals_l5 = last_game["home_goals_mean_l5"].values[0]
|
36 |
+
team_goals_suf = last_game["home_goals_suf_mean"].values[0]
|
37 |
+
team_goals_suf_l5 = last_game["home_goals_suf_mean_l5"].values[0]
|
38 |
+
team_rank_suf = last_game["home_rank_mean"].values[0]
|
39 |
+
team_rank_suf_l5 = last_game["home_rank_mean_l5"].values[0]
|
40 |
+
team_gp_rank = last_game["home_game_points_rank_mean"].values[0]
|
41 |
+
team_gp_rank_l5 = last_game["home_game_points_rank_mean_l5"].values[0]
|
42 |
+
else:
|
43 |
+
team_rank = last_game["rank_away"].values[0]
|
44 |
+
team_goals = last_game["away_goals_mean"].values[0]
|
45 |
+
team_goals_l5 = last_game["away_goals_mean_l5"].values[0]
|
46 |
+
team_goals_suf = last_game["away_goals_suf_mean"].values[0]
|
47 |
+
team_goals_suf_l5 = last_game["away_goals_suf_mean_l5"].values[0]
|
48 |
+
team_rank_suf = last_game["away_rank_mean"].values[0]
|
49 |
+
team_rank_suf_l5 = last_game["away_rank_mean_l5"].values[0]
|
50 |
+
team_gp_rank = last_game["away_game_points_rank_mean"].values[0]
|
51 |
+
team_gp_rank_l5 = last_game["away_game_points_rank_mean_l5"].values[0]
|
52 |
+
|
53 |
+
return [team_rank, team_goals, team_goals_l5, team_goals_suf, team_goals_suf_l5, team_rank_suf,
|
54 |
+
team_rank_suf_l5, team_gp_rank, team_gp_rank_l5]
|
55 |
+
|
56 |
+
@staticmethod
|
57 |
+
def find_features(team_1, team_2):
|
58 |
+
"""
|
59 |
+
|
60 |
+
:param team_1:
|
61 |
+
:param team_2:
|
62 |
+
:return:
|
63 |
+
"""
|
64 |
+
rank_dif = team_1[0] - team_2[0]
|
65 |
+
goals_dif = team_1[1] - team_2[1]
|
66 |
+
goals_dif_l5 = team_1[2] - team_2[2]
|
67 |
+
goals_suf_dif = team_1[3] - team_2[3]
|
68 |
+
goals_suf_dif_l5 = team_1[4] - team_2[4]
|
69 |
+
goals_per_ranking_dif = (team_1[1] / team_1[5]) - (team_2[1] / team_2[5])
|
70 |
+
dif_rank_agst = team_1[5] - team_2[5]
|
71 |
+
dif_rank_agst_l5 = team_1[6] - team_2[6]
|
72 |
+
dif_gp_rank = team_1[7] - team_2[7]
|
73 |
+
dif_gp_rank_l5 = team_1[8] - team_2[8]
|
74 |
+
|
75 |
+
return [rank_dif, goals_dif, goals_dif_l5, goals_suf_dif, goals_suf_dif_l5, goals_per_ranking_dif,
|
76 |
+
dif_rank_agst, dif_rank_agst_l5, dif_gp_rank, dif_gp_rank_l5, 1, 0]
|
77 |
+
|
78 |
+
def __predict(self, team_1: Text, team_2: Text):
|
79 |
+
|
80 |
+
team_1_stat = self.find_stats(team_1)
|
81 |
+
team_2_stat = self.find_stats(team_2)
|
82 |
+
|
83 |
+
features_g1 = self.find_features(team_1_stat, team_2_stat)
|
84 |
+
features_g2 = self.find_features(team_2_stat, team_1_stat)
|
85 |
+
|
86 |
+
probs_g1 = self.model.predict_proba([features_g1])
|
87 |
+
probs_g2 = self.model.predict_proba([features_g2])
|
88 |
+
team_1_prob_g1 = probs_g1[0][0]
|
89 |
+
team_1_prob_g2 = probs_g2[0][1]
|
90 |
+
team_2_prob_g1 = probs_g1[0][1]
|
91 |
+
team_2_prob_g2 = probs_g2[0][0]
|
92 |
+
|
93 |
+
team_1_prob = (probs_g1[0][0] + probs_g2[0][1]) / 2
|
94 |
+
team_2_prob = (probs_g2[0][0] + probs_g1[0][1]) / 2
|
95 |
+
|
96 |
+
return team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2
|
97 |
+
|
98 |
+
def predict(self, team_1: Text, team_2: Text) -> Tuple[bool, Text, float]:
    """Predict the outcome of a single match.

    :param team_1: name of the first team
    :param team_2: name of the second team
    :return: (draw, winner, winner_proba) — when ``draw`` is True the
             winner name is "" and the probability is 0.0.
    """
    p1_g1, p1_g2, p1, p2, p2_g1, p2_g2 = self.__predict(team_1, team_2)

    # The two orderings disagree on the winner -> call it a draw.
    draw = (p1_g1 > p2_g1 and p2_g2 > p1_g2) or (p1_g1 < p2_g1 and p2_g2 < p1_g2)

    winner, winner_proba = "", 0.0
    if not draw:
        if p1 > p2:
            winner, winner_proba = team_1, p1
        elif p2 > p1:
            winner, winner_proba = team_2, p2
    return draw, winner, winner_proba
|
121 |
+
|
122 |
+
def predict_all_matches(self) -> Text:
    """
    Predict all the matches in the tournament

    Simulates the group stage from the pickled fixture list, accumulates
    points (3 win / 1 draw each) and a mean-probability tiebreaker per team,
    advances the top two of each group, then simulates the knockout rounds.
    :return: a human-readable report of every predicted result
    """
    result = ""
    # data['table'] maps group -> list of [team, points, [probs]] rows;
    # data['matches'] is an ordered list of (group, team_1, team_2) fixtures,
    # grouped so that all of one group's games precede the next group's.
    data = load_pickle(os.path.join(DATA_ROOT, cfg.data.table_matches))
    table = data['table']
    matches = data['matches']
    advanced_group, last_group = [], ""

    for teams in matches:
        draw = False
        team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
            teams[1], teams[2])
        winner, winner_proba = "", 0.0
        # The two model orderings disagree on the winner -> draw: 1 point each.
        if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
                (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
            draw = True
            for i in table[teams[0]]:
                if i[0] == teams[1] or i[0] == teams[2]:
                    i[1] += 1

        elif team_1_prob > team_2_prob:
            winner = teams[1]
            winner_proba = team_1_prob
            for i in table[teams[0]]:
                if i[0] == teams[1]:
                    i[1] += 3

        elif team_2_prob > team_1_prob:
            winner = teams[2]
            winner_proba = team_2_prob
            for i in table[teams[0]]:
                if i[0] == teams[2]:
                    i[1] += 3

        for i in table[teams[0]]:  # adding tiebreaker (probs per game)
            if i[0] == teams[1]:
                i[2].append(team_1_prob)
            if i[0] == teams[2]:
                i[2].append(team_2_prob)

        # Group boundary: flush the finished group's standings before
        # starting the next one (skipped for the very first group).
        if last_group != teams[0]:
            if last_group != "":
                result += "\n"
                result += "Group %s advanced: \n" % last_group
                for i in table[last_group]:  # adding tiebreaker
                    # NOTE: replaces the prob list with its mean in place,
                    # so this flush must run exactly once per group.
                    i[2] = np.mean(i[2])

                final_points = table[last_group]
                final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
                advanced_group.append([final_table[0][0], final_table[1][0]])
                for i in final_table:
                    result += "%s -------- %d\n" % (i[0], i[1])
                result += "\n"
            result += "-" * 10 + " Starting Analysis for Group %s " % (teams[0]) + "-" * 10 + "\n"

        if draw is False:
            result += "Group %s - %s vs. %s: Winner %s with %.2f probability\n" % (
                teams[0], teams[1], teams[2], winner, winner_proba)
        else:
            result += "Group %s - %s vs. %s: Draw\n" % (teams[0], teams[1], teams[2])
        last_group = teams[0]

    # Flush the final group (the loop above only flushes on a group change).
    result += "\n"
    result += "Group %s advanced: \n" % last_group

    for i in table[last_group]:  # adding tiebreaker
        i[2] = np.mean(i[2])

    final_points = table[last_group]
    final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
    advanced_group.append([final_table[0][0], final_table[1][0]])
    for i in final_table:
        result += "%s -------- %d\n" % (i[0], i[1])

    advanced = advanced_group
    playoffs = {"Round of 16": [], "Quarter-Final": [], "Semi-Final": [], "Final": []}

    for p in playoffs.keys():
        playoffs[p] = []

    actual_round = ""
    next_rounds = []

    for p in playoffs.keys():
        if p == "Round of 16":
            # Build the bracket: group winners face runners-up of the
            # paired group (standard World Cup cross-group seeding).
            control = []
            for a in range(0, len(advanced * 2), 1):
                if a < len(advanced):
                    if a % 2 == 0:
                        control.append((advanced * 2)[a][0])
                    else:
                        control.append((advanced * 2)[a][1])
                else:
                    if a % 2 == 0:
                        control.append((advanced * 2)[a][1])
                    else:
                        control.append((advanced * 2)[a][0])
            playoffs[p] = [[control[c], control[c + 1]] for c in range(0, len(control) - 1, 1) if c % 2 == 0]

            for i in range(0, len(playoffs[p]), 1):
                game = playoffs[p][i]

                home = game[0]
                away = game[1]

                team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
                    self.__predict(home, away)
                if actual_round != p:
                    result += "-" * 10 + "\n"
                    result += "Starting simulation of %s\n" % p
                    result += "-" * 10 + "\n"

                # Knockout games cannot draw: higher averaged prob advances
                # (home advances on an exact tie).
                if team_1_prob < team_2_prob:
                    result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, away, team_2_prob)
                    next_rounds.append(away)
                else:
                    result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, home, team_1_prob)
                    next_rounds.append(home)

                game.append([team_1_prob, team_2_prob])
                playoffs[p][i] = game
                actual_round = p

        else:
            # Later rounds pair up the winners of the previous round in order.
            playoffs[p] = [[next_rounds[c], next_rounds[c + 1]] for c in range(0, len(next_rounds) - 1, 1) if
                           c % 2 == 0]
            next_rounds = []
            for i in range(0, len(playoffs[p])):
                game = playoffs[p][i]
                home = game[0]
                away = game[1]

                team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
                    self.__predict(home, away)
                if actual_round != p:
                    result += "-" * 10 + "\n"
                    result += "Starting simulation of %s\n" % p
                    result += "-" * 10 + "\n"

                if team_1_prob < team_2_prob:
                    result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, away, team_2_prob)
                    next_rounds.append(away)
                else:
                    result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, home, team_1_prob)
                    next_rounds.append(home)
                game.append([team_1_prob, team_2_prob])
                playoffs[p][i] = game
                actual_round = p

    print(result)
    return result
|
ml/utils.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
|
3 |
+
|
4 |
+
def write_pickle(path, a):
    """Serialize an object to a pickle file.

    Args:
        path: The path to store the ``*.pkl`` file.
        a: Any picklable object.

    Returns:
        True on success, False when writing failed (the error is printed).
    """
    try:
        with open(path, 'wb') as out_file:
            pickle.dump(a, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as err:
        # Best-effort helper: report the failure instead of raising.
        print(err)
        return False
    return True
|
21 |
+
|
22 |
+
|
23 |
+
def load_pickle(path):
    """Deserialize and return the object stored in a pickle file.

    Args:
        path: Path of the ``*.pkl`` file to read.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
lightgbm~=3.3.3
|
2 |
+
matplotlib~=3.6.2
|
3 |
+
pandas~=1.5.1
|
4 |
+
xgboost~=1.7.1
|
5 |
+
# NOTE: removed "sklearn~=0.0.post1" — the deprecated PyPI dummy package breaks installation; scikit-learn (pinned below) is the real dependency
|
6 |
+
scikit-learn~=1.1.3
|
7 |
+
omegaconf~=2.2.3
|
8 |
+
numpy~=1.23.5
|
9 |
+
Flask~=2.2.2
|
10 |
+
gradio~=3.10.1
|