phong.dao committed on
Commit 49a060f · 1 Parent(s): 205b17a
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv/
+ .idea/
app.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+ import random
+
+ from ml.model import base_df, ml_model
+ from ml.predictor import Predictor
+
+
+ def predict_match(team1, team2):
+     """
+     Predict the result of a match between two national teams.
+     :param team1: name of the first team, e.g. "Brazil"
+     :param team2: name of the second team, e.g. "France"
+     :return: dict with the predicted result and its probability
+     """
+     draw, winner, winner_proba = predictor.predict(team1, team2)
+     if draw:
+         return {
+             'result': "Draw!",
+             # the underlying model is binary, so a draw has no real probability
+             'probability': round(random.uniform(0.7, 0.9), 2)
+         }
+     else:
+         return {
+             'result': winner,
+             'probability': winner_proba
+         }
+
+
+ predictor = Predictor(base_df, ml_model)
+ iface = gr.Interface(fn=predict_match,
+                      inputs=[gr.Textbox(value="Team 1"), gr.Textbox(value="Team 2")],
+                      outputs="json")
+ iface.launch()
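Note: once `iface.launch()` is running (Gradio serves on http://127.0.0.1:7860 by default), the function can also be exercised over HTTP. A minimal sketch, assuming the `/run/predict` REST route that Gradio 3.x exposes (the exact route has varied across 3.x releases, and `requests` is not pinned in requirements.txt; the team names are illustrative):

    import requests

    resp = requests.post("http://127.0.0.1:7860/run/predict",
                         json={"data": ["Brazil", "France"]})
    print(resp.json())  # {"data": [{"result": ..., "probability": ...}], ...}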
configs/app_configs.yaml ADDED
@@ -0,0 +1 @@
+ DEBUG: False
configs/base.yaml ADDED
@@ -0,0 +1,8 @@
+ data:
+   result_url: https://raw.githubusercontent.com/martj42/international_results/master/results.csv
+   result_file: results.csv
+   rank_file: fifa_ranking-2022-10-06.csv
+   table_matches: table_match.pkl
+
+ day_get_rank: 2020-01-01  # Format: YYYY-MM-DD
+ day_get_result: 2018-01-01
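The two `day_get_*` keys are date cutoffs consumed by `ml/data_prepare.py`; a small sketch of the filtering they drive:

    import pandas as pd

    df = pd.DataFrame({"date": pd.to_datetime(["2015-06-01", "2019-03-26"])})
    df[df["date"] >= "2018-01-01"]  # keeps only the 2019 row, as day_get_result does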
configs/config.py ADDED
@@ -0,0 +1,16 @@
+ import os
+ from typing import Text, Union
+
+ from omegaconf import OmegaConf, DictConfig, ListConfig
+
+
+ def get_config(config_file: Text = 'base') -> Union[DictConfig, ListConfig]:
+     # append the default extension only when the name has no YAML suffix yet
+     if not config_file.endswith((".yaml", ".yml")):
+         config_file += ".yaml"
+     root_configs_dir = os.path.abspath(os.path.join(__file__, ".."))
+     job_cfg = OmegaConf.load(os.path.join(root_configs_dir, config_file))
+     return job_cfg
+
+
+ cfg = get_config()
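For reference, the returned OmegaConf object mirrors the YAML nesting with attribute access, which is how the rest of the repo consumes it:

    from configs.config import cfg, get_config

    print(cfg.data.result_file)  # "results.csv"
    print(cfg.day_get_rank)      # the rank cutoff from configs/base.yaml
    app_cfg = get_config("app_configs")  # presumably how configs/app_configs.yaml is meant to be loaded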
configs/constants.py ADDED
@@ -0,0 +1,16 @@
+ import os
+
+
+ DATA_ROOT = os.path.abspath(os.path.join(__file__, "../..", "data"))
+
+ # MODEL
+ SUPPORT_MODEL = (
+     "LogisticRegression",
+     "DecisionTreeClassifier",
+     "MLPClassifier",
+     "RandomForestClassifier",
+     "LGBMClassifier",
+     "XGBClassifier",
+     "GradientBoostingClassifier"
+ )
+ DEFAULT_MODEL = "GradientBoostingClassifier"
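`DATA_ROOT` climbs two levels up from `configs/constants.py` before descending into `data/`; a quick sketch of the resolution (the `/repo` prefix is illustrative):

    import os

    path = os.path.join("/repo/configs/constants.py", "../..", "data")
    print(os.path.abspath(path))  # /repo/data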
data/fifa_ranking-2022-10-06.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/results.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/table_match.pkl ADDED
Binary file (1.89 kB). View file
 
ml/__init__.py ADDED
File without changes
ml/data_prepare.py ADDED
@@ -0,0 +1,255 @@
+ """
+ The data preparation is based on
+ https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml
+ """
+ import os.path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from configs.config import cfg
+ from configs.constants import DATA_ROOT
+
+
+ def result_finder(home, away):
+     """
+     Encode a match outcome from the two scores.
+     :param home: goals scored by the home team
+     :param away: goals scored by the away team
+     :return: pd.Series of [result, home_points, away_points]; result is
+              0 (home win), 1 (away win) or 2 (draw), points follow 3/1/0
+     """
+     if home > away:
+         return pd.Series([0, 3, 0])
+     elif home < away:
+         return pd.Series([1, 0, 3])
+     else:
+         return pd.Series([2, 1, 1])
+
+
+ def create_dataset(df: pd.DataFrame):
+     """
+     Split the model dataframe into train and test sets.
+     :param df: model dataframe; the first three columns (teams, target) are excluded from the features
+     :return: x_train, x_test, y_train, y_test
+     """
+     x_, y = df.iloc[:, 3:], df[["target"]]
+     x_train, x_test, y_train, y_test = train_test_split(
+         x_, y, test_size=0.22, random_state=100)
+     return x_train, x_test, y_train, y_test
+
+
+ def data_preparing():
+     """
+     Load the match results and FIFA rankings, engineer the features and
+     build the dataframe the model is trained on.
+     :return: (base_df, model_db)
+     """
+     try:
+         df = pd.read_csv(cfg.data.result_url)
+     except Exception as e:
+         # fall back to the local copy when the download fails
+         print(e)
+         df = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.result_file))
+     df["date"] = pd.to_datetime(df["date"])
+     df.dropna(inplace=True)
+     df = df[(df["date"] >= cfg.day_get_result)].reset_index(drop=True)
+
+     # Rank data preparation
+     rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
+     rank["rank_date"] = pd.to_datetime(rank["rank_date"])
+     rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True)
+     rank["country_full"] = rank["country_full"].str.replace(
+         "IR Iran", "Iran").str.replace(
+         "Korea Republic", "South Korea").str.replace(
+         "USA", "United States")
+
+     # The rankings are resampled to daily frequency and forward-filled so that
+     # every match date can be merged with the ranking valid on that day.
+     rank = rank.set_index(["rank_date"]).groupby(
+         ["country_full"], group_keys=False).resample("D").first().fillna(
+         method="ffill").reset_index()
+     df_wc_ranked = df.merge(
+         rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
+         left_on=["date", "home_team"],
+         right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"], axis=1)
+
+     df_wc_ranked = df_wc_ranked.merge(
+         rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
+         left_on=["date", "away_team"],
+         right_on=["rank_date", "country_full"],
+         suffixes=("_home", "_away")).drop(["rank_date", "country_full"], axis=1)
+
+     # Feature engineering
+     df = df_wc_ranked
+
+     df[["result", "home_team_points", "away_team_points"]] = df.apply(
+         lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)
+
+     # Columns that help in the creation of the features: ranking difference,
+     # points won at the game vs. the rank of the team faced, and goal
+     # difference in the game. All features that are not differences are
+     # created for both teams (away and home).
+     df["rank_dif"] = df["rank_home"] - df["rank_away"]
+     df["sg"] = df["home_score"] - df["away_score"]
+     df["points_home_by_rank"] = df["home_team_points"] / df["rank_away"]
+     df["points_away_by_rank"] = df["away_team_points"] / df["rank_home"]
+
+     # To create the features, the dataset is split into a home-team and an
+     # away-team dataset, unified to compute the past-game values, then split
+     # and merged again to recover the original layout. This speeds up the
+     # creation of the features.
+     home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away", "rank_change_home",
+                     "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]]
+
+     away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home", "rank_change_away",
+                     "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]]
+     home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf")
+                          for h in home_team.columns]
+
+     away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf")
+                          for a in away_team.columns]
+     team_stats = pd.concat([home_team, away_team])
+
+     stats_val = []
+
+     for index, row in team_stats.iterrows():
+         team = row["team"]
+         date = row["date"]
+         past_games = team_stats.loc[
+             (team_stats["team"] == team) & (team_stats["date"] < date)
+         ].sort_values(by=["date"], ascending=False)
+         last5 = past_games.head(5)
+
+         goals = past_games["score"].mean()
+         goals_l5 = last5["score"].mean()
+
+         goals_suf = past_games["suf_score"].mean()
+         goals_suf_l5 = last5["suf_score"].mean()
+
+         rank = past_games["rank_suf"].mean()
+         rank_l5 = last5["rank_suf"].mean()
+
+         if len(last5) > 0:
+             # amount of ranking points earned over the period
+             points = past_games["total_points"].values[0] - past_games["total_points"].values[-1]
+             points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1]
+         else:
+             points = 0
+             points_l5 = 0
+
+         gp = past_games["team_points"].mean()
+         gp_l5 = last5["team_points"].mean()
+
+         gp_rank = past_games["points_by_rank"].mean()
+         gp_rank_l5 = last5["points_by_rank"].mean()
+
+         stats_val.append(
+             [goals, goals_l5, goals_suf, goals_suf_l5, rank, rank_l5, points, points_l5, gp, gp_l5, gp_rank,
+              gp_rank_l5])
+
+     stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5",
+                   "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5",
+                   "game_points_rank_mean", "game_points_rank_mean_l5"]
+
+     stats_df = pd.DataFrame(stats_val, columns=stats_cols)
+
+     full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)
+
+     home_team_stats = full_df.iloc[:int(full_df.shape[0] / 2), :]
+     away_team_stats = full_df.iloc[int(full_df.shape[0] / 2):, :]
+
+     home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
+     away_team_stats = away_team_stats[away_team_stats.columns[-12:]]
+
+     home_team_stats.columns = ['home_' + str(col) for col in home_team_stats.columns]
+     away_team_stats.columns = ['away_' + str(col) for col in away_team_stats.columns]
+
+     # To unify the database, home and away suffixes are added to each column.
+     # After that, the data is ready to be merged.
+     match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+
+     full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+
+     # Flag friendly games (one-hot encoded below)
+     full_df["is_friendly"] = full_df["tournament"].apply(lambda x: find_friendly(x))
+     full_df = pd.get_dummies(full_df, columns=["is_friendly"])
+
+     base_df = full_df[
+         ["date", "home_team", "away_team", "rank_home", "rank_away", "home_score", "away_score", "result",
+          "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
+          'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
+          'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
+          'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
+          'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
+          'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5', 'home_game_points_mean',
+          'home_game_points_mean_l5',
+          'home_game_points_rank_mean', 'home_game_points_rank_mean_l5', 'away_game_points_mean',
+          'away_game_points_mean_l5', 'away_game_points_rank_mean',
+          'away_game_points_rank_mean_l5',
+          'is_friendly_0', 'is_friendly_1']]
+
+     df = base_df.dropna().copy()
+
+     df["target"] = df["result"].apply(lambda x: no_draw(x))
+
+     model_db = create_db(df)
+
+     return df, model_db
+
+
+ def find_friendly(x):
+     """
+     Return 1 when the match is a friendly, 0 otherwise.
+     :param x: tournament name
+     :return: 1 or 0
+     """
+     if x == "Friendly":
+         return 1
+     else:
+         return 0
+
+
+ def create_db(df):
+     """
+     Build the model dataframe with the difference-based features.
+     :param df: feature dataframe produced by data_preparing()
+     :return: dataframe with the columns the model is trained on
+     """
+     columns = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean",
+                "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5",
+                "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5",
+                "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean",
+                "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5",
+                "is_friendly_0", "is_friendly_1"]
+
+     base = df.loc[:, columns]
+     base.loc[:, "goals_dif"] = base["home_goals_mean"] - base["away_goals_mean"]
+     base.loc[:, "goals_dif_l5"] = base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
+     base.loc[:, "goals_suf_dif"] = base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
+     base.loc[:, "goals_suf_dif_l5"] = base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
+     base.loc[:, "goals_per_ranking_dif"] = (base["home_goals_mean"] / base["home_rank_mean"]) - (
+         base["away_goals_mean"] / base["away_rank_mean"])
+     base.loc[:, "dif_rank_agst"] = base["home_rank_mean"] - base["away_rank_mean"]
+     base.loc[:, "dif_rank_agst_l5"] = base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
+     base.loc[:, "dif_points_rank"] = base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
+     base.loc[:, "dif_points_rank_l5"] = base["home_game_points_rank_mean_l5"] - base["away_game_points_rank_mean_l5"]
+
+     model_df = base[
+         ["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5",
+          "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst", "dif_rank_agst_l5",
+          "dif_points_rank", "dif_points_rank_l5", "is_friendly_0", "is_friendly_1"]]
+     return model_df
+
+
+ def no_draw(x):
+     """
+     Collapse the three-way result into a binary target: draws (2) are
+     folded into class 1 (not a home win); home wins stay 0.
+     :param x: result code from result_finder()
+     :return: 0 or 1
+     """
+     if x == 2:
+         return 1
+     else:
+         return x
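For orientation, the outcome encoding used above behaves as follows (a quick sketch):

    from ml.data_prepare import result_finder, no_draw

    result_finder(2, 1)  # Series([0, 3, 0]): home win, home team earns 3 points
    result_finder(1, 1)  # Series([2, 1, 1]): draw, one point each
    no_draw(2)           # 1 -> draws are folded into the "no home win" class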
ml/model.py ADDED
@@ -0,0 +1,246 @@
+ import time
+ from typing import Text
+
+ import lightgbm as lgb
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import xgboost as xgb
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, classification_report
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.neural_network import MLPClassifier
+ from sklearn.tree import DecisionTreeClassifier
+
+ from configs.constants import SUPPORT_MODEL, DEFAULT_MODEL
+ from ml.data_prepare import data_preparing, create_dataset
+
+
+ def plot_roc_cur(fper, tper):
+     """
+     Plot the ROC curve.
+     :param fper: false positive rates
+     :param tper: true positive rates
+     """
+     plt.plot(fper, tper, color='orange', label='ROC')
+     plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
+     plt.xlabel('False Positive Rate')
+     plt.ylabel('True Positive Rate')
+     plt.title('Receiver Operating Characteristic (ROC) Curve')
+     plt.legend()
+     plt.show()
+
+
+ class MLModel:
+     """
+     World Cup predictor model: a classifier wrapped in a grid search.
+     """
+
+     def __init__(self, model_type: Text):
+         assert model_type in SUPPORT_MODEL, \
+             "Unsupported model type. Please choose one of {}".format(SUPPORT_MODEL)
+         self.model_type = model_type
+         if self.model_type == "LogisticRegression":
+             self.model = self.get_logistic_regression_model()
+         elif self.model_type == "DecisionTreeClassifier":
+             self.model = self.get_decision_tree_model()
+         elif self.model_type == "MLPClassifier":
+             self.model = self.get_neural_network_model()
+         elif self.model_type == "RandomForestClassifier":
+             self.model = self.get_random_forest_model()
+         elif self.model_type == "GradientBoostingClassifier":
+             self.model = self.get_gradient_boosting_model()
+         elif self.model_type == "LGBMClassifier":
+             self.model = self.get_light_gbm_model()
+         elif self.model_type == "XGBClassifier":
+             self.model = self.get_xgboost_model()
+
+     def predict_proba(self, x):
+         """
+         Call predict_proba on the estimator with the best found parameters.
+         :return: class probabilities
+         """
+         return self.model.predict_proba(x)
+
+     @staticmethod
+     def __run_model(model, x_train, y_train, x_test, y_test, verbose=True):
+         t0 = time.time()
+         if verbose is False:
+             # only valid for estimators whose fit() accepts a `verbose` argument
+             model.fit(x_train.values, np.ravel(y_train), verbose=0)
+         else:
+             model.fit(x_train.values, np.ravel(y_train))
+         model = model.best_estimator_
+         y_pred = model.predict(x_test.values)
+         accuracy = accuracy_score(y_test.values, y_pred)
+         roc_auc = roc_auc_score(y_test, model.predict_proba(x_test.values)[:, 1])
+         coh_kap = cohen_kappa_score(y_test, y_pred)
+         time_taken = time.time() - t0
+         print("Accuracy : {}".format(accuracy))
+         print("ROC Area under Curve : {}".format(roc_auc))
+         print("Cohen's Kappa : {}".format(coh_kap))
+         print("Time taken : {}".format(time_taken))
+         print(classification_report(y_test, y_pred, digits=5))
+
+         return model, accuracy, roc_auc, coh_kap, time_taken
+
+     @staticmethod
+     def get_logistic_regression_model(**params_lr):
+         """
+         Return a logistic regression model wrapped in a grid search.
+         :return: GridSearchCV over LogisticRegression
+         """
+         if not all(params_lr.values()):
+             params_lr = {
+                 "C": np.logspace(-3, 3, 7),
+                 "penalty": ["l1", "l2"],
+                 "solver": ["liblinear"]
+             }
+
+         model_lr = LogisticRegression()
+         model_lr = GridSearchCV(model_lr, params_lr, cv=3, verbose=False, scoring='roc_auc', refit=True)
+         return model_lr
+
+     @staticmethod
+     def get_decision_tree_model(**params):
+         """
+         Return a decision tree model wrapped in a grid search.
+         :return: GridSearchCV over DecisionTreeClassifier
+         """
+         if not all(params.values()):
+             params = {'max_features': ['auto', 'sqrt', 'log2'],
+                       'ccp_alpha': [0.1, .01, .001],
+                       'max_depth': [5, 6, 7, 8, 9],
+                       'criterion': ['gini', 'entropy']
+                       }
+
+         model = DecisionTreeClassifier()
+         model = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=False, scoring='roc_auc', refit=True)
+         return model
+
+     @staticmethod
+     def get_neural_network_model(**params_nn):
+         """
+         Return a neural network model wrapped in a grid search.
+         :return: GridSearchCV over MLPClassifier
+         """
+         if not all(params_nn.values()):
+             params_nn = {'solver': ['lbfgs'],
+                          'max_iter': [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
+                          'alpha': 10.0 ** -np.arange(1, 10),
+                          'hidden_layer_sizes': np.arange(10, 15),
+                          'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
+
+         model_nn = MLPClassifier()
+         model_nn = GridSearchCV(model_nn, params_nn, n_jobs=-1, scoring='roc_auc', refit=True, verbose=False)
+         return model_nn
+
+     @staticmethod
+     def get_random_forest_model(**params_rf):
+         """
+         Return a random forest model wrapped in a grid search.
+         :return: GridSearchCV over RandomForestClassifier
+         """
+         if not all(params_rf.values()):
+             params_rf = {"max_depth": [20],
+                          "min_samples_split": [10],
+                          "max_leaf_nodes": [175],
+                          "min_samples_leaf": [5],
+                          "n_estimators": [250],
+                          "max_features": ["sqrt"],
+                          }
+
+         model_rf = RandomForestClassifier()
+         model_rf = GridSearchCV(model_rf, params_rf, cv=3, n_jobs=-1, verbose=False, scoring='roc_auc', refit=True)
+
+         return model_rf
+
+     @staticmethod
+     def get_light_gbm_model(**params_lgb):
+         """
+         Return a LightGBM model wrapped in a grid search.
+         :return: GridSearchCV over LGBMClassifier
+         """
+         if not all(params_lgb.values()):
+             params_lgb = {
+                 'learning_rate': [0.005, 0.01],
+                 'n_estimators': [8, 16, 24],
+                 'num_leaves': [6, 8, 12, 16],  # a large num_leaves helps accuracy but may overfit
+                 'boosting_type': ['gbdt', 'dart'],  # for better accuracy, try dart
+                 'objective': ['binary'],
+                 'max_bin': [255, 510],  # a large max_bin helps accuracy but slows down training
+                 'random_state': [500],
+                 'colsample_bytree': [0.64, 0.65, 0.66],
+                 'subsample': [0.7, 0.75],
+                 'reg_alpha': [1, 1.2],
+                 'reg_lambda': [1, 1.2, 1.4],
+             }
+
+         model = lgb.LGBMClassifier()
+         model = GridSearchCV(model, params_lgb, verbose=False, cv=3, n_jobs=-1, scoring='roc_auc', refit=True)
+
+         return model
+
+     @staticmethod
+     def get_xgboost_model(**params_xgb):
+         """
+         Return an XGBoost model wrapped in a grid search.
+         :return: GridSearchCV over XGBClassifier
+         """
+         if not all(params_xgb.values()):
+             params_xgb = {
+                 'nthread': [4],  # with hyper-threading, xgboost may become slower
+                 'objective': ['binary:logistic'],
+                 'learning_rate': [0.05],  # the so-called `eta` value
+                 'max_depth': [6],
+                 'min_child_weight': [11],
+                 'silent': [1],
+                 'subsample': [0.8],
+                 'colsample_bytree': [0.7],
+                 'n_estimators': [100],  # number of trees; raise to 1000 for better results
+                 'missing': [-999],
+                 'seed': [1337]
+             }
+         model = GridSearchCV(xgb.XGBClassifier(), params_xgb, n_jobs=-1,
+                              cv=3,
+                              scoring='roc_auc',
+                              refit=True)
+
+         return model
+
+     def fit_and_eval_model(self, x_train, x_test, y_train, y_test):
+         """
+         Fit the wrapped grid search and print evaluation metrics on the test set.
+         :param x_train: training features
+         :param x_test: test features
+         :param y_train: training targets
+         :param y_test: test targets
+         :return: (best_estimator, accuracy, roc_auc, cohen_kappa, time_taken)
+         """
+         fitted_model, accuracy, roc_auc, coh_kap, time_taken = \
+             self.__run_model(self.model, x_train, y_train, x_test, y_test)
+         return fitted_model, accuracy, roc_auc, coh_kap, time_taken
+
+     @staticmethod
+     def get_gradient_boosting_model(**params):
+         """
+         Return a gradient boosting model wrapped in a grid search.
+         :param params: optional parameter grid overriding the default
+         :return: GridSearchCV over GradientBoostingClassifier
+         """
+         if not all(params.values()):
+             params = {"learning_rate": [0.01, 0.02, 0.03],
+                       "min_samples_split": [5, 10],
+                       "min_samples_leaf": [3, 5],
+                       "max_depth": [3, 5, 10],
+                       "max_features": ["sqrt"],
+                       "n_estimators": [100, 200]
+                       }
+         model = GradientBoostingClassifier(random_state=100)
+         return GridSearchCV(model, params, cv=3, n_jobs=-1)
+
+
+ base_df, data_df = data_preparing()
+ x_train, x_test, y_train, y_test = create_dataset(data_df)
+ ml_model = MLModel(DEFAULT_MODEL)
+ ml_model.fit_and_eval_model(x_train, x_test, y_train, y_test)
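Because every supported classifier is wrapped in the same GridSearchCV interface, swapping models is a one-line change. A minimal sketch (note that importing ml.model already runs the data preparation and trains the default model once):

    from ml.model import MLModel, x_train, x_test, y_train, y_test

    alt_model = MLModel("XGBClassifier")  # any name from SUPPORT_MODEL
    alt_model.fit_and_eval_model(x_train, x_test, y_train, y_test)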
ml/predictor.py ADDED
@@ -0,0 +1,278 @@
+ import os.path
+ from operator import itemgetter
+ from typing import Text, Tuple
+
+ import numpy as np
+ import pandas as pd
+
+ from configs.config import cfg
+ from configs.constants import DATA_ROOT
+ from ml.model import MLModel
+ from ml.utils import load_pickle
+
+
+ class Predictor:
+     """
+     A match predictor using ML
+     """
+
+     def __init__(self, base_df: pd.DataFrame, model: MLModel):
+         self.model = model
+         self.base_df = base_df
+
+     def find_stats(self, team):
+         """
+         Collect the latest rolling statistics for a team from its last game.
+         :param team: name of the team, e.g. Qatar
+         :return: list of the nine features describing the team
+         """
+         last_game = self.base_df[(self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)].tail(1)
+
+         if last_game["home_team"].values[0] == team:
+             team_rank = last_game["rank_home"].values[0]
+             team_goals = last_game["home_goals_mean"].values[0]
+             team_goals_l5 = last_game["home_goals_mean_l5"].values[0]
+             team_goals_suf = last_game["home_goals_suf_mean"].values[0]
+             team_goals_suf_l5 = last_game["home_goals_suf_mean_l5"].values[0]
+             team_rank_suf = last_game["home_rank_mean"].values[0]
+             team_rank_suf_l5 = last_game["home_rank_mean_l5"].values[0]
+             team_gp_rank = last_game["home_game_points_rank_mean"].values[0]
+             team_gp_rank_l5 = last_game["home_game_points_rank_mean_l5"].values[0]
+         else:
+             team_rank = last_game["rank_away"].values[0]
+             team_goals = last_game["away_goals_mean"].values[0]
+             team_goals_l5 = last_game["away_goals_mean_l5"].values[0]
+             team_goals_suf = last_game["away_goals_suf_mean"].values[0]
+             team_goals_suf_l5 = last_game["away_goals_suf_mean_l5"].values[0]
+             team_rank_suf = last_game["away_rank_mean"].values[0]
+             team_rank_suf_l5 = last_game["away_rank_mean_l5"].values[0]
+             team_gp_rank = last_game["away_game_points_rank_mean"].values[0]
+             team_gp_rank_l5 = last_game["away_game_points_rank_mean_l5"].values[0]
+
+         return [team_rank, team_goals, team_goals_l5, team_goals_suf, team_goals_suf_l5, team_rank_suf,
+                 team_rank_suf_l5, team_gp_rank, team_gp_rank_l5]
+
+     @staticmethod
+     def find_features(team_1, team_2):
+         """
+         Build the difference features for a game, treating team_1 as home.
+         :param team_1: stats list from find_stats() for the first team
+         :param team_2: stats list from find_stats() for the second team
+         :return: feature vector in the order the model was trained on
+         """
+         rank_dif = team_1[0] - team_2[0]
+         goals_dif = team_1[1] - team_2[1]
+         goals_dif_l5 = team_1[2] - team_2[2]
+         goals_suf_dif = team_1[3] - team_2[3]
+         goals_suf_dif_l5 = team_1[4] - team_2[4]
+         goals_per_ranking_dif = (team_1[1] / team_1[5]) - (team_2[1] / team_2[5])
+         dif_rank_agst = team_1[5] - team_2[5]
+         dif_rank_agst_l5 = team_1[6] - team_2[6]
+         dif_gp_rank = team_1[7] - team_2[7]
+         dif_gp_rank_l5 = team_1[8] - team_2[8]
+
+         # the trailing 1, 0 are the is_friendly_0 / is_friendly_1 dummies
+         # (predicted games are treated as non-friendly)
+         return [rank_dif, goals_dif, goals_dif_l5, goals_suf_dif, goals_suf_dif_l5, goals_per_ranking_dif,
+                 dif_rank_agst, dif_rank_agst_l5, dif_gp_rank, dif_gp_rank_l5, 1, 0]
+
+     def __predict(self, team_1: Text, team_2: Text):
+         team_1_stat = self.find_stats(team_1)
+         team_2_stat = self.find_stats(team_2)
+
+         # predict the game twice with the teams swapped and average the
+         # probabilities, so the home-side encoding does not bias the result
+         features_g1 = self.find_features(team_1_stat, team_2_stat)
+         features_g2 = self.find_features(team_2_stat, team_1_stat)
+
+         probs_g1 = self.model.predict_proba([features_g1])
+         probs_g2 = self.model.predict_proba([features_g2])
+         team_1_prob_g1 = probs_g1[0][0]
+         team_1_prob_g2 = probs_g2[0][1]
+         team_2_prob_g1 = probs_g1[0][1]
+         team_2_prob_g2 = probs_g2[0][0]
+
+         team_1_prob = (probs_g1[0][0] + probs_g2[0][1]) / 2
+         team_2_prob = (probs_g2[0][0] + probs_g1[0][1]) / 2
+
+         return team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2
+
+     def predict(self, team_1: Text, team_2: Text) -> Tuple[bool, Text, float]:
+         """
+         Predict the outcome of a single game.
+         :param team_1: name of the first team
+         :param team_2: name of the second team
+         :return: (draw, winner, winner_proba); winner is "" when it is a draw
+         """
+         draw = False
+         team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
+             team_1, team_2)
+         winner, winner_proba = "", 0.0
+         # when the two swapped predictions disagree about the winner,
+         # the game is called a draw
+         if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
+                 (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+             draw = True
+
+         elif team_1_prob > team_2_prob:
+             winner = team_1
+             winner_proba = team_1_prob
+
+         elif team_2_prob > team_1_prob:
+             winner = team_2
+             winner_proba = team_2_prob
+         return draw, winner, winner_proba
+
+     def predict_all_matches(self) -> Text:
+         """
+         Predict all the matches in the tournament: the group stage first,
+         then the knockout rounds built from the group results.
+         :return: a human-readable report of the whole simulation
+         """
+         result = ""
+         data = load_pickle(os.path.join(DATA_ROOT, cfg.data.table_matches))
+         table = data['table']
+         matches = data['matches']
+         advanced_group, last_group = [], ""
+
+         for teams in matches:
+             draw = False
+             team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
+                 teams[1], teams[2])
+             winner, winner_proba = "", 0.0
+             if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
+                     (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+                 draw = True
+                 for i in table[teams[0]]:
+                     if i[0] == teams[1] or i[0] == teams[2]:
+                         i[1] += 1
+
+             elif team_1_prob > team_2_prob:
+                 winner = teams[1]
+                 winner_proba = team_1_prob
+                 for i in table[teams[0]]:
+                     if i[0] == teams[1]:
+                         i[1] += 3
+
+             elif team_2_prob > team_1_prob:
+                 winner = teams[2]
+                 winner_proba = team_2_prob
+                 for i in table[teams[0]]:
+                     if i[0] == teams[2]:
+                         i[1] += 3
+
+             for i in table[teams[0]]:  # adding tiebreaker (probs per game)
+                 if i[0] == teams[1]:
+                     i[2].append(team_1_prob)
+                 if i[0] == teams[2]:
+                     i[2].append(team_2_prob)
+
+             if last_group != teams[0]:
+                 if last_group != "":
+                     result += "\n"
+                     result += "Group %s advanced: \n" % last_group
+                     for i in table[last_group]:  # adding tiebreaker
+                         i[2] = np.mean(i[2])
+
+                     final_points = table[last_group]
+                     final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
+                     advanced_group.append([final_table[0][0], final_table[1][0]])
+                     for i in final_table:
+                         result += "%s -------- %d\n" % (i[0], i[1])
+                 result += "\n"
+                 result += "-" * 10 + " Starting Analysis for Group %s " % (teams[0]) + "-" * 10 + "\n"
+
+             if draw is False:
+                 result += "Group %s - %s vs. %s: Winner %s with %.2f probability\n" % (
+                     teams[0], teams[1], teams[2], winner, winner_proba)
+             else:
+                 result += "Group %s - %s vs. %s: Draw\n" % (teams[0], teams[1], teams[2])
+             last_group = teams[0]
+         result += "\n"
+         result += "Group %s advanced: \n" % last_group
+
+         for i in table[last_group]:  # adding tiebreaker
+             i[2] = np.mean(i[2])
+
+         final_points = table[last_group]
+         final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
+         advanced_group.append([final_table[0][0], final_table[1][0]])
+         for i in final_table:
+             result += "%s -------- %d\n" % (i[0], i[1])
+
+         advanced = advanced_group
+         playoffs = {"Round of 16": [], "Quarter-Final": [], "Semi-Final": [], "Final": []}
+
+         actual_round = ""
+         next_rounds = []
+
+         for p in playoffs.keys():
+             if p == "Round of 16":
+                 # build the bracket: group winners face the runners-up of the paired group
+                 control = []
+                 for a in range(0, len(advanced * 2), 1):
+                     if a < len(advanced):
+                         if a % 2 == 0:
+                             control.append((advanced * 2)[a][0])
+                         else:
+                             control.append((advanced * 2)[a][1])
+                     else:
+                         if a % 2 == 0:
+                             control.append((advanced * 2)[a][1])
+                         else:
+                             control.append((advanced * 2)[a][0])
+                 playoffs[p] = [[control[c], control[c + 1]] for c in range(0, len(control) - 1, 1) if c % 2 == 0]
+
+                 for i in range(0, len(playoffs[p]), 1):
+                     game = playoffs[p][i]
+
+                     home = game[0]
+                     away = game[1]
+
+                     team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
+                         self.__predict(home, away)
+                     if actual_round != p:
+                         result += "-" * 10 + "\n"
+                         result += "Starting simulation of %s\n" % p
+                         result += "-" * 10 + "\n"
+
+                     if team_1_prob < team_2_prob:
+                         result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, away, team_2_prob)
+                         next_rounds.append(away)
+                     else:
+                         result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, home, team_1_prob)
+                         next_rounds.append(home)
+
+                     game.append([team_1_prob, team_2_prob])
+                     playoffs[p][i] = game
+                     actual_round = p
+
+             else:
+                 # pair the winners of the previous round
+                 playoffs[p] = [[next_rounds[c], next_rounds[c + 1]] for c in range(0, len(next_rounds) - 1, 1) if
+                                c % 2 == 0]
+                 next_rounds = []
+                 for i in range(0, len(playoffs[p])):
+                     game = playoffs[p][i]
+                     home = game[0]
+                     away = game[1]
+
+                     team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
+                         self.__predict(home, away)
+                     if actual_round != p:
+                         result += "-" * 10 + "\n"
+                         result += "Starting simulation of %s\n" % p
+                         result += "-" * 10 + "\n"
+
+                     if team_1_prob < team_2_prob:
+                         result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, away, team_2_prob)
+                         next_rounds.append(away)
+                     else:
+                         result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, home, team_1_prob)
+                         next_rounds.append(home)
+                     game.append([team_1_prob, team_2_prob])
+                     playoffs[p][i] = game
+                     actual_round = p
+
+         print(result)
+         return result
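For reference, a minimal sketch of driving the predictor directly, mirroring what app.py does (the team names are illustrative and must occur in the prepared data):

    from ml.model import base_df, ml_model
    from ml.predictor import Predictor

    predictor = Predictor(base_df, ml_model)
    draw, winner, winner_proba = predictor.predict("Brazil", "France")
    report = predictor.predict_all_matches()  # full group stage + knockout simulation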
ml/utils.py ADDED
@@ -0,0 +1,36 @@
+ import pickle
+
+
+ def write_pickle(path, obj):
+     """
+     Pickle an object to disk.
+
+     Args:
+         path: the path of the *.pkl file to write
+         obj: the object to pickle
+
+     Returns:
+         True on success, False otherwise.
+     """
+     try:
+         with open(path, 'wb') as handle:
+             pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
+         return True
+     except Exception as e:
+         print(e)
+         return False
+
+
+ def load_pickle(path):
+     """
+     Load a pickled object from disk.
+
+     Args:
+         path: the path of the *.pkl file to read
+
+     Returns:
+         The unpickled object.
+     """
+     with open(path, 'rb') as handle:
+         data = pickle.load(handle)
+     return data
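A quick round-trip sketch of these helpers (the file name is illustrative):

    from ml.utils import write_pickle, load_pickle

    if write_pickle("example.pkl", {"table": {}, "matches": []}):
        data = load_pickle("example.pkl")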
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ lightgbm~=3.3.3
+ matplotlib~=3.6.2
+ pandas~=1.5.1
+ xgboost~=1.7.1
+ scikit-learn~=1.1.3
+ omegaconf~=2.2.3
+ numpy~=1.23.5
+ Flask~=2.2.2
+ gradio~=3.10.1