Spaces:

phongdtd
/

WC2022_predictor

Build error

WC2022_predictor / ml /data_prepare.py

phong.dao

init app

9e6c24e over 2 years ago

12.1 kB

	"""
	The data processing is based on https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml
	"""
	import os.path

	import pandas as pd
	from sklearn.model_selection import train_test_split

	from configs.config import cfg
	from configs.constants import DATA_ROOT


	def result_finder(home, away):
	    """
	    Encode the outcome of one match from its two scores.

	    The returned series holds, in order: the result code (0 = home win,
	    1 = away win, 2 = anything else, i.e. a draw), the points credited to
	    the home team and the points credited to the away team.

	    :param home: goals scored by the home team
	    :param away: goals scored by the away team
	    :return: ``pd.Series`` of ``[result, home_points, away_points]``
	    """
	    if home > away:
	        encoded = [0, 3, 0]
	    elif home < away:
	        encoded = [1, 0, 3]
	    else:
	        encoded = [2, 1, 1]
	    return pd.Series(encoded)


	def create_dataset(df: pd.DataFrame):
	    """
	    Split a model table into train and test parts.

	    Every column from the fourth one onwards is used as a feature and the
	    ``target`` column is used as the label. The split keeps 22% of the rows
	    for testing and is seeded with ``random_state=100``.

	    :param df: table containing a ``target`` column and feature columns
	        starting at position 3
	    :return: ``(x_train, x_test, y_train, y_test)``
	    """
	    features = df.iloc[:, 3:]
	    labels = df[["target"]]
	    return tuple(
	        train_test_split(features, labels, test_size=0.22, random_state=100)
	    )


	def _load_results():
	    """
	    Load the match results and keep matches on or after ``cfg.day_get_result``.

	    The CSV at ``cfg.data.result_url`` is tried first; if reading it fails the
	    error is printed and the local file under ``DATA_ROOT`` is read instead.

	    :return: results frame with a datetime ``date`` column, rows containing
	        NaN removed and a fresh 0..n-1 index
	    """
	    try:
	        results = pd.read_csv(cfg.data.result_url)
	    except Exception as e:
	        # Deliberate best effort: whatever goes wrong with the remote source,
	        # report it and fall back to the local copy.
	        print(e)
	        results = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.result_file))
	    results["date"] = pd.to_datetime(results["date"])
	    results = results.dropna()
	    return results[results["date"] >= cfg.day_get_result].reset_index(drop=True)


	def _load_rankings():
	    """
	    Load the ranking file and expand it to one row per country per day.

	    :return: ranking frame with ``rank_date`` back as a regular column
	    """
	    rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
	    rank["rank_date"] = pd.to_datetime(rank["rank_date"])
	    rank = rank[rank["rank_date"] >= cfg.day_get_rank].reset_index(drop=True)
	    # Rename three countries so they match the names used in the results
	    # file. The patterns are plain text, so regex matching is switched off
	    # explicitly instead of relying on the pandas-version-dependent default.
	    rank["country_full"] = (
	        rank["country_full"]
	        .str.replace("IR Iran", "Iran", regex=False)
	        .str.replace("Korea Republic", "South Korea", regex=False)
	        .str.replace("USA", "United States", regex=False)
	    )
	    # Resample each country to daily frequency and carry the last known
	    # values forward, so a match on any date can be joined to a ranking row.
	    return (
	        rank.set_index(["rank_date"])
	        .groupby(["country_full"], group_keys=False)
	        .resample("D")
	        .first()
	        .ffill()
	        .reset_index()
	    )


	def _neutral_column_names(columns, own, other):
	    """
	    Make side-specific column names side-neutral.

	    The ``own`` marker (``"home"`` or ``"away"``) is removed and the ``other``
	    marker is replaced by ``suf``, e.g. with ``own="home"``:
	    ``home_score -> score``, ``away_score -> suf_score``, ``rank_away -> rank_suf``.

	    :param columns: iterable of column names
	    :param own: marker of the team the rows describe
	    :param other: marker of the opposing team
	    :return: list of renamed columns, in the same order
	    """
	    return [
	        col.replace(own + "_", "")
	        .replace("_" + own, "")
	        .replace(other + "_", "suf_")
	        .replace("_" + other, "_suf")
	        for col in columns
	    ]


	def _team_form_stats(team_stats):
	    """
	    Compute, for every row of ``team_stats``, aggregates over the earlier
	    matches of the same team (all of them, and the five most recent).

	    :param team_stats: side-neutral per-team match rows
	    :return: frame with one row per input row, in the same order, and a
	        fresh 0..n-1 index
	    """
	    stats_cols = [
	        "goals_mean",
	        "goals_mean_l5",
	        "goals_suf_mean",
	        "goals_suf_mean_l5",
	        "rank_mean",
	        "rank_mean_l5",
	        "points_mean",
	        "points_mean_l5",
	        "game_points_mean",
	        "game_points_mean_l5",
	        "game_points_rank_mean",
	        "game_points_rank_mean_l5",
	    ]
	    stats_val = []

	    for _, row in team_stats.iterrows():
	        # Strictly earlier matches of this team, newest first.
	        past_games = team_stats.loc[
	            (team_stats["team"] == row["team"])
	            & (team_stats["date"] < row["date"])
	        ].sort_values(by=["date"], ascending=False)
	        last5 = past_games.head(5)

	        if last5.empty:
	            points = 0
	            points_l5 = 0
	        else:
	            # Amount of points earned: newest total minus oldest total.
	            points = (
	                past_games["total_points"].values[0]
	                - past_games["total_points"].values[-1]
	            )
	            points_l5 = (
	                last5["total_points"].values[0] - last5["total_points"].values[-1]
	            )

	        # Order must match ``stats_cols``.
	        stats_val.append(
	            [
	                past_games["score"].mean(),
	                last5["score"].mean(),
	                past_games["suf_score"].mean(),
	                last5["suf_score"].mean(),
	                past_games["rank_suf"].mean(),
	                last5["rank_suf"].mean(),
	                points,
	                points_l5,
	                past_games["team_points"].mean(),
	                last5["team_points"].mean(),
	                past_games["points_by_rank"].mean(),
	                last5["points_by_rank"].mean(),
	            ]
	        )

	    return pd.DataFrame(stats_val, columns=stats_cols)


	def data_preparing():
	    """
	    Prepare the match data used for training.

	    Loads results and rankings, joins the ranking of both teams onto every
	    match, encodes the result, derives per-team statistics from earlier
	    matches, flags friendly matches and finally builds the model table.

	    :return: tuple ``(df, model_db)`` where ``df`` is the selected feature
	        table (rows with NaN dropped, ``target`` column added) and
	        ``model_db`` is the result of ``create_db(df)``
	    """
	    df = _load_results()

	    # RANK data prepare
	    rank = _load_rankings()

	    # The merge is made in order to get a dataset of games and the rankings
	    # of both teams on the day of the game.
	    rank_cols = [
	        "country_full",
	        "total_points",
	        "previous_points",
	        "rank",
	        "rank_change",
	        "rank_date",
	    ]
	    df_wc_ranked = df.merge(
	        rank[rank_cols],
	        left_on=["date", "home_team"],
	        right_on=["rank_date", "country_full"],
	    ).drop(["rank_date", "country_full"], axis=1)

	    # The second merge adds the same ranking columns again, so the clashes
	    # are resolved with the _home / _away suffixes.
	    df_wc_ranked = df_wc_ranked.merge(
	        rank[rank_cols],
	        left_on=["date", "away_team"],
	        right_on=["rank_date", "country_full"],
	        suffixes=("_home", "_away"),
	    ).drop(["rank_date", "country_full"], axis=1)

	    # Featuring
	    df = df_wc_ranked

	    df[["result", "home_team_points", "away_team_points"]] = df.apply(
	        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1
	    )

	    # Helper columns for the features: ranking difference, goal difference
	    # and points won at the game divided by the rank of the team faced.
	    df["rank_dif"] = df["rank_home"] - df["rank_away"]
	    df["sg"] = df["home_score"] - df["away_score"]
	    df["points_home_by_rank"] = df["home_team_points"] / df["rank_away"]
	    df["points_away_by_rank"] = df["away_team_points"] / df["rank_home"]

	    # Split the dataset into a home-team view and an away-team view, give both
	    # the same side-neutral column names, stack them and compute the past-game
	    # values per team. Afterwards the two halves are separated again and put
	    # back next to the original matches.
	    # ``.copy()`` makes each view an independent frame before it is renamed.
	    home_team = df[
	        [
	            "date",
	            "home_team",
	            "home_score",
	            "away_score",
	            "rank_home",
	            "rank_away",
	            "rank_change_home",
	            "total_points_home",
	            "result",
	            "rank_dif",
	            "points_home_by_rank",
	            "home_team_points",
	        ]
	    ].copy()
	    away_team = df[
	        [
	            "date",
	            "away_team",
	            "away_score",
	            "home_score",
	            "rank_away",
	            "rank_home",
	            "rank_change_away",
	            "total_points_away",
	            "result",
	            "rank_dif",
	            "points_away_by_rank",
	            "away_team_points",
	        ]
	    ].copy()
	    home_team.columns = _neutral_column_names(home_team.columns, "home", "away")
	    away_team.columns = _neutral_column_names(away_team.columns, "away", "home")

	    # DataFrame.append was removed in pandas 2.0; concat stacks the rows the
	    # same way (home rows first, then away rows).
	    team_stats = pd.concat([home_team, away_team])

	    stats_df = _team_form_stats(team_stats)

	    # ``stats_df`` follows the row order of ``team_stats``: the first half
	    # belongs to the home teams, the second half to the away teams.
	    half = stats_df.shape[0] // 2
	    home_team_stats = stats_df.iloc[:half, :].add_prefix("home_")
	    away_team_stats = (
	        stats_df.iloc[half:, :].reset_index(drop=True).add_prefix("away_")
	    )

	    # With the home_/away_ prefixes in place the two halves can sit side by
	    # side, one row per match.
	    match_stats = pd.concat(
	        [home_team_stats, away_team_stats],
	        axis=1,
	        ignore_index=False,
	    )

	    full_df = pd.concat(
	        [df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False
	    )

	    # Flag friendly games and one-hot encode the flag.
	    full_df["is_friendly"] = full_df["tournament"].apply(find_friendly)
	    full_df = pd.get_dummies(full_df, columns=["is_friendly"])
	    # get_dummies only creates a column for values that actually occur; make
	    # sure both indicator columns exist so the selection below cannot fail.
	    for flag_col in ("is_friendly_0", "is_friendly_1"):
	        if flag_col not in full_df.columns:
	            full_df[flag_col] = 0

	    base_df = full_df[
	        [
	            "date",
	            "home_team",
	            "away_team",
	            "rank_home",
	            "rank_away",
	            "home_score",
	            "away_score",
	            "result",
	            "rank_dif",
	            "rank_change_home",
	            "rank_change_away",
	            "home_goals_mean",
	            "home_goals_mean_l5",
	            "home_goals_suf_mean",
	            "home_goals_suf_mean_l5",
	            "home_rank_mean",
	            "home_rank_mean_l5",
	            "home_points_mean",
	            "home_points_mean_l5",
	            "away_goals_mean",
	            "away_goals_mean_l5",
	            "away_goals_suf_mean",
	            "away_goals_suf_mean_l5",
	            "away_rank_mean",
	            "away_rank_mean_l5",
	            "away_points_mean",
	            "away_points_mean_l5",
	            "home_game_points_mean",
	            "home_game_points_mean_l5",
	            "home_game_points_rank_mean",
	            "home_game_points_rank_mean_l5",
	            "away_game_points_mean",
	            "away_game_points_mean_l5",
	            "away_game_points_rank_mean",
	            "away_game_points_rank_mean_l5",
	            "is_friendly_0",
	            "is_friendly_1",
	        ]
	    ]

	    # Rows without history (no earlier games) contain NaN and are dropped.
	    # The copy makes the new column assignment below unambiguous.
	    df = base_df.dropna().copy()

	    df["target"] = df["result"].apply(no_draw)

	    model_db = create_db(df)

	    return df, model_db


	def find_friendly(x):
	    """
	    Tell whether a tournament name denotes a friendly match.

	    :param x: tournament name
	    :return: 1 when ``x`` equals ``"Friendly"``, otherwise 0
	    """
	    return 1 if x == "Friendly" else 0


	def create_db(df):
	    """
	    Build the model table from the prepared feature table.

	    Keeps the team names, ``target``, ``rank_dif`` and the two friendly
	    indicator columns, and adds home-minus-away difference columns computed
	    from the per-team mean statistics.

	    :param df: feature table containing every column listed in ``needed``
	    :return: new frame with the 15 model columns; ``df`` is not modified
	    """
	    needed = [
	        "home_team",
	        "away_team",
	        "target",
	        "rank_dif",
	        "home_goals_mean",
	        "home_rank_mean",
	        "away_goals_mean",
	        "away_rank_mean",
	        "home_rank_mean_l5",
	        "away_rank_mean_l5",
	        "home_goals_suf_mean",
	        "away_goals_suf_mean",
	        "home_goals_mean_l5",
	        "away_goals_mean_l5",
	        "home_goals_suf_mean_l5",
	        "away_goals_suf_mean_l5",
	        "home_game_points_rank_mean",
	        "home_game_points_rank_mean_l5",
	        "away_game_points_rank_mean",
	        "away_game_points_rank_mean_l5",
	        "is_friendly_0",
	        "is_friendly_1",
	    ]
	    selected = df.loc[:, needed]

	    def gap(stem):
	        # Home value minus away value of the same statistic.
	        return selected["home_" + stem] - selected["away_" + stem]

	    goals_per_rank_home = selected["home_goals_mean"] / selected["home_rank_mean"]
	    goals_per_rank_away = selected["away_goals_mean"] / selected["away_rank_mean"]

	    enriched = selected.assign(
	        goals_dif=gap("goals_mean"),
	        goals_dif_l5=gap("goals_mean_l5"),
	        goals_suf_dif=gap("goals_suf_mean"),
	        goals_suf_dif_l5=gap("goals_suf_mean_l5"),
	        goals_per_ranking_dif=goals_per_rank_home - goals_per_rank_away,
	        dif_rank_agst=gap("rank_mean"),
	        dif_rank_agst_l5=gap("rank_mean_l5"),
	        dif_points_rank=gap("game_points_rank_mean"),
	        dif_points_rank_l5=gap("game_points_rank_mean_l5"),
	    )

	    output_cols = [
	        "home_team",
	        "away_team",
	        "target",
	        "rank_dif",
	        "goals_dif",
	        "goals_dif_l5",
	        "goals_suf_dif",
	        "goals_suf_dif_l5",
	        "goals_per_ranking_dif",
	        "dif_rank_agst",
	        "dif_rank_agst_l5",
	        "dif_points_rank",
	        "dif_points_rank_l5",
	        "is_friendly_0",
	        "is_friendly_1",
	    ]
	    return enriched[output_cols]


	def no_draw(x):
	    """
	    Fold the draw code into code 1.

	    :param x: encoded match result
	    :return: 1 when ``x`` equals 2, otherwise ``x`` unchanged
	    """
	    return 1 if x == 2 else x