Spaces:

scfive
/

socr

Configuration error

File size: 16,924 Bytes

d6ea71e

"""Implements the xT framework."""

import json
import os
from typing import Callable, Optional

import numpy as np
import numpy.typing as npt
import pandas as pd
from pandera.typing import DataFrame, Series
from sklearn.exceptions import NotFittedError

import socceraction.spadl.config as spadlconfig
from socceraction.spadl.schema import SPADLSchema

try:
    from scipy.interpolate import interp2d  # type: ignore
except ImportError:  # pragma: no cover
    interp2d = None

M: int = 12
N: int = 16


def _get_cell_indexes(
    x: Series[float], y: Series[float], l: int = N, w: int = M
) -> tuple[Series[int], Series[int]]:
    xi = x.divide(spadlconfig.field_length).multiply(l)
    yj = y.divide(spadlconfig.field_width).multiply(w)
    xi = xi.astype("int64").clip(0, l - 1)
    yj = yj.astype("int64").clip(0, w - 1)
    return xi, yj


def _get_flat_indexes(x: Series[float], y: Series[float], l: int = N, w: int = M) -> Series[int]:
    xi, yj = _get_cell_indexes(x, y, l, w)
    return yj.rsub(w - 1).mul(l).add(xi)


def _count(x: Series[float], y: Series[float], l: int = N, w: int = M) -> npt.NDArray[np.int_]:
    """Count the number of actions occurring in each cell of the grid.

    Parameters
    ----------
    x : pd.Series
        The x-coordinates of the actions.
    y : pd.Series
        The y-coordinates of the actions.
    l : int
        Amount of grid cells in the x-dimension of the grid.
    w : int
        Amount of grid cells in the y-dimension of the grid.

    Returns
    -------
    np.ndarray
        A matrix, denoting the amount of actions occurring in each cell. The
        top-left corner is the origin.
    """
    x = x[~np.isnan(x) & ~np.isnan(y)]
    y = y[~np.isnan(x) & ~np.isnan(y)]

    flat_indexes = _get_flat_indexes(x, y, l, w)
    vc = flat_indexes.value_counts(sort=False)
    vector = np.zeros(w * l, dtype=int)
    vector[vc.index] = vc
    return vector.reshape((w, l))


def _safe_divide(a: npt.ArrayLike, b: npt.ArrayLike) -> npt.NDArray[np.float64]:
    return np.divide(a, b, out=np.zeros_like(a, dtype="float64"), where=b != 0, casting="unsafe")


def scoring_prob(
    actions: DataFrame[SPADLSchema], l: int = N, w: int = M
) -> npt.NDArray[np.float64]:
    """Compute the probability of scoring when taking a shot for each cell.

    Parameters
    ----------
    actions : pd.DataFrame
        Actions, in SPADL format.
    l : int
        Amount of grid cells in the x-dimension of the grid.
    w : int
        Amount of grid cells in the y-dimension of the grid.

    Returns
    -------
    np.ndarray
        A matrix, denoting the probability of scoring for each cell.
    """
    shot_actions = actions[(actions.type_id == spadlconfig.actiontypes.index("shot"))]
    goals = shot_actions[(shot_actions.result_id == spadlconfig.results.index("success"))]

    shotmatrix = _count(shot_actions.start_x, shot_actions.start_y, l, w)
    goalmatrix = _count(goals.start_x, goals.start_y, l, w)
    return _safe_divide(goalmatrix, shotmatrix)


def get_move_actions(actions: DataFrame[SPADLSchema]) -> DataFrame[SPADLSchema]:
    """Get all ball-progressing actions.

    These include passes, dribbles and crosses. Take-ons are ignored because
    they typically coincide with dribbles and do not move the ball to
    a different cell.

    Parameters
    ----------
    actions : pd.DataFrame
        Actions, in SPADL format.

    Returns
    -------
    pd.DataFrame
        All ball-progressing actions in the input dataframe.
    """
    return actions[
        (actions.type_id == spadlconfig.actiontypes.index("pass"))
        | (actions.type_id == spadlconfig.actiontypes.index("dribble"))
        | (actions.type_id == spadlconfig.actiontypes.index("cross"))
    ]


def get_successful_move_actions(actions: DataFrame[SPADLSchema]) -> DataFrame[SPADLSchema]:
    """Get all successful ball-progressing actions.

    These include successful passes, dribbles and crosses.

    Parameters
    ----------
    actions : pd.DataFrame
        Actions, in SPADL format.

    Returns
    -------
    pd.DataFrame
        All ball-progressing actions in the input dataframe.
    """
    move_actions = get_move_actions(actions)
    return move_actions[(move_actions.result_id == spadlconfig.results.index("success"))]


def action_prob(
    actions: DataFrame[SPADLSchema], l: int = N, w: int = M
) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
    """Compute the probability of taking an action in each cell of the grid.

    The options are: shooting or moving.

    Parameters
    ----------
    actions : pd.DataFrame
        Actions, in SPADL format.
    l : int
        Amount of grid cells in the x-dimension of the grid.
    w : int
        Amount of grid cells in the y-dimension of the grid.

    Returns
    -------
    shotmatrix : np.ndarray
        For each cell the probability of choosing to shoot.
    movematrix : np.ndarray
        For each cell the probability of choosing to move.
    """
    move_actions = get_move_actions(actions)
    shot_actions = actions[(actions.type_id == spadlconfig.actiontypes.index("shot"))]

    movematrix = _count(move_actions.start_x, move_actions.start_y, l, w)
    shotmatrix = _count(shot_actions.start_x, shot_actions.start_y, l, w)
    totalmatrix = movematrix + shotmatrix

    return _safe_divide(shotmatrix, totalmatrix), _safe_divide(movematrix, totalmatrix)


def move_transition_matrix(
    actions: DataFrame[SPADLSchema], l: int = N, w: int = M
) -> npt.NDArray[np.float64]:
    """Compute the move transition matrix from the given actions.

    This is, when a player chooses to move, the probability that he will
    end up in each of the other cells of the grid successfully.

    Parameters
    ----------
    actions : pd.DataFrame
        Actions, in SPADL format.
    l : int
        Amount of grid cells in the x-dimension of the grid.
    w : int
        Amount of grid cells in the y-dimension of the grid.

    Returns
    -------
    np.ndarray
        The transition matrix.
    """
    move_actions = get_move_actions(actions)

    X = pd.DataFrame()
    X["start_cell"] = _get_flat_indexes(move_actions.start_x, move_actions.start_y, l, w)
    X["end_cell"] = _get_flat_indexes(move_actions.end_x, move_actions.end_y, l, w)
    X["result_id"] = move_actions.result_id

    vc = X.start_cell.value_counts(sort=False)
    start_counts = np.zeros(w * l)
    start_counts[vc.index] = vc

    transition_matrix = np.zeros((w * l, w * l))

    for i in range(0, w * l):
        vc2 = X[
            ((X.start_cell == i) & (X.result_id == spadlconfig.results.index("success")))
        ].end_cell.value_counts(sort=False)
        transition_matrix[i, vc2.index] = vc2 / start_counts[i]

    return transition_matrix


class ExpectedThreat:
    """An implementation of the Expected Threat (xT) model.

    The xT model [1]_ can be used to value actions that successfully move
    the ball between two locations on the pitch by computing the difference
    between the long-term probability of scoring on the start and end location
    of an action.

    Parameters
    ----------
    l : int
        Amount of grid cells in the x-dimension of the grid.
    w : int
        Amount of grid cells in the y-dimension of the grid.
    eps : float
       The desired precision to calculate the xT value of a cell. Default is
       5 decimal places of precision (1e-5).

    Attributes
    ----------
    l : int
        Amount of grid cells in the x-dimension of the grid.
    w : int
        Amount of grid cells in the y-dimension of the grid.
    eps : float
       The desired precision to calculate the xT value of a cell. Default is
       5 decimal places of precision (1e-5).
    heatmaps : list(np.ndarray)
        The i-th element corresponds to the xT value surface after i iterations.
    xT : np.ndarray
        The final xT value surface.
    scoring_prob_matrix : np.ndarray, shape(M,N)
        The probability of scoring when taking a shot for each cell.
    shot_prob_matrix : np.ndarray, shape(M,N)
        The probability of choosing to shoot for each cell.
    move_prob_matrix : np.ndarray, shape(M,N)
        The probability of choosing to move for each cell.
    transition_matrix : np.ndarray, shape(M*N,M*N)
        When moving, the probability of moving to each of the other zones.

    References
    ----------
    .. [1] Singh, Karun. "Introducing Expected Threat (xT)." 15 February, 2019.
        https://karun.in/blog/expected-threat.html
    """

    def __init__(self, l: int = N, w: int = M, eps: float = 1e-5) -> None:
        self.l = l
        self.w = w
        self.eps = eps
        self.heatmaps: list[npt.NDArray[np.float64]] = []
        self.xT: npt.NDArray[np.float64] = np.zeros((self.w, self.l))
        self.scoring_prob_matrix: Optional[npt.NDArray[np.float64]] = None
        self.shot_prob_matrix: Optional[npt.NDArray[np.float64]] = None
        self.move_prob_matrix: Optional[npt.NDArray[np.float64]] = None
        self.transition_matrix: Optional[npt.NDArray[np.float64]] = None

    def __solve(
        self,
        p_scoring: npt.NDArray[np.float64],
        p_shot: npt.NDArray[np.float64],
        p_move: npt.NDArray[np.float64],
        transition_matrix: npt.NDArray[np.float64],
    ) -> None:
        """Solves the expected threat equation with dynamic programming.

        Parameters
        ----------
        p_scoring : (np.ndarray, shape(M, N)):
            Probability of scoring at each grid cell, when shooting from that cell.
        p_shot : (np.ndarray, shape(M,N)):
            For each grid cell, the probability of choosing to shoot from there.
        p_move : (np.ndarray, shape(M,N)):
            For each grid cell, the probability of choosing to move from there.
        transition_matrix : (np.ndarray, shape(M*N,M*N)):
            When moving, the probability of moving to each of the other zones.
        """
        gs = p_scoring * p_shot
        diff = np.ones((self.w, self.l), dtype=np.float64)
        it = 0
        self.heatmaps.append(self.xT.copy())

        while np.any(diff > self.eps):
            total_payoff = np.zeros((self.w, self.l), dtype=np.float64)

            for y in range(0, self.w):
                for x in range(0, self.l):
                    for q in range(0, self.w):
                        for z in range(0, self.l):
                            total_payoff[y, x] += (
                                transition_matrix[self.l * y + x, self.l * q + z] * self.xT[q, z]
                            )

            newxT = gs + (p_move * total_payoff)
            diff = newxT - self.xT
            self.xT = newxT
            self.heatmaps.append(self.xT.copy())
            it += 1

        print("# iterations: ", it)

    def fit(self, actions: DataFrame[SPADLSchema]) -> "ExpectedThreat":
        """Fits the xT model with the given actions.

        Parameters
        ----------
        actions : pd.DataFrame
            Actions, in SPADL format.

        Returns
        -------
        self
            Fitted xT model.
        """
        self.scoring_prob_matrix = scoring_prob(actions, self.l, self.w)
        self.shot_prob_matrix, self.move_prob_matrix = action_prob(actions, self.l, self.w)
        self.transition_matrix = move_transition_matrix(actions, self.l, self.w)
        self.xT = np.zeros((self.w, self.l))
        self.__solve(
            self.scoring_prob_matrix,
            self.shot_prob_matrix,
            self.move_prob_matrix,
            self.transition_matrix,
        )
        return self

    def interpolator(
        self, kind: str = "linear"
    ) -> Callable[[npt.NDArray[np.float64], npt.NDArray[np.float64]], npt.NDArray[np.float64]]:
        """Interpolate over the pitch.

        This is a wrapper around :func:`scipy.interpolate.interp2d`.

        Parameters
        ----------
        kind : {'linear', 'cubic', 'quintic'}  # noqa: DAR103
            The kind of spline interpolation to use. Default is ‘linear’.

        Raises
        ------
        ImportError
            If scipy is not installed.

        Returns
        -------
        callable
            A function that interpolates xT values over the pitch.
        """
        if interp2d is None:
            raise ImportError("Interpolation requires scipy to be installed.")

        cell_length = spadlconfig.field_length / self.l
        cell_width = spadlconfig.field_width / self.w

        x = np.arange(0.0, spadlconfig.field_length, cell_length) + 0.5 * cell_length
        y = np.arange(0.0, spadlconfig.field_width, cell_width) + 0.5 * cell_width

        return interp2d(x=x, y=y, z=self.xT, kind=kind, bounds_error=False)

    def rate(
        self, actions: DataFrame[SPADLSchema], use_interpolation: bool = False
    ) -> npt.NDArray[np.float64]:
        """Compute the xT values for the given actions.

        xT should only be used to value actions that move the ball and also
        keep the current team in possession of the ball. All other actions in
        the given dataframe receive a `NaN` rating.

        Parameters
        ----------
        actions : pd.DataFrame
            Actions, in SPADL format.
        use_interpolation : bool
            Indicates whether to use bilinear interpolation when inferring xT
            values. Note that this requires Scipy to be installed (pip install
            scipy).

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.

        Returns
        -------
        np.ndarray
            The xT value for each action.
        """
        if not np.any(self.xT):
            raise NotFittedError()

        if not use_interpolation:
            l = self.l
            w = self.w
            grid = self.xT
        else:
            # Use interpolation to create a
            # more fine-grained 1050 x 680 grid
            interp = self.interpolator()
            l = int(spadlconfig.field_length * 10)
            w = int(spadlconfig.field_width * 10)
            xs = np.linspace(0, spadlconfig.field_length, l)
            ys = np.linspace(0, spadlconfig.field_width, w)
            grid = interp(xs, ys)

        ratings = np.empty(len(actions))
        ratings[:] = np.NaN

        move_actions = get_successful_move_actions(actions.reset_index())

        startxc, startyc = _get_cell_indexes(move_actions.start_x, move_actions.start_y, l, w)
        endxc, endyc = _get_cell_indexes(move_actions.end_x, move_actions.end_y, l, w)

        xT_start = grid[startyc.rsub(w - 1), startxc]
        xT_end = grid[endyc.rsub(w - 1), endxc]

        ratings[move_actions.index] = xT_end - xT_start
        return ratings

    def save_model(self, filepath: str, overwrite: bool = True) -> None:
        """Save the xT value surface in JSON format.

        This stores only the xT value surface, which is all you need to compute
        xT values for new data. The value surface can be loaded back with the
        :func:`socceraction.xthreat.load_model` function.

        Pickle the `ExpectedThreat` instance to store the entire model and to
        retain the transition, shot probability, move probability and scoring
        probability matrices.

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        ValueError
            If the specified output file already exists and "overwrite" is set
            to False.

        Parameters
        ----------
        filepath : str
            Path to the file to save the value surface to.
        overwrite : bool
            Whether to silently overwrite any existing file at the target
            location.
        """
        if not np.any(self.xT):
            raise NotFittedError()

        # If file exists and should not be overwritten:
        if not overwrite and os.path.isfile(filepath):
            raise ValueError(
                'save_xt got overwrite="False", but a file '
                f"({filepath}) exists already. No data was saved."
            )
        with open(filepath, "w") as f:
            json.dump(self.xT.tolist(), f)


def load_model(path: str) -> ExpectedThreat:
    """Create a model from a pre-computed xT value surface.

    The value surface should be provided as a JSON file containing a 2D
    matrix. Karun Singh provides such a grid at the follwing url:
    https://karun.in/blog/data/open_xt_12x8_v1.json

    Parameters
    ----------
    path : str
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    Returns
    -------
    ExpectedThreat
        An xT model that uses the given value surface to value actions.
    """
    grid = pd.read_json(path)
    model = ExpectedThreat()
    model.xT = grid.values
    model.w, model.l = model.xT.shape
    return model