Upload 203 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +4 -0
- CONTRIBUTING.rst +205 -0
- LICENSE.rst +22 -0
- Makefile +50 -0
- README.md +66 -12
- __init__.py +13 -0
- atomic/__init__.py +1 -0
- atomic/spadl/__init__.py +15 -0
- atomic/spadl/base.py +236 -0
- atomic/spadl/config.py +48 -0
- atomic/spadl/schema.py +32 -0
- atomic/spadl/utils.py +65 -0
- atomic/vaep/__init__.py +6 -0
- atomic/vaep/base.py +80 -0
- atomic/vaep/features.py +269 -0
- atomic/vaep/formula.py +142 -0
- atomic/vaep/labels.py +108 -0
- data/__init__.py +9 -0
- data/base.py +211 -0
- data/opta/__init__.py +19 -0
- data/opta/loader.py +478 -0
- data/opta/parsers/__init__.py +23 -0
- data/opta/parsers/base.py +179 -0
- data/opta/parsers/f1_json.py +103 -0
- data/opta/parsers/f24_json.py +123 -0
- data/opta/parsers/f24_xml.py +108 -0
- data/opta/parsers/f7_xml.py +250 -0
- data/opta/parsers/f9_json.py +302 -0
- data/opta/parsers/ma1_json.py +264 -0
- data/opta/parsers/ma3_json.py +355 -0
- data/opta/parsers/whoscored.py +421 -0
- data/opta/schema.py +86 -0
- data/schema.py +110 -0
- data/statsbomb/__init__.py +20 -0
- data/statsbomb/loader.py +495 -0
- data/statsbomb/schema.py +100 -0
- data/wyscout/__init__.py +20 -0
- data/wyscout/loader.py +849 -0
- data/wyscout/schema.py +48 -0
- docs/_static/custom.css +50 -0
- docs/_static/decroos19.bibtex +17 -0
- docs/_static/favicon.png +0 -0
- docs/_static/logo.ai +3 -0
- docs/_static/logo.png +0 -0
- docs/_static/logo_white.png +0 -0
- docs/_static/vanroy20.bibtex +11 -0
- docs/_templates/class.rst +40 -0
- docs/_templates/module.rst +27 -0
- docs/_templates/schema.rst +19 -0
- docs/actions_bra-bel.png +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
docs/_static/logo.ai filter=lfs diff=lfs merge=lfs -text
|
37 |
+
docs/documentation/valuing_actions/action_changes_gamestate.png filter=lfs diff=lfs merge=lfs -text
|
38 |
+
docs/documentation/valuing_actions/action.gif filter=lfs diff=lfs merge=lfs -text
|
39 |
+
docs/documentation/valuing_actions/default_xt_grid.png filter=lfs diff=lfs merge=lfs -text
|
CONTRIBUTING.rst
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Contributor guide
|
2 |
+
=================
|
3 |
+
|
4 |
+
This document lays out guidelines and advice for contributing to this project.
|
5 |
+
If you're thinking of contributing, please start by reading this document and
|
6 |
+
getting a feel for how contributing to this project works. If you have any
|
7 |
+
questions, feel free to reach out to `Pieter Robberechts`_, the primary maintainer.
|
8 |
+
|
9 |
+
.. _Pieter Robberechts: http://www.cs.kuleuven.be/cgi-bin/e-post.pl?epost=Pieter.Robberechts
|
10 |
+
|
11 |
+
The guide is split into sections based on the type of contribution you're
|
12 |
+
thinking of making.
|
13 |
+
|
14 |
+
|
15 |
+
.. _bug-reports:
|
16 |
+
|
17 |
+
Bug reports
|
18 |
+
-----------
|
19 |
+
|
20 |
+
Bug reports are hugely important! Before you raise one, though, please check
|
21 |
+
through the `GitHub issues`_, **both open and closed**, to confirm that the bug
|
22 |
+
hasn't been reported before.
|
23 |
+
|
24 |
+
When filing an issue, make sure to answer these questions:
|
25 |
+
|
26 |
+
- Which Python version are you using?
|
27 |
+
- Which version of socceraction are you using?
|
28 |
+
- What did you do?
|
29 |
+
- What did you expect to see?
|
30 |
+
- What did you see instead?
|
31 |
+
|
32 |
+
The best way to get your bug fixed is to provide a test case,
|
33 |
+
and/or steps to reproduce the issue.
|
34 |
+
|
35 |
+
.. _GitHub issues: https://github.com/ML-KULeuven/socceraction/issues
|
36 |
+
|
37 |
+
|
38 |
+
Feature requests
|
39 |
+
----------------
|
40 |
+
|
41 |
+
Socceraction is not actively developed. Its primary use is to enable
|
42 |
+
reproducibility of our research. If you believe there is a feature missing,
|
43 |
+
feel free to raise a feature request on the `Issue Tracker`_, but please do be
|
44 |
+
aware that the overwhelming likelihood is that your feature request will not
|
45 |
+
be accepted.
|
46 |
+
|
47 |
+
.. _Issue tracker: https://github.com/ML-KULeuven/socceraction/issues
|
48 |
+
|
49 |
+
|
50 |
+
Documentation contributions
|
51 |
+
---------------------------
|
52 |
+
|
53 |
+
Documentation improvements are always welcome! The documentation files live in
|
54 |
+
the ``docs/`` directory of the codebase. They're written in
|
55 |
+
`reStructuredText`_, and use `Sphinx`_ to generate the full suite of
|
56 |
+
documentation.
|
57 |
+
|
58 |
+
You do not have to set up a development environment to make small changes to
|
59 |
+
the docs. Instead, you can `edit files directly on GitHub`_ and suggest changes.
|
60 |
+
|
61 |
+
When contributing documentation, please do your best to follow the style of the
|
62 |
+
documentation files. This means a soft-limit of 79 characters wide in your text
|
63 |
+
files and a semiformal, yet friendly and approachable, prose style.
|
64 |
+
|
65 |
+
When presenting Python code, use double-quoted strings (``"hello"`` instead of
|
66 |
+
``'hello'``).
|
67 |
+
|
68 |
+
.. _reStructuredText: http://docutils.sourceforge.net/rst.html
|
69 |
+
.. _Sphinx: http://sphinx-doc.org/index.html
|
70 |
+
.. _edit files directly on GitHub: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files
|
71 |
+
|
72 |
+
|
73 |
+
Code contributions
|
74 |
+
------------------
|
75 |
+
|
76 |
+
If you intend to contribute code, do not feel the need to sit on your
|
77 |
+
contribution until it is perfectly polished and complete. It helps everyone
|
78 |
+
involved for you to seek feedback as early as you possibly can. Submitting an
|
79 |
+
early, unfinished version of your contribution for feedback can save you from
|
80 |
+
putting a lot of work into a contribution that is not suitable for the
|
81 |
+
project.
|
82 |
+
|
83 |
+
Setting up your development environment
|
84 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
85 |
+
|
86 |
+
You need Python 3.9+ and the following tools:
|
87 |
+
|
88 |
+
- Poetry_
|
89 |
+
- Nox_
|
90 |
+
- nox-poetry_
|
91 |
+
|
92 |
+
Install the package with development requirements:
|
93 |
+
|
94 |
+
.. code:: console
|
95 |
+
|
96 |
+
$ poetry install
|
97 |
+
$ poetry self add poetry-plugin-export
|
98 |
+
|
99 |
+
You can now run an interactive Python session.
|
100 |
+
|
101 |
+
.. code:: console
|
102 |
+
|
103 |
+
$ poetry run python
|
104 |
+
|
105 |
+
.. _Poetry: https://python-poetry.org/
|
106 |
+
.. _Nox: https://nox.thea.codes/
|
107 |
+
.. _nox-poetry: https://nox-poetry.readthedocs.io/
|
108 |
+
|
109 |
+
Steps for submitting code
|
110 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
111 |
+
|
112 |
+
When contributing code, you'll want to follow this checklist:
|
113 |
+
|
114 |
+
1. Fork the repository on GitHub.
|
115 |
+
2. Run the tests to confirm they all pass on your system. If they don't, you'll
|
116 |
+
need to investigate why they fail. If you're unable to diagnose this
|
117 |
+
yourself, raise it as a bug report.
|
118 |
+
3. Write tests that demonstrate your bug or feature. Ensure that they fail.
|
119 |
+
4. Make your change.
|
120 |
+
5. Run the entire test suite again, confirming that all tests pass *including
|
121 |
+
the ones you just added*.
|
122 |
+
6. Make sure your code follows the code style discussed below.
|
123 |
+
7. Send a GitHub Pull Request to the main repository's ``master`` branch.
|
124 |
+
GitHub Pull Requests are the expected method of code collaboration on this
|
125 |
+
project.
|
126 |
+
|
127 |
+
Testing the project
|
128 |
+
~~~~~~~~~~~~~~~~~~~
|
129 |
+
|
130 |
+
Download the test data:
|
131 |
+
|
132 |
+
.. code:: console
|
133 |
+
|
134 |
+
$ poetry run python tests/datasets/download.py
|
135 |
+
|
136 |
+
Run the full test suite:
|
137 |
+
|
138 |
+
.. code:: console
|
139 |
+
|
140 |
+
$ nox
|
141 |
+
|
142 |
+
List the available Nox sessions:
|
143 |
+
|
144 |
+
.. code:: console
|
145 |
+
|
146 |
+
$ nox --list-sessions
|
147 |
+
|
148 |
+
You can also run a specific Nox session.
|
149 |
+
For example, invoke the unit test suite like this:
|
150 |
+
|
151 |
+
.. code:: console
|
152 |
+
|
153 |
+
$ nox --session=tests
|
154 |
+
|
155 |
+
Unit tests are located in the ``tests`` directory,
|
156 |
+
and are written using the pytest_ testing framework.
|
157 |
+
|
158 |
+
.. _pytest: https://pytest.readthedocs.io/
|
159 |
+
|
160 |
+
Code style
|
161 |
+
~~~~~~~~~~~~
|
162 |
+
|
163 |
+
The socceraction codebase uses the `PEP 8`_ code style. In addition, we have
|
164 |
+
a few guidelines:
|
165 |
+
|
166 |
+
- Line-length can exceed 79 characters, to 100, when convenient.
|
167 |
+
- Line-length can exceed 100 characters, when doing otherwise would be *terribly* inconvenient.
|
168 |
+
- Always use double-quoted strings (e.g. ``"soccer"``), unless a double-quote occurs within the string.
|
169 |
+
|
170 |
+
To ensure all code conforms to this format. You can format the code using the
|
171 |
+
pre-commit hooks.
|
172 |
+
|
173 |
+
.. code:: console
|
174 |
+
|
175 |
+
$ nox --session=pre-commit
|
176 |
+
|
177 |
+
Docstrings are to follow the `numpydoc guidelines`_.
|
178 |
+
|
179 |
+
.. _PEP 8: https://pep8.org/
|
180 |
+
.. _numpydoc guidelines: https://numpydoc.readthedocs.io/en/latest/format.html
|
181 |
+
|
182 |
+
Submitting changes
|
183 |
+
~~~~~~~~~~~~~~~~~~
|
184 |
+
|
185 |
+
Open a `pull request`_ to submit changes to this project.
|
186 |
+
|
187 |
+
Your pull request needs to meet the following guidelines for acceptance:
|
188 |
+
|
189 |
+
- The Nox test suite must pass without errors and warnings.
|
190 |
+
- Include unit tests.
|
191 |
+
- If your changes add functionality, update the documentation accordingly.
|
192 |
+
|
193 |
+
Feel free to submit early, though. We can always iterate on this.
|
194 |
+
|
195 |
+
To run linting and code formatting checks before committing your change, you
|
196 |
+
can install pre-commit as a Git hook by running the following command:
|
197 |
+
|
198 |
+
.. code:: console
|
199 |
+
|
200 |
+
$ nox --session=pre-commit -- install
|
201 |
+
|
202 |
+
It is recommended to open an issue before starting work on anything.
|
203 |
+
|
204 |
+
.. _pull request: https://github.com/ML-KULeuven/socceraction/pulls
|
205 |
+
.. github-only
|
LICENSE.rst
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
===========
|
3 |
+
|
4 |
+
Copyright (c) 2019 KU Leuven Machine Learning Research Group
|
5 |
+
|
6 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7 |
+
of this software and associated documentation files (the "Software"), to deal
|
8 |
+
in the Software without restriction, including without limitation the rights
|
9 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10 |
+
copies of the Software, and to permit persons to whom the Software is
|
11 |
+
furnished to do so, subject to the following conditions:
|
12 |
+
|
13 |
+
The above copyright notice and this permission notice shall be included in all
|
14 |
+
copies or substantial portions of the Software.
|
15 |
+
|
16 |
+
**The software is provided "as is", without warranty of any kind, express or
|
17 |
+
implied, including but not limited to the warranties of merchantability,
|
18 |
+
fitness for a particular purpose and noninfringement. In no event shall the
|
19 |
+
authors or copyright holders be liable for any claim, damages or other
|
20 |
+
liability, whether in an action of contract, tort or otherwise, arising from,
|
21 |
+
out of or in connection with the software or the use or other dealings in the
|
22 |
+
software.**
|
Makefile
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.PHONY: init test lint pretty notebooks precommit_install bump_major bump_minor bump_patch clean
|
2 |
+
|
3 |
+
BIN = .venv/bin/
|
4 |
+
CODE = socceraction
|
5 |
+
|
6 |
+
init:
|
7 |
+
python3 -m venv .venv
|
8 |
+
poetry install
|
9 |
+
|
10 |
+
tests/datasets/statsbomb/:
|
11 |
+
$(BIN)python tests/datasets/download.py --download-statsbomb --convert-statsbomb
|
12 |
+
|
13 |
+
tests/datasets/wyscout_public/:
|
14 |
+
$(BIN)python tests/datasets/download.py --download-wyscout --convert-wyscout
|
15 |
+
|
16 |
+
tests/datasets/spadl/:
|
17 |
+
$(BIN)python tests/datasets/download.py --spadl
|
18 |
+
|
19 |
+
test: tests/datasets/statsbomb/ tests/datasets/wyscout_public/ tests/datasets/spadl/
|
20 |
+
nox -rs tests -- $(args)
|
21 |
+
|
22 |
+
mypy:
|
23 |
+
nox -rs mypy -- $(args)
|
24 |
+
|
25 |
+
lint:
|
26 |
+
nox -rs lint -- $(args)
|
27 |
+
|
28 |
+
pretty:
|
29 |
+
nox -rs pre-commit -- $(args)
|
30 |
+
|
31 |
+
notebooks:
|
32 |
+
$(BIN)python -m nbconvert --execute --inplace --config=default.json public-notebooks/*.ipynb
|
33 |
+
|
34 |
+
precommit_install:
|
35 |
+
nox -rs pre-commit -- install
|
36 |
+
|
37 |
+
bump_major:
|
38 |
+
$(BIN)bumpversion major
|
39 |
+
|
40 |
+
bump_minor:
|
41 |
+
$(BIN)bumpversion minor
|
42 |
+
|
43 |
+
bump_patch:
|
44 |
+
$(BIN)bumpversion patch
|
45 |
+
|
46 |
+
clean:
|
47 |
+
find . -type f -name "*.py[co]" -delete
|
48 |
+
find . -type d -name "__pycache__" -delete
|
49 |
+
rm -rf tests/datasets/wyscout_public
|
50 |
+
rm -rf tests/datasets/statsbomb
|
README.md
CHANGED
@@ -1,12 +1,66 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<div align="center">
|
2 |
+
<img src="https://socceraction.readthedocs.io/en/latest/_static/logo_white.png" height="200">
|
3 |
+
<p>
|
4 |
+
<b>Convert soccer event stream data to the SPADL format<br/>and value on-the-ball player actions</b>
|
5 |
+
</p>
|
6 |
+
<br/>
|
7 |
+
|
8 |
+
[](https://pypi.org/project/socceraction)
|
9 |
+
[](https://pypi.org/project/socceraction)
|
10 |
+
[](https://pypistats.org/packages/socceraction)
|
11 |
+
[](https://en.wikipedia.org/wiki/MIT_License)
|
12 |
+
|
13 |
+
[](https://github.com/ML-KULeuven/socceraction/actions?workflow=CI)
|
14 |
+
[](https://socceraction.readthedocs.io)
|
15 |
+
[](https://codecov.io/gh/ML-KULeuven/socceraction)
|
16 |
+
|
17 |
+
<br/>
|
18 |
+
<br/>
|
19 |
+
</div>
|
20 |
+
|
21 |
+
Socceraction is a Python package for objectively quantifying the impact of the individual actions performed by soccer players using event stream data. The general idea is to assign a value to each on-the-ball action based on the action's impact on the game outcome, while accounting for the context in which the action happened. The video below gives a quick two-minute introduction to action values.
|
22 |
+
|
23 |
+
<div align="center">
|
24 |
+
|
25 |
+
https://user-images.githubusercontent.com/2175271/136857714-1d2c8706-7f2f-449d-818f-0e67fbb75400.mp4
|
26 |
+
|
27 |
+
</div>
|
28 |
+
|
29 |
+
## Features
|
30 |
+
|
31 |
+
Socceraction contains the following components:
|
32 |
+
|
33 |
+
- A set of API clients for **loading event stream data** from StatsBomb, Opta, Wyscout, Stats Perform and WhoScored as Pandas DataFrames using a unified data model. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/data/index.html)
|
34 |
+
- Converters for each of these provider's proprietary data format to the **SPADL** and **atomic-SPADL** formats, which are unified and expressive languages for on-the-ball player actions. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/spadl/index.html)
|
35 |
+
- An implementation of the **Expected Threat (xT)** possession value framework. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/valuing_actions/xT.html)
|
36 |
+
- An implementation of the **VAEP** and **Atomic-VAEP** possession value frameworks. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/valuing_actions/vaep.html)
|
37 |
+
|
38 |
+
## Installation / Getting started
|
39 |
+
|
40 |
+
The recommended way to install `socceraction` is to simply use pip. The latest version officially supports Python 3.9 - 3.12.
|
41 |
+
|
42 |
+
```sh
|
43 |
+
$ pip install socceraction
|
44 |
+
```
|
45 |
+
|
46 |
+
The folder [`public-notebooks`](https://github.com/ML-KULeuven/socceraction/tree/master/public-notebooks) provides a demo of the full pipeline from raw StatsBomb event stream data to action values and player ratings. More detailed installation/usage instructions can be found in the [Documentation](https://socceraction.readthedocs.io/en/latest/).
|
47 |
+
|
48 |
+
## Contributing
|
49 |
+
|
50 |
+
All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. However, be aware that socceraction is not actively developed. It's primary use is to enable reproducibility of our research. If you believe there is a feature missing, feel free to raise a feature request, but please do be aware that the overwhelming likelihood is that your feature request will not be accepted.
|
51 |
+
To learn more on how to contribute, see the [Contributor Guide](https://socceraction.readthedocs.io/en/latest/development/developer_guide.html).
|
52 |
+
|
53 |
+
## Research
|
54 |
+
|
55 |
+
If you make use of this package in your research, please consider citing the following papers:
|
56 |
+
|
57 |
+
- Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis. **Actions speak louder than goals: Valuing player actions in soccer.** In Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1851-1861. 2019. <br/>[ [pdf](http://doi.acm.org/10.1145/3292500.3330758) | [bibtex](https://github.com/ML-KULeuven/socceraction/blob/master/docs/_static/decroos19.bibtex) ]
|
58 |
+
|
59 |
+
- Maaike Van Roy, Pieter Robberechts, Tom Decroos, and Jesse Davis. **Valuing on-the-ball actions in soccer: a critical comparison of XT and VAEP.** In Proceedings of the AAAI-20 Workshop on Artifical Intelligence in Team Sports. AI in Team Sports Organising Committee, 2020. <br/>[ [pdf](https://limo.libis.be/primo-explore/fulldisplay?docid=LIRIAS2913207&context=L&vid=KULeuven&search_scope=ALL_CONTENT&tab=all_content_tab&lang=en_US) | [bibtex](https://github.com/ML-KULeuven/socceraction/blob/master/docs/_static/vanroy20.bibtex) ]
|
60 |
+
|
61 |
+
The Expected Threat (xT) framework was originally introduced by Karun Singh on his [blog](https://karun.in/blog/expected-threat.html) in 2019.
|
62 |
+
|
63 |
+
## License
|
64 |
+
|
65 |
+
Distributed under the terms of the [MIT license](https://opensource.org/licenses/MIT),
|
66 |
+
socceraction is free and open source software. Although not strictly required, we appreciate it if you include a link to this repo or cite our research in your work if you make use of socceraction.
|
__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
SoccerAction
|
3 |
+
~~~~~~~~~~~~
|
4 |
+
|
5 |
+
SoccerAction is a Python package for objectively quantifying the impact of the
|
6 |
+
individual actions performed by soccer players using event stream data.
|
7 |
+
|
8 |
+
Full documentation is at <https://ml-kuleuven.github.io/socceraction/>.
|
9 |
+
:copyright: (c) 2020 by DTAI KU Leuven.
|
10 |
+
:license: MIT, see LICENSE for more details.
|
11 |
+
"""
|
12 |
+
|
13 |
+
__version__ = "1.5.3"
|
atomic/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Implements Atomic-SPADL and the Atomic-VAEP framework."""
|
atomic/spadl/__init__.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implementation of the Atomic-SPADL language."""
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
"convert_to_atomic",
|
5 |
+
"AtomicSPADLSchema",
|
6 |
+
"bodyparts_df",
|
7 |
+
"actiontypes_df",
|
8 |
+
"add_names",
|
9 |
+
"play_left_to_right",
|
10 |
+
]
|
11 |
+
|
12 |
+
from .base import convert_to_atomic
|
13 |
+
from .config import actiontypes_df, bodyparts_df
|
14 |
+
from .schema import AtomicSPADLSchema
|
15 |
+
from .utils import add_names, play_left_to_right
|
atomic/spadl/base.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements a converter for regular SPADL actions to atomic actions."""
|
2 |
+
|
3 |
+
from typing import cast
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
from pandera.typing import DataFrame
|
7 |
+
|
8 |
+
import socceraction.spadl.config as _spadl
|
9 |
+
from socceraction.spadl.base import _add_dribbles
|
10 |
+
from socceraction.spadl.schema import SPADLSchema
|
11 |
+
|
12 |
+
from . import config as _atomicspadl
|
13 |
+
from .schema import AtomicSPADLSchema
|
14 |
+
|
15 |
+
|
16 |
+
def convert_to_atomic(actions: DataFrame[SPADLSchema]) -> DataFrame[AtomicSPADLSchema]:
|
17 |
+
"""Convert regular SPADL actions to atomic actions.
|
18 |
+
|
19 |
+
Parameters
|
20 |
+
----------
|
21 |
+
actions : pd.DataFrame
|
22 |
+
A SPADL dataframe.
|
23 |
+
|
24 |
+
Returns
|
25 |
+
-------
|
26 |
+
pd.DataFrame
|
27 |
+
The Atomic-SPADL dataframe.
|
28 |
+
"""
|
29 |
+
atomic_actions = cast(pd.DataFrame, actions.copy())
|
30 |
+
atomic_actions = _extra_from_passes(atomic_actions)
|
31 |
+
atomic_actions = _add_dribbles(atomic_actions) # for some reason this adds more dribbles
|
32 |
+
atomic_actions = _extra_from_shots(atomic_actions)
|
33 |
+
atomic_actions = _extra_from_fouls(atomic_actions)
|
34 |
+
atomic_actions = _convert_columns(atomic_actions)
|
35 |
+
atomic_actions = _simplify(atomic_actions)
|
36 |
+
return cast(DataFrame[AtomicSPADLSchema], atomic_actions)
|
37 |
+
|
38 |
+
|
39 |
+
def _extra_from_passes(actions: pd.DataFrame) -> pd.DataFrame:
|
40 |
+
next_actions = actions.shift(-1)
|
41 |
+
same_team = actions.team_id == next_actions.team_id
|
42 |
+
|
43 |
+
passlike = [
|
44 |
+
"pass",
|
45 |
+
"cross",
|
46 |
+
"throw_in",
|
47 |
+
"freekick_short",
|
48 |
+
"freekick_crossed",
|
49 |
+
"corner_crossed",
|
50 |
+
"corner_short",
|
51 |
+
"clearance",
|
52 |
+
"goalkick",
|
53 |
+
]
|
54 |
+
pass_ids = [_spadl.actiontypes.index(ty) for ty in passlike]
|
55 |
+
|
56 |
+
interceptionlike = [
|
57 |
+
"interception",
|
58 |
+
"tackle",
|
59 |
+
"keeper_punch",
|
60 |
+
"keeper_save",
|
61 |
+
"keeper_claim",
|
62 |
+
"keeper_pick_up",
|
63 |
+
]
|
64 |
+
interception_ids = [_spadl.actiontypes.index(ty) for ty in interceptionlike]
|
65 |
+
|
66 |
+
samegame = actions.game_id == next_actions.game_id
|
67 |
+
sameperiod = actions.period_id == next_actions.period_id
|
68 |
+
# samephase = next_actions.time_seconds - actions.time_seconds < max_pass_duration
|
69 |
+
extra_idx = (
|
70 |
+
actions.type_id.isin(pass_ids)
|
71 |
+
& samegame
|
72 |
+
& sameperiod # & samephase
|
73 |
+
& ~next_actions.type_id.isin(interception_ids)
|
74 |
+
)
|
75 |
+
|
76 |
+
prev = actions[extra_idx]
|
77 |
+
nex = next_actions[extra_idx]
|
78 |
+
|
79 |
+
extra = pd.DataFrame()
|
80 |
+
extra["game_id"] = prev.game_id
|
81 |
+
extra["original_event_id"] = prev.original_event_id
|
82 |
+
extra["period_id"] = prev.period_id
|
83 |
+
extra["action_id"] = prev.action_id + 0.1
|
84 |
+
extra["time_seconds"] = (prev.time_seconds + nex.time_seconds) / 2
|
85 |
+
extra["start_x"] = prev.end_x
|
86 |
+
extra["start_y"] = prev.end_y
|
87 |
+
extra["end_x"] = prev.end_x
|
88 |
+
extra["end_y"] = prev.end_y
|
89 |
+
extra["bodypart_id"] = _atomicspadl.bodyparts.index("foot")
|
90 |
+
extra["result_id"] = -1
|
91 |
+
|
92 |
+
offside = prev.result_id == _spadl.results.index("offside")
|
93 |
+
out = ((nex.type_id == _atomicspadl.actiontypes.index("goalkick")) & (~same_team)) | (
|
94 |
+
nex.type_id == _atomicspadl.actiontypes.index("throw_in")
|
95 |
+
)
|
96 |
+
ar = _atomicspadl.actiontypes
|
97 |
+
extra["type_id"] = -1
|
98 |
+
extra["type_id"] = (
|
99 |
+
extra.type_id.mask(same_team, ar.index("receival"))
|
100 |
+
.mask(~same_team, ar.index("interception"))
|
101 |
+
.mask(out, ar.index("out"))
|
102 |
+
.mask(offside, ar.index("offside"))
|
103 |
+
)
|
104 |
+
is_interception = extra["type_id"] == ar.index("interception")
|
105 |
+
extra["team_id"] = prev.team_id.mask(is_interception, nex.team_id)
|
106 |
+
extra["player_id"] = nex.player_id.mask(out | offside, prev.player_id).astype(
|
107 |
+
prev.player_id.dtype
|
108 |
+
)
|
109 |
+
|
110 |
+
actions = pd.concat([actions, extra], ignore_index=True, sort=False)
|
111 |
+
actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)
|
112 |
+
actions["action_id"] = range(len(actions))
|
113 |
+
return actions
|
114 |
+
|
115 |
+
|
116 |
+
def _extra_from_shots(actions: pd.DataFrame) -> pd.DataFrame:
|
117 |
+
next_actions = actions.shift(-1)
|
118 |
+
|
119 |
+
shotlike = ["shot", "shot_freekick", "shot_penalty"]
|
120 |
+
shot_ids = [_spadl.actiontypes.index(ty) for ty in shotlike]
|
121 |
+
|
122 |
+
samegame = actions.game_id == next_actions.game_id
|
123 |
+
sameperiod = actions.period_id == next_actions.period_id
|
124 |
+
|
125 |
+
shot = actions.type_id.isin(shot_ids)
|
126 |
+
goal = shot & (actions.result_id == _spadl.results.index("success"))
|
127 |
+
owngoal = actions.result_id == _spadl.results.index("owngoal")
|
128 |
+
next_corner_goalkick = next_actions.type_id.isin(
|
129 |
+
[
|
130 |
+
_atomicspadl.actiontypes.index("corner_crossed"),
|
131 |
+
_atomicspadl.actiontypes.index("corner_short"),
|
132 |
+
_atomicspadl.actiontypes.index("goalkick"),
|
133 |
+
]
|
134 |
+
)
|
135 |
+
out = shot & next_corner_goalkick & samegame & sameperiod
|
136 |
+
|
137 |
+
extra_idx = goal | owngoal | out
|
138 |
+
prev = actions[extra_idx]
|
139 |
+
# nex = next_actions[extra_idx]
|
140 |
+
|
141 |
+
extra = pd.DataFrame()
|
142 |
+
extra["game_id"] = prev.game_id
|
143 |
+
extra["original_event_id"] = prev.original_event_id
|
144 |
+
extra["period_id"] = prev.period_id
|
145 |
+
extra["action_id"] = prev.action_id + 0.1
|
146 |
+
extra["time_seconds"] = prev.time_seconds # + nex.time_seconds) / 2
|
147 |
+
extra["start_x"] = prev.end_x
|
148 |
+
extra["start_y"] = prev.end_y
|
149 |
+
extra["end_x"] = prev.end_x
|
150 |
+
extra["end_y"] = prev.end_y
|
151 |
+
extra["bodypart_id"] = prev.bodypart_id
|
152 |
+
extra["result_id"] = -1
|
153 |
+
extra["team_id"] = prev.team_id
|
154 |
+
extra["player_id"] = prev.player_id
|
155 |
+
|
156 |
+
ar = _atomicspadl.actiontypes
|
157 |
+
extra["type_id"] = -1
|
158 |
+
extra["type_id"] = (
|
159 |
+
extra.type_id.mask(out, ar.index("out"))
|
160 |
+
.mask(goal, ar.index("goal"))
|
161 |
+
.mask(owngoal, ar.index("owngoal"))
|
162 |
+
)
|
163 |
+
actions = pd.concat([actions, extra], ignore_index=True, sort=False)
|
164 |
+
actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)
|
165 |
+
actions["action_id"] = range(len(actions))
|
166 |
+
return actions
|
167 |
+
|
168 |
+
|
169 |
+
def _extra_from_fouls(actions: pd.DataFrame) -> pd.DataFrame:
|
170 |
+
yellow = actions.result_id == _spadl.results.index("yellow_card")
|
171 |
+
red = actions.result_id == _spadl.results.index("red_card")
|
172 |
+
|
173 |
+
prev = actions[yellow | red]
|
174 |
+
extra = pd.DataFrame()
|
175 |
+
extra["game_id"] = prev.game_id
|
176 |
+
extra["original_event_id"] = prev.original_event_id
|
177 |
+
extra["period_id"] = prev.period_id
|
178 |
+
extra["action_id"] = prev.action_id + 0.1
|
179 |
+
extra["time_seconds"] = prev.time_seconds # + nex.time_seconds) / 2
|
180 |
+
extra["start_x"] = prev.end_x
|
181 |
+
extra["start_y"] = prev.end_y
|
182 |
+
extra["end_x"] = prev.end_x
|
183 |
+
extra["end_y"] = prev.end_y
|
184 |
+
extra["bodypart_id"] = prev.bodypart_id
|
185 |
+
extra["result_id"] = -1
|
186 |
+
extra["team_id"] = prev.team_id
|
187 |
+
extra["player_id"] = prev.player_id
|
188 |
+
|
189 |
+
ar = _atomicspadl.actiontypes
|
190 |
+
extra["type_id"] = -1
|
191 |
+
extra["type_id"] = extra.type_id.mask(yellow, ar.index("yellow_card")).mask(
|
192 |
+
red, ar.index("red_card")
|
193 |
+
)
|
194 |
+
actions = pd.concat([actions, extra], ignore_index=True, sort=False)
|
195 |
+
actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)
|
196 |
+
actions["action_id"] = range(len(actions))
|
197 |
+
return actions
|
198 |
+
|
199 |
+
|
200 |
+
def _convert_columns(actions: pd.DataFrame) -> pd.DataFrame:
|
201 |
+
actions["x"] = actions.start_x
|
202 |
+
actions["y"] = actions.start_y
|
203 |
+
actions["dx"] = actions.end_x - actions.start_x
|
204 |
+
actions["dy"] = actions.end_y - actions.start_y
|
205 |
+
return actions[
|
206 |
+
[
|
207 |
+
"game_id",
|
208 |
+
"original_event_id",
|
209 |
+
"action_id",
|
210 |
+
"period_id",
|
211 |
+
"time_seconds",
|
212 |
+
"team_id",
|
213 |
+
"player_id",
|
214 |
+
"x",
|
215 |
+
"y",
|
216 |
+
"dx",
|
217 |
+
"dy",
|
218 |
+
"type_id",
|
219 |
+
"bodypart_id",
|
220 |
+
]
|
221 |
+
]
|
222 |
+
|
223 |
+
|
224 |
+
def _simplify(actions: pd.DataFrame) -> pd.DataFrame:
|
225 |
+
a = actions
|
226 |
+
ar = _atomicspadl.actiontypes
|
227 |
+
|
228 |
+
cornerlike = ["corner_crossed", "corner_short"]
|
229 |
+
corner_ids = [_spadl.actiontypes.index(ty) for ty in cornerlike]
|
230 |
+
|
231 |
+
freekicklike = ["freekick_crossed", "freekick_short", "shot_freekick"]
|
232 |
+
freekick_ids = [_spadl.actiontypes.index(ty) for ty in freekicklike]
|
233 |
+
|
234 |
+
a["type_id"] = a.type_id.mask(a.type_id.isin(corner_ids), ar.index("corner"))
|
235 |
+
a["type_id"] = a.type_id.mask(a.type_id.isin(freekick_ids), ar.index("freekick"))
|
236 |
+
return a
|
atomic/spadl/config.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Configuration of the Atomic-SPADL language.
|
2 |
+
|
3 |
+
Attributes
|
4 |
+
----------
|
5 |
+
field_length : float
|
6 |
+
The length of a pitch (in meters).
|
7 |
+
field_width : float
|
8 |
+
The width of a pitch (in meters).
|
9 |
+
bodyparts : list(str)
|
10 |
+
The bodyparts used in the Atomic-SPADL language.
|
11 |
+
actiontypes : list(str)
|
12 |
+
The action types used in the Atomic-SPADL language.
|
13 |
+
|
14 |
+
"""
|
15 |
+
|
16 |
+
import pandas as pd
|
17 |
+
|
18 |
+
import socceraction.spadl.config as _spadl
|
19 |
+
|
20 |
+
# Pitch dimensions and bodyparts are identical to the regular SPADL language.
field_length = _spadl.field_length
field_width = _spadl.field_width

bodyparts = _spadl.bodyparts
bodyparts_df = _spadl.bodyparts_df

# Atomic-SPADL keeps every regular SPADL action type and appends the
# additional atomic event types below.
actiontypes = _spadl.actiontypes + [
    "receival",
    "interception",
    "out",
    "offside",
    "goal",
    "owngoal",
    "yellow_card",
    "red_card",
    "corner",
    "freekick",
]
|
38 |
+
|
39 |
+
|
40 |
+
def actiontypes_df() -> pd.DataFrame:
    """Return a dataframe with the type id and type name of each Atomic-SPADL action type.

    Returns
    -------
    pd.DataFrame
        The 'type_id' and 'type_name' of each Atomic-SPADL action type.
    """
    # The id of a type is simply its position in the ``actiontypes`` list.
    return pd.DataFrame(
        {"type_id": range(len(actiontypes)), "type_name": actiontypes}
    )
|
atomic/spadl/schema.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Schema for Atomic-SPADL actions."""
|
2 |
+
|
3 |
+
from typing import Any, Optional
|
4 |
+
|
5 |
+
import pandera as pa
|
6 |
+
from pandera.typing import Series
|
7 |
+
|
8 |
+
from . import config as spadlconfig
|
9 |
+
|
10 |
+
|
11 |
+
class AtomicSPADLSchema(pa.SchemaModel):
    """Definition of an Atomic-SPADL dataframe."""

    # Identifiers linking the action to its game, source event and order.
    game_id: Series[Any] = pa.Field()
    original_event_id: Series[Any] = pa.Field(nullable=True)
    action_id: Series[int] = pa.Field()
    period_id: Series[int] = pa.Field(ge=1, le=5)
    time_seconds: Series[float] = pa.Field(ge=0)
    team_id: Series[Any] = pa.Field()
    player_id: Series[Any] = pa.Field()
    # Action location and ball displacement, bounded by the pitch
    # dimensions declared in the config module.
    x: Series[float] = pa.Field(ge=0, le=spadlconfig.field_length)
    y: Series[float] = pa.Field(ge=0, le=spadlconfig.field_width)
    dx: Series[float] = pa.Field(ge=-spadlconfig.field_length, le=spadlconfig.field_length)
    dy: Series[float] = pa.Field(ge=-spadlconfig.field_width, le=spadlconfig.field_width)
    # Categorical ids must match the lookup tables in the config module;
    # the corresponding *_name columns are optional human-readable labels.
    bodypart_id: Series[int] = pa.Field(isin=spadlconfig.bodyparts_df().bodypart_id)
    bodypart_name: Optional[Series[str]] = pa.Field(isin=spadlconfig.bodyparts_df().bodypart_name)
    type_id: Series[int] = pa.Field(isin=spadlconfig.actiontypes_df().type_id)
    type_name: Optional[Series[str]] = pa.Field(isin=spadlconfig.actiontypes_df().type_name)

    class Config:  # noqa: D106
        # Reject columns not declared above and coerce values to the
        # declared dtypes on validation.
        strict = True
        coerce = True
|
atomic/spadl/utils.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions for working with Atomic-SPADL dataframes."""
|
2 |
+
|
3 |
+
from typing import cast
|
4 |
+
|
5 |
+
from pandera.typing import DataFrame
|
6 |
+
|
7 |
+
from . import config as spadlconfig
|
8 |
+
from .schema import AtomicSPADLSchema
|
9 |
+
|
10 |
+
|
11 |
+
def add_names(actions: DataFrame[AtomicSPADLSchema]) -> DataFrame[AtomicSPADLSchema]:
    """Add the type name, result name and bodypart name to an Atomic-SPADL dataframe.

    Parameters
    ----------
    actions : pd.DataFrame
        An Atomic-SPADL dataframe.

    Returns
    -------
    pd.DataFrame
        The original dataframe with a 'type_name', 'result_name' and
        'bodypart_name' appended.
    """
    # Drop any stale name columns first so the merged values cannot get
    # out of sync with the id columns.
    stripped = actions.drop(columns=["type_name", "bodypart_name"], errors="ignore")
    named = stripped.merge(spadlconfig.actiontypes_df(), how="left")
    named = named.merge(spadlconfig.bodyparts_df(), how="left")
    # Merging resets the index; restore the caller's index.
    return cast(DataFrame[AtomicSPADLSchema], named.set_index(actions.index))
|
32 |
+
|
33 |
+
|
34 |
+
def play_left_to_right(
    actions: DataFrame[AtomicSPADLSchema], home_team_id: int
) -> DataFrame[AtomicSPADLSchema]:
    """Perform all action in the same playing direction.

    This changes the location of each action, such that all actions
    are performed as if the team that executes the action plays from left to
    right.

    Parameters
    ----------
    actions : pd.DataFrame
        The SPADL actions of a game.
    home_team_id : int
        The ID of the home team.

    Returns
    -------
    list(pd.DataFrame)
        All actions performed left to right.

    See Also
    --------
    socceraction.atomic.vaep.features.play_left_to_right : For transforming gamestates.
    """
    mirrored = actions.copy()
    is_away = actions.team_id != home_team_id
    # Mirror away-team locations around the pitch centre and flip the
    # movement vector accordingly.
    mirrored.loc[is_away, "x"] = spadlconfig.field_length - actions[is_away]["x"].values
    mirrored.loc[is_away, "y"] = spadlconfig.field_width - actions[is_away]["y"].values
    mirrored.loc[is_away, "dx"] = -actions[is_away]["dx"].values
    mirrored.loc[is_away, "dy"] = -actions[is_away]["dy"].values
    return mirrored
|
atomic/vaep/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements the Atomic-VAEP framework."""
|
2 |
+
|
3 |
+
from . import features, formula, labels
|
4 |
+
from .base import AtomicVAEP
|
5 |
+
|
6 |
+
__all__ = ["AtomicVAEP", "features", "labels", "formula"]
|
atomic/vaep/base.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements the Atomic-VAEP framework.
|
2 |
+
|
3 |
+
Attributes
|
4 |
+
----------
|
5 |
+
xfns_default : list(callable)
|
6 |
+
The default VAEP features.
|
7 |
+
|
8 |
+
"""
|
9 |
+
|
10 |
+
from typing import Optional
|
11 |
+
|
12 |
+
import socceraction.atomic.spadl as spadlcfg
|
13 |
+
from socceraction.vaep.base import VAEP
|
14 |
+
|
15 |
+
from . import features as fs
|
16 |
+
from . import formula as vaep
|
17 |
+
from . import labels as lab
|
18 |
+
|
19 |
+
# Default feature transformers used by :class:`AtomicVAEP` when the caller
# does not provide an explicit ``xfns`` list.
xfns_default = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.time,
    fs.team,
    fs.time_delta,
    fs.location,
    fs.polar,
    fs.movement_polar,
    fs.direction,
    fs.goalscore,
]
|
33 |
+
|
34 |
+
|
35 |
+
class AtomicVAEP(VAEP):
    """
    An implementation of the VAEP framework for atomic actions.

    In contrast to the original VAEP framework [1]_ this extension
    distinguishes the contribution of the player who initiates the action
    (e.g., gives the pass) and the player who completes the action (e.g.,
    receives the pass) [2]_.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.atomic.vaep.features`)
        used to describe the game states. Uses :attr:`~socceraction.vaep.base.xfns_default`
        if None.
    nb_prev_actions : int, default=3
        Number of previous actions used to describe the game state.

    See Also
    --------
    :class:`socceraction.vaep.VAEP` : Implementation of the original VAEP framework.

    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
        "Actions speak louder than goals: Valuing player actions in soccer." In
        Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
        Discovery & Data Mining, pp. 1851-1861. 2019.
    .. [2] Tom Decroos, Pieter Robberechts and Jesse Davis.
        "Introducing Atomic-SPADL: A New Way to Represent Event Stream Data".
        DTAI Sports Analytics Blog. https://dtai.cs.kuleuven.be/sports/blog/introducing-atomic-spadl:-a-new-way-to-represent-event-stream-data # noqa
        May 2020.
    """

    # Atomic variants of the SPADL config, labels, features and value formula
    # modules — presumably hooks read by the parent ``VAEP`` implementation;
    # confirm against socceraction.vaep.base.
    _spadlcfg = spadlcfg
    _lab = lab
    _fs = fs
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        # Fall back to the default atomic feature set when none is given.
        xfns = xfns_default if xfns is None else xfns
        super().__init__(xfns, nb_prev_actions)
|
atomic/vaep/features.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements the feature tranformers of the VAEP framework."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, Union
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
from pandera.typing import DataFrame
|
8 |
+
|
9 |
+
import socceraction.atomic.spadl.config as atomicspadl
|
10 |
+
from socceraction.atomic.spadl import AtomicSPADLSchema
|
11 |
+
from socceraction.spadl import SPADLSchema
|
12 |
+
from socceraction.vaep.features import (
|
13 |
+
actiontype,
|
14 |
+
bodypart,
|
15 |
+
bodypart_detailed,
|
16 |
+
bodypart_detailed_onehot,
|
17 |
+
bodypart_onehot,
|
18 |
+
gamestates,
|
19 |
+
player_possession_time,
|
20 |
+
simple,
|
21 |
+
speed,
|
22 |
+
team,
|
23 |
+
time,
|
24 |
+
time_delta,
|
25 |
+
)
|
26 |
+
|
27 |
+
__all__ = [
|
28 |
+
"feature_column_names",
|
29 |
+
"play_left_to_right",
|
30 |
+
"gamestates",
|
31 |
+
"actiontype",
|
32 |
+
"actiontype_onehot",
|
33 |
+
"bodypart",
|
34 |
+
"bodypart_detailed",
|
35 |
+
"bodypart_onehot",
|
36 |
+
"bodypart_detailed_onehot",
|
37 |
+
"team",
|
38 |
+
"time",
|
39 |
+
"time_delta",
|
40 |
+
"speed",
|
41 |
+
"location",
|
42 |
+
"polar",
|
43 |
+
"movement_polar",
|
44 |
+
"direction",
|
45 |
+
"goalscore",
|
46 |
+
"player_possession_time",
|
47 |
+
]
|
48 |
+
|
49 |
+
Actions = Union[DataFrame[SPADLSchema], DataFrame[AtomicSPADLSchema]]
|
50 |
+
GameStates = list[Actions]
|
51 |
+
Features = DataFrame[Any]
|
52 |
+
FeatureTransfomer = Callable[[GameStates], Features]
|
53 |
+
|
54 |
+
|
55 |
+
def feature_column_names(fs: list["FeatureTransfomer"], nb_prev_actions: int = 3) -> list[str]:
    """Return the names of the features generated by a list of transformers.

    The transformers are applied to a small all-zero dummy game so that the
    produced column names can be collected without real data.

    Parameters
    ----------
    fs : list(callable)
        A list of feature transformers.
    nb_prev_actions : int, default=3  # noqa: DAR103
        The number of previous actions included in the game state.

    Returns
    -------
    list(str)
        The name of each generated feature.
    """
    spadlcolumns = [
        "game_id",
        "original_event_id",
        "action_id",
        "period_id",
        "time_seconds",
        "team_id",
        "player_id",
        "x",
        "y",
        "dx",
        "dy",
        "bodypart_id",
        "bodypart_name",
        "type_id",
        "type_name",
    ]
    dummy_actions = pd.DataFrame(np.zeros((10, len(spadlcolumns))), columns=spadlcolumns)
    # The *_name columns must hold strings for transformers that match on names.
    for column in spadlcolumns:
        if "name" in column:
            dummy_actions[column] = dummy_actions[column].astype(str)
    dummy_states = gamestates(dummy_actions, nb_prev_actions)  # type: ignore
    return list(pd.concat([f(dummy_states) for f in fs], axis=1).columns)
|
93 |
+
|
94 |
+
|
95 |
+
def play_left_to_right(gamestates: "GameStates", home_team_id: int) -> "GameStates":
    """Perform all action in the same playing direction.

    This changes the start and end location of each action, such that all actions
    are performed as if the team plays from left to right.

    Parameters
    ----------
    gamestates : GameStates
        The game states of a game.
    home_team_id : int
        The ID of the home team.

    Returns
    -------
    list(pd.DataFrame)
        The game states with all actions performed left to right.
    """
    # The away-team mask is derived from the most recent action of each state
    # and applied to every dataframe in the game state (mutated in place).
    is_away = gamestates[0].team_id != home_team_id
    for actions in gamestates:
        actions.loc[is_away, "x"] = atomicspadl.field_length - actions[is_away]["x"].values
        actions.loc[is_away, "y"] = atomicspadl.field_width - actions[is_away]["y"].values
        actions.loc[is_away, "dx"] = -actions[is_away]["dx"].values
        actions.loc[is_away, "dy"] = -actions[is_away]["dy"].values
    return gamestates
|
121 |
+
|
122 |
+
|
123 |
+
@simple
def actiontype_onehot(actions: "Actions") -> "Features":
    """Get the one-hot-encoded type of each action.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        A one-hot encoding of each action's type.
    """
    # One boolean column per atomic action type, in actiontypes order.
    encoding = {
        "actiontype_" + type_name: actions["type_id"] == type_id
        for type_id, type_name in enumerate(atomicspadl.actiontypes)
    }
    return pd.DataFrame(encoding, index=actions.index)
|
142 |
+
|
143 |
+
|
144 |
+
@simple
def location(actions: Actions) -> Features:
    """Get the location where each action started.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The 'x' and 'y' location of each action.
    """
    # An atomic action carries a single location in its 'x'/'y' columns.
    return actions[["x", "y"]]
|
159 |
+
|
160 |
+
|
161 |
+
_goal_x = atomicspadl.field_length
|
162 |
+
_goal_y = atomicspadl.field_width / 2
|
163 |
+
|
164 |
+
|
165 |
+
@simple
def polar(actions: "Actions") -> "Features":
    """Get the polar coordinates of each action's start location.

    The center of the opponent's goal is used as the origin.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The 'dist_to_goal' and 'angle_to_goal' of each action.
    """
    out = pd.DataFrame(index=actions.index)
    x_dist = (_goal_x - actions["x"]).abs().values
    y_dist = (_goal_y - actions["y"]).abs().values
    out["dist_to_goal"] = np.sqrt(x_dist**2 + y_dist**2)
    # Suppress warnings for actions located exactly on the goal line
    # (x_dist == 0); the resulting NaN is mapped to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        out["angle_to_goal"] = np.nan_to_num(np.arctan(y_dist / x_dist))
    return out
|
188 |
+
|
189 |
+
|
190 |
+
@simple
def movement_polar(actions: Actions) -> Features:
    """Get the distance covered and direction of each action.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The distance covered ('mov_d') and direction ('mov_angle') of each action.
    """
    mov = pd.DataFrame(index=actions.index)
    # Euclidean length of the movement vector.
    mov["mov_d"] = np.sqrt(actions.dx**2 + actions.dy**2)
    with np.errstate(divide="ignore", invalid="ignore"):
        mov["mov_angle"] = np.arctan2(actions.dy, actions.dx)
    # NOTE(review): this zeroes the angle for *every* action with dy == 0,
    # including pure backward movement (dy == 0, dx < 0) where arctan2 yields
    # pi — confirm this is intended beyond suppressing -0.0 float artefacts.
    mov.loc[actions.dy == 0, "mov_angle"] = 0  # fix float errors
    return mov
|
210 |
+
|
211 |
+
|
212 |
+
@simple
def direction(actions: "Actions") -> "Features":
    """Get the direction of the action as components of the unit vector.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The x-component ('dx') and y-component ('dy') of the unit vector
        of each action.
    """
    unit = pd.DataFrame(index=actions.index)
    length = np.sqrt(actions.dx**2 + actions.dy**2)
    # Normalise each component so only the direction (not the end location)
    # is exposed; actions without movement keep their raw value, which
    # avoids a division by zero.
    for component in ("dx", "dy"):
        unit[component] = actions[component].mask(length > 0, actions[component] / length)
    return unit
|
236 |
+
|
237 |
+
|
238 |
+
def goalscore(gamestates: "GameStates") -> "Features":
    """Get the number of goals scored by each team after the action.

    Parameters
    ----------
    gamestates : GameStates
        The gamestates of a game.

    Returns
    -------
    Features
        The number of goals scored by the team performing the last action of the
        game state ('goalscore_team'), by the opponent ('goalscore_opponent'),
        and the goal difference between both teams ('goalscore_diff').
    """
    actions = gamestates[0]
    ref_team = actions["team_id"].values[0]
    is_goal = actions.type_name == "goal"
    is_owngoal = actions["type_name"].str.contains("owngoal")

    for_ref = actions["team_id"] == ref_team
    against_ref = ~for_ref
    # A goal counts for the acting team; an owngoal counts for the opponent.
    scored_by_ref = (is_goal & for_ref) | (is_owngoal & against_ref)
    scored_by_opp = (is_goal & against_ref) | (is_owngoal & for_ref)
    # Running totals *before* each action: subtract the action's own goal.
    tally_ref = scored_by_ref.cumsum() - scored_by_ref
    tally_opp = scored_by_opp.cumsum() - scored_by_opp

    scoredf = pd.DataFrame(index=actions.index)
    scoredf["goalscore_team"] = (tally_ref * for_ref) + (tally_opp * against_ref)
    scoredf["goalscore_opponent"] = (tally_opp * for_ref) + (tally_ref * against_ref)
    scoredf["goalscore_diff"] = scoredf["goalscore_team"] - scoredf["goalscore_opponent"]
    return scoredf
|
atomic/vaep/formula.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements the formula of the Atomic-VAEP framework."""
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from pandera.typing import DataFrame, Series
|
5 |
+
|
6 |
+
from socceraction.atomic.spadl import AtomicSPADLSchema
|
7 |
+
|
8 |
+
|
9 |
+
def _prev(x: pd.Series) -> pd.Series:
    """Return *x* shifted one step, repeating the first value at position 0."""
    shifted = x.shift(1)
    # The first row has no predecessor; reuse its own value.
    shifted.iloc[:1] = x.values[0]
    return shifted
|
13 |
+
|
14 |
+
|
15 |
+
def offensive_value(
    actions: DataFrame[AtomicSPADLSchema], scores: Series[float], concedes: Series[float]
) -> Series[float]:
    r"""Compute the offensive value of each action.

    VAEP defines the *offensive value* of an action as the change in scoring
    probability before and after the action.

    .. math::

        \Delta P_{score}(a_{i}, t) = P^{k}_{score}(S_i, t) - P^{k}_{score}(S_{i-1}, t)

    where :math:`P_{score}(S_i, t)` is the probability that team :math:`t`
    which possesses the ball in state :math:`S_i` will score in the next 10
    actions.

    Parameters
    ----------
    actions : pd.DataFrame
        SPADL action.
    scores : pd.Series
        The probability of scoring from each corresponding game state.
    concedes : pd.Series
        The probability of conceding from each corresponding game state.

    Returns
    -------
    pd.Series
        The offensive value of each action.
    """
    # When possession changed hands, the previous state's scoring odds for the
    # acting team are the *conceding* odds of the previous action's team.
    sameteam = _prev(actions.team_id) == actions.team_id
    prev_scores = _prev(scores) * sameteam + _prev(concedes) * (~sameteam)

    # if the previous action was too long ago, the odds of scoring are now 0
    # toolong_idx = (
    #     abs(actions.time_seconds - _prev(actions.time_seconds)) > _samephase_nb
    # )
    # prev_scores[toolong_idx] = 0

    # if the previous action was a goal, the odds of scoring are now 0
    prevgoal_idx = _prev(actions.type_name).isin(["goal", "owngoal"])
    prev_scores[prevgoal_idx] = 0

    return scores - prev_scores
|
59 |
+
|
60 |
+
|
61 |
+
def defensive_value(
    actions: "DataFrame[AtomicSPADLSchema]", scores: "Series[float]", concedes: "Series[float]"
) -> "Series[float]":
    r"""Compute the defensive value of each action.

    VAEP defines the *defensive value* of an action as the change in conceding
    probability.

    .. math::

        \Delta P_{concede}(a_{i}, t) = P^{k}_{concede}(S_i, t) - P^{k}_{concede}(S_{i-1}, t)

    where :math:`P_{concede}(S_i, t)` is the probability that team :math:`t`
    which possesses the ball in state :math:`S_i` will concede in the next 10
    actions.

    Parameters
    ----------
    actions : pd.DataFrame
        SPADL action.
    scores : pd.Series
        The probability of scoring from each corresponding game state.
    concedes : pd.Series
        The probability of conceding from each corresponding game state.

    Returns
    -------
    pd.Series
        The defensive value of each action.
    """
    # When possession changed hands, the previous state's conceding odds for
    # the acting team are the *scoring* odds of the previous action's team.
    possession_kept = _prev(actions.team_id) == actions.team_id
    prev_concedes = _prev(concedes) * possession_kept + _prev(scores) * (~possession_kept)

    # After a goal the phase restarts, so the previous conceding odds are void.
    was_goal = _prev(actions.type_name).isin(["goal", "owngoal"])
    prev_concedes[was_goal] = 0

    return -(concedes - prev_concedes)
|
105 |
+
|
106 |
+
|
107 |
+
def value(
    actions: "DataFrame[AtomicSPADLSchema]", Pscores: "Series[float]", Pconcedes: "Series[float]"
) -> pd.DataFrame:
    r"""Compute the offensive, defensive and VAEP value of each action.

    The total VAEP value of an action is the difference between that action's
    offensive value and defensive value.

    .. math::

        V_{VAEP}(a_i) = \Delta P_{score}(a_{i}, t) - \Delta P_{concede}(a_{i}, t)

    Parameters
    ----------
    actions : pd.DataFrame
        SPADL action.
    Pscores : pd.Series
        The probability of scoring from each corresponding game state.
    Pconcedes : pd.Series
        The probability of conceding from each corresponding game state.

    Returns
    -------
    pd.DataFrame
        The 'offensive_value', 'defensive_value' and 'vaep_value' of each action.

    See Also
    --------
    :func:`~socceraction.vaep.formula.offensive_value`: The offensive value
    :func:`~socceraction.vaep.formula.defensive_value`: The defensive value
    """
    off = offensive_value(actions, Pscores, Pconcedes)
    dfn = defensive_value(actions, Pscores, Pconcedes)
    return pd.DataFrame(
        {
            "offensive_value": off,
            "defensive_value": dfn,
            "vaep_value": off + dfn,
        }
    )
|
atomic/vaep/labels.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements the label tranformers of the Atomic-VAEP framework."""
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from pandera.typing import DataFrame
|
5 |
+
|
6 |
+
import socceraction.atomic.spadl.config as atomicspadl
|
7 |
+
from socceraction.atomic.spadl import AtomicSPADLSchema
|
8 |
+
|
9 |
+
|
10 |
+
def scores(actions: "DataFrame[AtomicSPADLSchema]", nr_actions: int = 10) -> pd.DataFrame:
    """Determine whether the team possessing the ball scored a goal within the next x actions.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game.
    nr_actions : int, default=10  # noqa: DAR103
        Number of actions after the current action to consider.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column 'scores' and a row for each action set to
        True if a goal was scored by the team possessing the ball within the
        next x actions; otherwise False.
    """
    y = pd.DataFrame(
        {
            "goal": actions["type_id"] == atomicspadl.actiontypes.index("goal"),
            "owngoal": actions["type_id"] == atomicspadl.actiontypes.index("owngoal"),
            "team_id": actions["team_id"],
        }
    )

    # Look-ahead columns: shift each column up by i, padding the tail with
    # the value of the final action.
    for i in range(1, nr_actions):
        for col in ("team_id", "goal", "owngoal"):
            shifted = y[col].shift(-i)
            shifted[-i:] = y[col].iloc[len(y) - 1]
            y[f"{col}+{i}"] = shifted

    # A team "scores" if it nets a goal itself or the opponent nets an owngoal.
    res = y["goal"]
    for i in range(1, nr_actions):
        own_team_goal = y[f"goal+{i}"] & (y[f"team_id+{i}"] == y["team_id"])
        opponent_owngoal = y[f"owngoal+{i}"] & (y[f"team_id+{i}"] != y["team_id"])
        res = res | own_team_goal | opponent_owngoal

    return pd.DataFrame(res, columns=["scores"])
|
47 |
+
|
48 |
+
|
49 |
+
def concedes(actions: "DataFrame[AtomicSPADLSchema]", nr_actions: int = 10) -> pd.DataFrame:
    """Determine whether the team possessing the ball conceded a goal within the next x actions.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game.
    nr_actions : int, default=10  # noqa: DAR103
        Number of actions after the current action to consider.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column 'concedes' and a row for each action set to
        True if a goal was conceded by the team possessing the ball within the
        next x actions; otherwise False.
    """
    y = pd.DataFrame(
        {
            "goal": actions["type_id"] == atomicspadl.actiontypes.index("goal"),
            "owngoal": actions["type_id"] == atomicspadl.actiontypes.index("owngoal"),
            "team_id": actions["team_id"],
        }
    )

    # Look-ahead columns: shift each column up by i, padding the tail with
    # the value of the final action.
    for i in range(1, nr_actions):
        for col in ("team_id", "goal", "owngoal"):
            shifted = y[col].shift(-i)
            shifted[-i:] = y[col].iloc[len(y) - 1]
            y[f"{col}+{i}"] = shifted

    # A team "concedes" if the opponent scores or the team nets an owngoal.
    res = y["owngoal"]
    for i in range(1, nr_actions):
        opponent_goal = y[f"goal+{i}"] & (y[f"team_id+{i}"] != y["team_id"])
        own_owngoal = y[f"owngoal+{i}"] & (y[f"team_id+{i}"] == y["team_id"])
        res = res | opponent_goal | own_owngoal

    return pd.DataFrame(res, columns=["concedes"])
|
86 |
+
|
87 |
+
|
88 |
+
def goal_from_shot(actions: "DataFrame[AtomicSPADLSchema]") -> pd.DataFrame:
    """Determine whether a goal was scored from the current action.

    This label can be use to train an xG model.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column 'goal' and a row for each action set to
        True if a goal was scored from the current action; otherwise False.
    """
    # A shot counts as a goal when it is immediately followed by a 'goal' event.
    is_shot = actions["type_id"] == atomicspadl.actiontypes.index("shot")
    next_is_goal = actions["type_id"].shift(-1) == atomicspadl.actiontypes.index("goal")
    return pd.DataFrame((is_shot & next_is_goal).rename("goal"))
|
data/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements serializers for the event data of various providers."""
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
"opta",
|
5 |
+
"statsbomb",
|
6 |
+
"wyscout",
|
7 |
+
]
|
8 |
+
|
9 |
+
from . import opta, statsbomb, wyscout
|
data/base.py
ADDED
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Base class and utility functions for all event stream data serializers.
|
2 |
+
|
3 |
+
A serializer should extend the 'EventDataLoader' class to (down)load event
|
4 |
+
stream data.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import base64
|
8 |
+
import json
|
9 |
+
import warnings
|
10 |
+
from abc import ABC, abstractmethod
|
11 |
+
from typing import Any, Union
|
12 |
+
from urllib import request
|
13 |
+
|
14 |
+
from pandera.typing import DataFrame
|
15 |
+
|
16 |
+
# Type alias for data parsed from a JSON document (any composition of
# primitives, dicts and lists, as returned by ``json.load``).
JSONType = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
|
17 |
+
|
18 |
+
|
19 |
+
class ParseError(Exception):
    """Raised when a data file does not have the expected structure or format."""
|
21 |
+
|
22 |
+
|
23 |
+
class MissingDataError(Exception):
    """Raised when a required field is missing from the input data."""
|
25 |
+
|
26 |
+
|
27 |
+
class NoAuthWarning(UserWarning):
    """Warning raised when no user credentials are provided (public data only)."""
|
29 |
+
|
30 |
+
|
31 |
+
def _remoteloadjson(path: str) -> JSONType:
    """Load JSON data from a URL.

    Parameters
    ----------
    path : str
        URL of the data source.

    Returns
    -------
    JSONType
        A dictionary with the loaded JSON data.
    """
    # Use a context manager so the HTTP response is closed deterministically
    # instead of relying on garbage collection (the original call leaked the
    # open connection).
    with request.urlopen(path) as response:
        return json.load(response)
|
45 |
+
|
46 |
+
|
47 |
+
def _auth_remoteloadjson(user: str, passwd: str) -> None:
    """Add a Authorization header to all requests.

    Installs a global ``urllib`` opener that attaches an HTTP Basic
    Authorization header to every subsequent request.

    Parameters
    ----------
    user : str
        Username.
    passwd : str
        Password.
    """
    credentials = f"{user}:{passwd}".encode()
    token = base64.b64encode(credentials).decode()
    opener = request.build_opener()
    opener.addheaders = [("Authorization", "Basic " + token)]
    request.install_opener(opener)
|
61 |
+
|
62 |
+
|
63 |
+
def _localloadjson(path: str) -> JSONType:
    """Load a dictionary from a JSON's filepath.

    Parameters
    ----------
    path : str
        JSON's filepath.

    Returns
    -------
    JSONType
        A dictionary with the data loaded.
    """
    # Read the whole document first, then parse it.
    with open(path, encoding="utf-8") as json_file:
        payload = json_file.read()
    return json.loads(payload)
|
78 |
+
|
79 |
+
|
80 |
+
def _has_auth(creds: dict[str, str]) -> bool:
|
81 |
+
"""Check if user credentials are provided.
|
82 |
+
|
83 |
+
Parameters
|
84 |
+
----------
|
85 |
+
creds : dict
|
86 |
+
A dictionary with user credentials. It should contain "user" and
|
87 |
+
"passwd" keys.
|
88 |
+
|
89 |
+
Returns
|
90 |
+
-------
|
91 |
+
bool
|
92 |
+
True if user credentials are provided, False otherwise.
|
93 |
+
"""
|
94 |
+
if creds.get("user") in [None, ""] or creds.get("passwd") in [None, ""]:
|
95 |
+
warnings.warn("Credentials were not supplied. Public data access only.", NoAuthWarning)
|
96 |
+
return False
|
97 |
+
return True
|
98 |
+
|
99 |
+
|
100 |
+
def _expand_minute(minute: int, periods_duration: list[int]) -> int:
|
101 |
+
"""Expand a timestamp with injury time of previous periods.
|
102 |
+
|
103 |
+
Parameters
|
104 |
+
----------
|
105 |
+
minute : int
|
106 |
+
Timestamp in minutes.
|
107 |
+
periods_duration : List[int]
|
108 |
+
Total duration of each period in minutes.
|
109 |
+
|
110 |
+
Returns
|
111 |
+
-------
|
112 |
+
int
|
113 |
+
Timestamp expanded with injury time.
|
114 |
+
"""
|
115 |
+
expanded_minute = minute
|
116 |
+
periods_regular = [45, 45, 15, 15, 0]
|
117 |
+
for period in range(len(periods_duration) - 1):
|
118 |
+
if minute > sum(periods_regular[: period + 1]):
|
119 |
+
expanded_minute += periods_duration[period] - periods_regular[period]
|
120 |
+
else:
|
121 |
+
break
|
122 |
+
return expanded_minute
|
123 |
+
|
124 |
+
|
125 |
+
class EventDataLoader(ABC):
    """Load event data either from a remote location or from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    getter : str
        "remote" or "local"

    Notes
    -----
    This abstract base class defines no ``__init__`` itself; the parameters
    above describe the common constructor signature that concrete loaders
    are expected to provide.
    """

    @abstractmethod
    def competitions(self) -> DataFrame[Any]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.base.CompetitionSchema` for the schema.
        """

    @abstractmethod
    def games(self, competition_id: int, season_id: int) -> DataFrame[Any]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.base.GameSchema` for the schema.
        """

    @abstractmethod
    def teams(self, game_id: int) -> DataFrame[Any]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.base.TeamSchema` for the schema.
        """

    @abstractmethod
    def players(self, game_id: int) -> DataFrame[Any]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.base.PlayerSchema` for the schema.
        """

    @abstractmethod
    def events(self, game_id: int) -> DataFrame[Any]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.base.EventSchema` for the schema.
        """
data/opta/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for loading Opta event data."""
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
"OptaLoader",
|
5 |
+
"OptaCompetitionSchema",
|
6 |
+
"OptaGameSchema",
|
7 |
+
"OptaPlayerSchema",
|
8 |
+
"OptaTeamSchema",
|
9 |
+
"OptaEventSchema",
|
10 |
+
]
|
11 |
+
|
12 |
+
from .loader import OptaLoader
|
13 |
+
from .schema import (
|
14 |
+
OptaCompetitionSchema,
|
15 |
+
OptaEventSchema,
|
16 |
+
OptaGameSchema,
|
17 |
+
OptaPlayerSchema,
|
18 |
+
OptaTeamSchema,
|
19 |
+
)
|
data/opta/loader.py
ADDED
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements serializers for Opta data."""
|
2 |
+
|
3 |
+
import copy
|
4 |
+
import datetime
|
5 |
+
import glob
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import warnings
|
9 |
+
from collections.abc import Mapping
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Any, Optional, Union, cast
|
12 |
+
|
13 |
+
import pandas as pd # type: ignore
|
14 |
+
from pandera.typing import DataFrame
|
15 |
+
|
16 |
+
from socceraction.data.base import EventDataLoader
|
17 |
+
|
18 |
+
from .parsers import (
|
19 |
+
F1JSONParser,
|
20 |
+
F7XMLParser,
|
21 |
+
F9JSONParser,
|
22 |
+
F24JSONParser,
|
23 |
+
F24XMLParser,
|
24 |
+
MA1JSONParser,
|
25 |
+
MA3JSONParser,
|
26 |
+
OptaParser,
|
27 |
+
WhoScoredParser,
|
28 |
+
)
|
29 |
+
from .schema import (
|
30 |
+
OptaCompetitionSchema,
|
31 |
+
OptaEventSchema,
|
32 |
+
OptaGameSchema,
|
33 |
+
OptaPlayerSchema,
|
34 |
+
OptaTeamSchema,
|
35 |
+
)
|
36 |
+
|
37 |
+
# Default mapping from feed identifiers to parser classes for Opta JSON feeds.
_jsonparsers = {
    "f1": F1JSONParser,
    "f9": F9JSONParser,
    "f24": F24JSONParser,
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Default mapping for classic Opta XML feeds.
_xmlparsers = {
    "f7": F7XMLParser,
    "f24": F24XMLParser,
}

# Default mapping for Stats Perform JSON feeds.
_statsperformparsers = {
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Default mapping for JSON data scraped from whoscored.com.
_whoscoredparsers = {
    "whoscored": WhoScoredParser,
}
|
58 |
+
|
59 |
+
_eventtypesdf = pd.DataFrame(
|
60 |
+
[
|
61 |
+
(1, "pass"),
|
62 |
+
(2, "offside pass"),
|
63 |
+
(3, "take on"),
|
64 |
+
(4, "foul"),
|
65 |
+
(5, "out"),
|
66 |
+
(6, "corner awarded"),
|
67 |
+
(7, "tackle"),
|
68 |
+
(8, "interception"),
|
69 |
+
(9, "turnover"),
|
70 |
+
(10, "save"),
|
71 |
+
(11, "claim"),
|
72 |
+
(12, "clearance"),
|
73 |
+
(13, "miss"),
|
74 |
+
(14, "post"),
|
75 |
+
(15, "attempt saved"),
|
76 |
+
(16, "goal"),
|
77 |
+
(17, "card"),
|
78 |
+
(18, "player off"),
|
79 |
+
(19, "player on"),
|
80 |
+
(20, "player retired"),
|
81 |
+
(21, "player returns"),
|
82 |
+
(22, "player becomes goalkeeper"),
|
83 |
+
(23, "goalkeeper becomes player"),
|
84 |
+
(24, "condition change"),
|
85 |
+
(25, "official change"),
|
86 |
+
(26, "unknown26"),
|
87 |
+
(27, "start delay"),
|
88 |
+
(28, "end delay"),
|
89 |
+
(29, "unknown29"),
|
90 |
+
(30, "end"),
|
91 |
+
(31, "unknown31"),
|
92 |
+
(32, "start"),
|
93 |
+
(33, "unknown33"),
|
94 |
+
(34, "team set up"),
|
95 |
+
(35, "player changed position"),
|
96 |
+
(36, "player changed jersey number"),
|
97 |
+
(37, "collection end"),
|
98 |
+
(38, "temp_goal"),
|
99 |
+
(39, "temp_attempt"),
|
100 |
+
(40, "formation change"),
|
101 |
+
(41, "punch"),
|
102 |
+
(42, "good skill"),
|
103 |
+
(43, "deleted event"),
|
104 |
+
(44, "aerial"),
|
105 |
+
(45, "challenge"),
|
106 |
+
(46, "unknown46"),
|
107 |
+
(47, "rescinded card"),
|
108 |
+
(48, "unknown46"),
|
109 |
+
(49, "ball recovery"),
|
110 |
+
(50, "dispossessed"),
|
111 |
+
(51, "error"),
|
112 |
+
(52, "keeper pick-up"),
|
113 |
+
(53, "cross not claimed"),
|
114 |
+
(54, "smother"),
|
115 |
+
(55, "offside provoked"),
|
116 |
+
(56, "shield ball opp"),
|
117 |
+
(57, "foul throw in"),
|
118 |
+
(58, "penalty faced"),
|
119 |
+
(59, "keeper sweeper"),
|
120 |
+
(60, "chance missed"),
|
121 |
+
(61, "ball touch"),
|
122 |
+
(62, "unknown62"),
|
123 |
+
(63, "temp_save"),
|
124 |
+
(64, "resume"),
|
125 |
+
(65, "contentious referee decision"),
|
126 |
+
(66, "possession data"),
|
127 |
+
(67, "50/50"),
|
128 |
+
(68, "referee drop ball"),
|
129 |
+
(69, "failed to block"),
|
130 |
+
(70, "injury time announcement"),
|
131 |
+
(71, "coach setup"),
|
132 |
+
(72, "caught offside"),
|
133 |
+
(73, "other ball contact"),
|
134 |
+
(74, "blocked pass"),
|
135 |
+
(75, "delayed start"),
|
136 |
+
(76, "early end"),
|
137 |
+
(77, "player off pitch"),
|
138 |
+
(78, "temp card"),
|
139 |
+
(79, "coverage interruption"),
|
140 |
+
(80, "drop of ball"),
|
141 |
+
(81, "obstacle"),
|
142 |
+
(83, "attempted tackle"),
|
143 |
+
(84, "deleted after review"),
|
144 |
+
(10000, "offside given"), # Seems specific to WhoScored
|
145 |
+
],
|
146 |
+
columns=["type_id", "type_name"],
|
147 |
+
)
|
148 |
+
|
149 |
+
|
150 |
+
def _deepupdate(target: dict[Any, Any], src: dict[Any, Any]) -> None:
|
151 |
+
"""Deep update target dict with src.
|
152 |
+
|
153 |
+
For each k,v in src: if k doesn't exist in target, it is deep copied from
|
154 |
+
src to target. Otherwise, if v is a list, target[k] is extended with
|
155 |
+
src[k]. If v is a set, target[k] is updated with v, If v is a dict,
|
156 |
+
recursively deep-update it.
|
157 |
+
|
158 |
+
Parameters
|
159 |
+
----------
|
160 |
+
target: dict
|
161 |
+
The original dictionary which is updated.
|
162 |
+
src: dict
|
163 |
+
The dictionary with which `target` is updated.
|
164 |
+
|
165 |
+
Examples
|
166 |
+
--------
|
167 |
+
>>> t = {'name': 'ferry', 'hobbies': ['programming', 'sci-fi']}
|
168 |
+
>>> deepupdate(t, {'hobbies': ['gaming']})
|
169 |
+
>>> print(t)
|
170 |
+
{'name': 'ferry', 'hobbies': ['programming', 'sci-fi', 'gaming']}
|
171 |
+
"""
|
172 |
+
for k, v in src.items():
|
173 |
+
if isinstance(v, list):
|
174 |
+
if k not in target:
|
175 |
+
target[k] = copy.deepcopy(v)
|
176 |
+
else:
|
177 |
+
target[k].extend(v)
|
178 |
+
elif isinstance(v, dict):
|
179 |
+
if k not in target:
|
180 |
+
target[k] = copy.deepcopy(v)
|
181 |
+
else:
|
182 |
+
_deepupdate(target[k], v)
|
183 |
+
elif isinstance(v, set):
|
184 |
+
if k not in target:
|
185 |
+
target[k] = v.copy()
|
186 |
+
else:
|
187 |
+
target[k].update(v.copy())
|
188 |
+
else:
|
189 |
+
target[k] = copy.copy(v)
|
190 |
+
|
191 |
+
|
192 |
+
def _extract_ids_from_path(path: str, pattern: str) -> dict[str, Union[str, int]]:
|
193 |
+
regex = re.compile(
|
194 |
+
".+?"
|
195 |
+
+ re.escape(pattern)
|
196 |
+
.replace(r"\{competition_id\}", r"(?P<competition_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)")
|
197 |
+
.replace(r"\{season_id\}", r"(?P<season_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)")
|
198 |
+
.replace(r"\{game_id\}", r"(?P<game_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)")
|
199 |
+
)
|
200 |
+
m = re.match(regex, path)
|
201 |
+
if m is None:
|
202 |
+
raise ValueError(f"The filepath {path} does not match the format {pattern}.")
|
203 |
+
ids = m.groupdict()
|
204 |
+
return {k: int(v) if v.isdigit() else v for k, v in ids.items()}
|
205 |
+
|
206 |
+
|
207 |
+
class OptaLoader(EventDataLoader):
    """Load Opta data feeds from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    parser : str or dict
        Either 'xml', 'json', 'statsperform', 'whoscored' or a dict with
        a custom parser for each feed. The default xml parser supports F7 and
        F24 feeds; the default json parser supports F1, F9 and F24 feeds, the
        StatsPerform parser supports MA1 and MA3 feeds. Custom parsers can be
        specified as::

            {
                'feed1_name': Feed1Parser
                'feed2_name': Feed2Parser
            }

        where Feed1Parser and Feed2Parser are classes implementing
        :class:`~socceraction.spadl.opta.OptaParser` and 'feed1_name' and
        'feed2_name' are a unique ID for each feed that matches to the keys in
        `feeds`.
    feeds : dict
        Glob pattern describing from which files the data from a specific game
        can be retrieved. For example, if files are named::

            f7-1-2021-17362.xml
            f24-1-2021-17362.xml

        use::

            feeds = {
                'f7': "f7-{competition_id}-{season_id}-{game_id}.xml",
                'f24': "f24-{competition_id}-{season_id}-{game_id}.xml"
            }

    Raises
    ------
    ValueError
        If an invalid parser is provided.
    """

    def __init__(  # noqa: C901
        self,
        root: str,
        parser: Union[str, Mapping[str, type[OptaParser]]] = "xml",
        feeds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root
        # Each named parser preset comes with a default set of feed patterns;
        # an explicit `feeds` argument always overrides the defaults.
        if parser == "json":
            if feeds is None:
                feeds = {
                    "f1": "f1-{competition_id}-{season_id}.json",
                    "f9": "f9-{competition_id}-{season_id}-{game_id}.json",
                    "f24": "f24-{competition_id}-{season_id}-{game_id}.json",
                }
            self.parsers = self._get_parsers_for_feeds(_jsonparsers, feeds)
        elif parser == "xml":
            if feeds is None:
                feeds = {
                    "f7": "f7-{competition_id}-{season_id}-{game_id}.xml",
                    "f24": "f24-{competition_id}-{season_id}-{game_id}.xml",
                }
            self.parsers = self._get_parsers_for_feeds(_xmlparsers, feeds)
        elif parser == "statsperform":
            if feeds is None:
                feeds = {
                    "ma1": "ma1-{competition_id}-{season_id}.json",
                    "ma3": "ma3-{competition_id}-{season_id}-{game_id}.json",
                }
            self.parsers = self._get_parsers_for_feeds(_statsperformparsers, feeds)
        elif parser == "whoscored":
            if feeds is None:
                feeds = {
                    "whoscored": "{competition_id}-{season_id}-{game_id}.json",
                }
            self.parsers = self._get_parsers_for_feeds(_whoscoredparsers, feeds)
        elif isinstance(parser, dict):
            if feeds is None:
                raise ValueError("You must specify a feed for each parser.")
            self.parsers = self._get_parsers_for_feeds(parser, feeds)
        else:
            raise ValueError("Invalid parser provided.")
        # Normalize the feed patterns to native path separators.
        self.feeds = {k: str(Path(v)) for k, v in feeds.items()}

    def _get_parsers_for_feeds(
        self, available_parsers: Mapping[str, type[OptaParser]], feeds: dict[str, str]
    ) -> Mapping[str, type[OptaParser]]:
        """Select the appropriate parser for each feed.

        Parameters
        ----------
        available_parsers : dict(str, OptaParser)
            Dictionary with all available parsers.
        feeds : dict(str, str)
            All feeds that should be parsed.

        Returns
        -------
        dict(str, OptaParser)
            A mapping between all feeds that should be parsed and the
            corresponding parser class.

        Warns
        -----
        Raises a warning if there is no parser available for any of the
        provided feeds.
        """
        parsers = {}
        for feed in feeds:
            if feed in available_parsers:
                parsers[feed] = available_parsers[feed]
            else:
                warnings.warn(f"No parser available for {feed} feeds. This feed is ignored.")
        return parsers

    def competitions(self) -> DataFrame[OptaCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.opta.OptaCompetitionSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        loaded_seasons = set()
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(competition_id="*", season_id="*", game_id="*")
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                # For efficiency, we only parse one game for each season. This
                # only works if both the competition and season are part of
                # the file name.
                competition_id = ids.get("competition_id")
                season_id = ids.get("season_id")
                if competition_id is not None and season_id is not None:
                    if (competition_id, season_id) in loaded_seasons:
                        continue
                    else:
                        loaded_seasons.add((competition_id, season_id))
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, parser.extract_competitions())
        return cast(DataFrame[OptaCompetitionSchema], pd.DataFrame(list(data.values())))

    def games(self, competition_id: int, season_id: int) -> DataFrame[OptaGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.opta.OptaGameSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(
                competition_id=competition_id, season_id=season_id, game_id="*"
            )
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, parser.extract_games())
        return cast(DataFrame[OptaGameSchema], pd.DataFrame(list(data.values())))

    def teams(self, game_id: int) -> DataFrame[OptaTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.opta.OptaTeamSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(competition_id="*", season_id="*", game_id=game_id)
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, parser.extract_teams())
        return cast(DataFrame[OptaTeamSchema], pd.DataFrame(list(data.values())))

    def players(self, game_id: int) -> DataFrame[OptaPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.opta.OptaPlayerSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(competition_id="*", season_id="*", game_id=game_id)
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, parser.extract_players())
        df_players = pd.DataFrame(list(data.values()))
        df_players["game_id"] = game_id
        return cast(DataFrame[OptaPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[OptaEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.opta.OptaEventSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(competition_id="*", season_id="*", game_id=game_id)
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, parser.extract_events())
        events = (
            pd.DataFrame(list(data.values()))
            .merge(_eventtypesdf, on="type_id", how="left")
            .sort_values(
                ["game_id", "period_id", "minute", "second", "timestamp"], kind="mergesort"
            )
            .reset_index(drop=True)
        )

        # Pre-match events sometimes have -3, -2 and -1 seconds; clamp them to
        # zero and re-sort, since clamping can change the relative order.
        events.loc[events.second < 0, "second"] = 0
        events = events.sort_values(
            ["game_id", "period_id", "minute", "second", "timestamp"], kind="mergesort"
        )

        # Deleted events (type 43) have an invalid datetime which would cause
        # an OutOfBoundsDatetime error; drop them and any event with an
        # implausible timestamp.
        events = events[events.type_id != 43]
        events = events[
            ~(
                (events.timestamp < datetime.datetime(1900, 1, 1))
                | (events.timestamp > datetime.datetime(2100, 1, 1))
            )
        ]

        return cast(DataFrame[OptaEventSchema], events)
|
data/opta/parsers/__init__.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Parsers for Opta(-derived) event streams."""
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
"OptaParser",
|
5 |
+
"F1JSONParser",
|
6 |
+
"F9JSONParser",
|
7 |
+
"F24JSONParser",
|
8 |
+
"F7XMLParser",
|
9 |
+
"F24XMLParser",
|
10 |
+
"MA1JSONParser",
|
11 |
+
"MA3JSONParser",
|
12 |
+
"WhoScoredParser",
|
13 |
+
]
|
14 |
+
|
15 |
+
from .base import OptaParser
|
16 |
+
from .f1_json import F1JSONParser
|
17 |
+
from .f7_xml import F7XMLParser
|
18 |
+
from .f9_json import F9JSONParser
|
19 |
+
from .f24_json import F24JSONParser
|
20 |
+
from .f24_xml import F24XMLParser
|
21 |
+
from .ma1_json import MA1JSONParser
|
22 |
+
from .ma3_json import MA3JSONParser
|
23 |
+
from .whoscored import WhoScoredParser
|
data/opta/parsers/base.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Base class for all Opta(-derived) event stream parsers.
|
2 |
+
|
3 |
+
A parser reads a single data file and should extend the 'OptaParser' class to
|
4 |
+
extract data about competitions, games, players, teams and events that is
|
5 |
+
encoded in the file.
|
6 |
+
|
7 |
+
"""
|
8 |
+
|
9 |
+
import json # type: ignore
|
10 |
+
from typing import Any, Optional
|
11 |
+
|
12 |
+
from lxml import objectify
|
13 |
+
|
14 |
+
|
15 |
+
class OptaParser:
    """Extract data from an Opta data stream.

    Each ``extract_*`` method returns an empty dict by default; subclasses
    override the methods for the entity types their feed contains.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def __init__(self, path: str, **kwargs: Any) -> None:  # noqa: ANN401
        # Abstract: concrete parsers must implement their own file loading.
        raise NotImplementedError

    def extract_competitions(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competion ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        return {}

    def extract_games(self) -> dict[Any, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        return {}

    def extract_teams(self) -> dict[Any, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        return {}

    def extract_players(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        return {}

    def extract_lineups(self) -> dict[Any, dict[str, Any]]:
        """Return a dictionary with the lineup of each team.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team's lineup in the data stream.
        """
        return {}

    def extract_events(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        return {}
|
92 |
+
|
93 |
+
|
94 |
+
class OptaJSONParser(OptaParser):
    """Extract data from an Opta JSON data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def __init__(self, path: str, **kwargs: Any) -> None:  # noqa: ANN401
        with open(path, encoding="utf-8") as fh:
            # Parsed JSON document; subclasses traverse this tree to extract
            # competitions, games, teams, players and events.
            self.root = json.load(fh)
|
106 |
+
|
107 |
+
|
108 |
+
class OptaXMLParser(OptaParser):
    """Extract data from an Opta XML data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def __init__(self, path: str, **kwargs: Any) -> None:  # noqa: ANN401
        with open(path, "rb") as fh:
            # Root element of the XML document (lxml objectified tree);
            # subclasses traverse it to extract entities.
            self.root = objectify.fromstring(fh.read())
|
120 |
+
|
121 |
+
|
122 |
+
def assertget(dictionary: dict[str, Any], key: str) -> Any:  # noqa: ANN401
    """Return the value of the item with the specified key.

    In contrast to the default `get` method, this version will raise an
    assertion error if the given key is not present in the dict (or maps
    to ``None``).

    Parameters
    ----------
    dictionary : dict
        A Python dictionary.
    key : str
        A key in the dictionary.

    Returns
    -------
    Any
        Returns the value for the specified key if the key is in the dictionary.

    Raises
    ------
    AssertionError
        If the given key could not be found in the dictionary.
    """
    value = dictionary.get(key)
    # Raise explicitly instead of using the `assert` statement so the check
    # is not stripped when Python runs with optimizations enabled (-O).
    if value is None:
        raise AssertionError("KeyError: " + key + " not found in " + str(dictionary))
    return value
|
148 |
+
|
149 |
+
|
150 |
+
def _get_end_x(qualifiers: dict[int, Any]) -> Optional[float]:
|
151 |
+
try:
|
152 |
+
# pass
|
153 |
+
if 140 in qualifiers:
|
154 |
+
return float(qualifiers[140])
|
155 |
+
# blocked shot
|
156 |
+
if 146 in qualifiers:
|
157 |
+
return float(qualifiers[146])
|
158 |
+
# passed the goal line
|
159 |
+
if 102 in qualifiers:
|
160 |
+
return float(100)
|
161 |
+
return None
|
162 |
+
except ValueError:
|
163 |
+
return None
|
164 |
+
|
165 |
+
|
166 |
+
def _get_end_y(qualifiers: dict[int, Any]) -> Optional[float]:
|
167 |
+
try:
|
168 |
+
# pass
|
169 |
+
if 141 in qualifiers:
|
170 |
+
return float(qualifiers[141])
|
171 |
+
# blocked shot
|
172 |
+
if 147 in qualifiers:
|
173 |
+
return float(qualifiers[147])
|
174 |
+
# passed the goal line
|
175 |
+
if 102 in qualifiers:
|
176 |
+
return float(qualifiers[102])
|
177 |
+
return None
|
178 |
+
except ValueError:
|
179 |
+
return None
|
data/opta/parsers/f1_json.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""JSON parser for Opta F1 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any
|
5 |
+
|
6 |
+
from ...base import MissingDataError
|
7 |
+
from .base import OptaJSONParser, assertget
|
8 |
+
|
9 |
+
|
10 |
+
class F1JSONParser(OptaJSONParser):
    """Extract data from a Opta F1 data stream.

    The F1 feed contains the match schedule (fixtures and results) of a
    single competition and season.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_feed(self) -> dict[str, Any]:
        """Return the first node of the stream that contains an OptaFeed.

        Raises
        ------
        MissingDataError
            If no node in the stream contains an OptaFeed.
        """
        for node in self.root:
            if "OptaFeed" in node["data"].keys():
                return node
        raise MissingDataError

    def _get_doc(self) -> dict[str, Any]:
        """Return the OptaDocument element of the feed."""
        f1 = self._get_feed()
        data = assertget(f1, "data")
        optafeed = assertget(data, "OptaFeed")
        optadocument = assertget(optafeed, "OptaDocument")
        return optadocument

    def extract_competitions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        optadocument = self._get_doc()
        attr = assertget(optadocument, "@attributes")
        competition_id = int(assertget(attr, "competition_id"))
        season_id = int(assertget(attr, "season_id"))
        competition = {
            # Fields required by the base schema
            "season_id": season_id,
            # NOTE(review): the F1 feed has no explicit season name; the
            # season ID doubles as the name — confirm against the schema.
            "season_name": str(assertget(attr, "season_id")),
            "competition_id": competition_id,
            "competition_name": assertget(attr, "competition_name"),
        }
        return {(competition_id, season_id): competition}

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        attr = assertget(optadocument, "@attributes")
        matchdata = assertget(optadocument, "MatchData")
        matches = {}
        for match in matchdata:
            matchattr = assertget(match, "@attributes")
            matchinfo = assertget(match, "MatchInfo")
            matchinfoattr = assertget(matchinfo, "@attributes")
            # uIDs are prefixed with a type letter (e.g. "g1234"); strip it.
            game_id = int(assertget(matchattr, "uID")[1:])
            matches[game_id] = {
                # Fields required by the base schema
                "game_id": game_id,
                "competition_id": int(assertget(attr, "competition_id")),
                "season_id": int(assertget(attr, "season_id")),
                "game_day": int(assertget(matchinfoattr, "MatchDay")),
                "game_date": datetime.strptime(assertget(matchinfo, "Date"), "%Y-%m-%d %H:%M:%S"),
                # home_team_id=see below,
                # away_team_id=see below,
                # Optional fields
                # home_score=see below,
                # away_score=see below,
                # duration=?
                # referee=?
                # venue=?,
                # attendance=?
                # home_manager=?
                # away_manager=?
            }
            # Team IDs and final scores are stored per side on TeamData nodes.
            teamdata = assertget(match, "TeamData")
            for team in teamdata:
                teamattr = assertget(team, "@attributes")
                side = assertget(teamattr, "Side")
                teamid = assertget(teamattr, "TeamRef")
                score = assertget(teamattr, "Score")
                if side == "Home":
                    matches[game_id]["home_team_id"] = int(teamid[1:])
                    matches[game_id]["home_score"] = int(score)
                else:
                    matches[game_id]["away_team_id"] = int(teamid[1:])
                    matches[game_id]["away_score"] = int(score)
        return matches
|
data/opta/parsers/f24_json.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""JSON parser for Opta F24 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any
|
5 |
+
|
6 |
+
from ...base import MissingDataError
|
7 |
+
from .base import OptaJSONParser, _get_end_x, _get_end_y, assertget
|
8 |
+
|
9 |
+
|
10 |
+
class F24JSONParser(OptaJSONParser):
    """Extract data from a Opta F24 data stream.

    The F24 feed contains all events of a single game.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_doc(self) -> dict[str, Any]:
        """Return the first node of the stream that contains a Games element.

        Raises
        ------
        MissingDataError
            If no node in the stream contains any games.
        """
        for node in self.root:
            if "Games" in node["data"].keys():
                return node
        raise MissingDataError

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        f24 = self._get_doc()

        data = assertget(f24, "data")
        games = assertget(data, "Games")
        game = assertget(games, "Game")
        attr = assertget(game, "@attributes")

        game_id = int(assertget(attr, "id"))
        game_dict = {
            game_id: {
                # Fields required by the base schema
                "game_id": game_id,
                "season_id": int(assertget(attr, "season_id")),
                "competition_id": int(assertget(attr, "competition_id")),
                "game_day": int(assertget(attr, "matchday")),
                # The kick-off time is a localized ISO-8601 string; the
                # timezone is dropped to obtain a naive datetime.
                "game_date": datetime.strptime(
                    assertget(assertget(attr, "game_date"), "locale"), "%Y-%m-%dT%H:%M:%S.%fZ"
                ).replace(tzinfo=None),
                "home_team_id": int(assertget(attr, "home_team_id")),
                "away_team_id": int(assertget(attr, "away_team_id")),
                # Fields required by the opta schema
                # home_score=?
                # away_score=?
                # duration=?
                # referee=?
                # venue=?,
                # attendance=?
                # Optional fields
                # home_manager=?
                # away_manager=?
            }
        }
        return game_dict

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        f24 = self._get_doc()

        data = assertget(f24, "data")
        games = assertget(data, "Games")
        game = assertget(games, "Game")
        game_attr = assertget(game, "@attributes")
        game_id = int(assertget(game_attr, "id"))

        events = {}
        for element in assertget(game, "Event"):
            attr = element["@attributes"]
            raw_timestamp = attr["TimeStamp"].get("locale") if attr.get("TimeStamp") else None
            # BUG FIX: the original code called strptime unconditionally,
            # raising a TypeError for events without a TimeStamp attribute.
            # Such events now get a None timestamp instead of crashing.
            timestamp = (
                datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
                if raw_timestamp is not None
                else None
            )
            qualifiers = {
                int(q["@attributes"]["qualifier_id"]): q["@attributes"]["value"]
                for q in element.get("Q", [])
            }
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            # End location falls back to the start location below when the
            # qualifiers do not encode an explicit end point.
            end_x = _get_end_x(qualifiers)
            end_y = _get_end_y(qualifiers)

            event_id = int(assertget(attr, "id"))
            events[(game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(attr, "period_id")),
                "team_id": int(assertget(attr, "team_id")),
                "player_id": int(assertget(attr, "player_id")),
                "type_id": int(assertget(attr, "type_id")),
                # type_name=?, # added in the opta loader
                # Fields required by the opta schema
                "timestamp": timestamp,
                "minute": int(assertget(attr, "min")),
                "second": int(assertget(attr, "sec")),
                "outcome": bool(int(attr.get("outcome", 1))),
                "start_x": start_x,
                "start_y": start_y,
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(attr.get("assist", 0))),
                "keypass": bool(int(attr.get("keypass", 0))),
            }
        return events
|
data/opta/parsers/f24_xml.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""XML parser for Opta F24 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any
|
5 |
+
|
6 |
+
from lxml import objectify
|
7 |
+
|
8 |
+
from .base import OptaXMLParser, _get_end_x, _get_end_y, assertget
|
9 |
+
|
10 |
+
|
11 |
+
class F24XMLParser(OptaXMLParser):
    """Extract data from a Opta F24 data stream.

    The F24 feed contains all events of a single game.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_doc(self) -> objectify.ObjectifiedElement:
        """Return the root element of the parsed XML document."""
        return self.root

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        game_elem = optadocument.find("Game")
        attr = game_elem.attrib
        game_id = int(assertget(attr, "id"))
        game_dict = {
            # Fields required by the base schema
            "game_id": game_id,
            "season_id": int(assertget(attr, "season_id")),
            "competition_id": int(assertget(attr, "competition_id")),
            "game_day": int(assertget(attr, "matchday")),
            "game_date": datetime.strptime(assertget(attr, "game_date"), "%Y-%m-%dT%H:%M:%S"),
            "home_team_id": int(assertget(attr, "home_team_id")),
            "away_team_id": int(assertget(attr, "away_team_id")),
            # Optional fields
            "home_score": int(assertget(attr, "home_score")),
            "away_score": int(assertget(attr, "away_score")),
            # duration=?
            # referee=?
            # venue=?
            # attendance=?
            # home_manager=?
            # away_manager=?
        }
        return {game_id: game_dict}

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        optadocument = self._get_doc()
        game_elm = optadocument.find("Game")
        game_id = int(assertget(game_elm.attrib, "id"))
        events = {}
        for event_elm in game_elm.iterchildren("Event"):
            attr = dict(event_elm.attrib)
            event_id = int(assertget(attr, "id"))

            # Q child elements carry qualifier_id/value pairs; value may be
            # absent for boolean-style qualifiers (then mapped to None).
            qualifiers = {
                int(qualifier_elm.attrib["qualifier_id"]): qualifier_elm.attrib.get("value")
                for qualifier_elm in event_elm.iterchildren("Q")
            }
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            end_x = _get_end_x(qualifiers)
            end_y = _get_end_y(qualifiers)

            events[(game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(attr, "period_id")),
                "team_id": int(assertget(attr, "team_id")),
                # Not every event (e.g. team-level events) has a player.
                "player_id": int(attr["player_id"]) if "player_id" in attr else None,
                "type_id": int(assertget(attr, "type_id")),
                # type_name=?, # added in the opta loader
                # Fields required by the opta schema
                "timestamp": datetime.strptime(
                    assertget(attr, "timestamp"), "%Y-%m-%dT%H:%M:%S.%f"
                ),
                "minute": int(assertget(attr, "min")),
                "second": int(assertget(attr, "sec")),
                "outcome": bool(int(attr["outcome"])) if "outcome" in attr else None,
                "start_x": start_x,
                "start_y": start_y,
                # Fall back to the start location when the qualifiers do not
                # encode an explicit end location.
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(attr.get("assist", 0))),
                "keypass": bool(int(attr.get("keypass", 0))),
            }
        return events
|
data/opta/parsers/f7_xml.py
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""XML parser for Opta F7 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any
|
5 |
+
|
6 |
+
from lxml import objectify
|
7 |
+
|
8 |
+
from .base import OptaXMLParser, assertget
|
9 |
+
|
10 |
+
|
11 |
+
class F7XMLParser(OptaXMLParser):
    """Extract data from a Opta F7 data stream.

    The F7 feed contains match info, lineups and match stats of a single game.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_doc(self) -> objectify.ObjectifiedElement:
        """Return the SoccerDocument element of the feed."""
        optadocument = self.root.find("SoccerDocument")
        return optadocument

    def _get_stats(self, obj: objectify.ObjectifiedElement) -> dict[str, Any]:
        """Collect the Stat children of *obj* as a Type -> text mapping."""
        stats = {}
        for stat in obj.find("Stat"):
            stats[stat.attrib["Type"]] = stat.text
        return stats

    def _get_name(self, obj: objectify.ObjectifiedElement) -> str:
        """Return a person's display name, preferring the "Known" name.

        NOTE(review): with lxml.objectify the returned values are
        ObjectifiedElement string proxies rather than plain str — callers
        appear to rely on their str-like behavior; confirm.
        """
        if "Known" in obj:
            return obj.Known
        return obj.First + " " + obj.Last

    def extract_competitions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        optadocument = self._get_doc()
        competition = optadocument.Competition
        # uIDs are prefixed with a type letter (e.g. "c8"); strip it.
        competition_id = int(competition.attrib["uID"][1:])
        stats = self._get_stats(competition)
        season_id = int(assertget(stats, "season_id"))
        competition_dict = {
            # Fields required by the base schema
            "competition_id": competition_id,
            "season_id": season_id,
            "season_name": assertget(stats, "season_name"),
            "competition_name": competition.Name.text,
        }
        return {(competition_id, season_id): competition_dict}

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        competition = optadocument.Competition
        competition_id = int(competition.attrib["uID"][1:])
        competition_stats = self._get_stats(competition)
        match_info = optadocument.MatchData.MatchInfo
        game_id = int(optadocument.attrib["uID"][1:])
        stats = self._get_stats(optadocument.MatchData)
        # Index the two TeamData elements by their "Home"/"Away" side.
        team_data_elms = {
            t.attrib["Side"]: t for t in optadocument.MatchData.iterchildren("TeamData")
        }
        # Find each side's manager by matching Team uIDs against TeamRefs.
        team_officials = {}
        for t in optadocument.iterchildren("Team"):
            side = (
                "Home"
                if int(team_data_elms["Home"].attrib["TeamRef"][1:]) == int(t.attrib["uID"][1:])
                else "Away"
            )
            for m in t.iterchildren("TeamOfficial"):
                if m.attrib["Type"] == "Manager":
                    team_officials[side] = m

        game_dict = {
            # Fields required by the base schema
            "game_id": game_id,
            "season_id": int(assertget(competition_stats, "season_id")),
            "competition_id": competition_id,
            "game_day": int(competition_stats["matchday"])
            if "matchday" in competition_stats
            else None,
            # Kick-off time includes a UTC offset; drop it for a naive datetime.
            "game_date": datetime.strptime(match_info.Date.text, "%Y%m%dT%H%M%S%z").replace(
                tzinfo=None
            ),
            "home_team_id": int(
                assertget(assertget(team_data_elms, "Home").attrib, "TeamRef")[1:]
            ),
            "away_team_id": int(
                assertget(assertget(team_data_elms, "Away").attrib, "TeamRef")[1:]
            ),
            # Optional fields
            "home_score": int(assertget(assertget(team_data_elms, "Home").attrib, "Score")),
            "away_score": int(assertget(assertget(team_data_elms, "Away").attrib, "Score")),
            "duration": int(stats["match_time"]),
            "referee": self._get_name(optadocument.MatchData.MatchOfficial.OfficialName),
            "venue": optadocument.Venue.Name.text,
            "attendance": int(match_info.Attendance),
            "home_manager": self._get_name(team_officials["Home"].PersonName)
            if "Home" in team_officials
            else None,
            "away_manager": self._get_name(team_officials["Away"].PersonName)
            if "Away" in team_officials
            else None,
        }
        return {game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        optadocument = self._get_doc()
        team_elms = list(optadocument.iterchildren("Team"))
        teams = {}
        for team_elm in team_elms:
            team_id = int(assertget(team_elm.attrib, "uID")[1:])
            teams[team_id] = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": team_elm.Name.text,
            }
        return teams

    def extract_lineups(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with the lineup of each team.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team's lineup in the data stream.
        """
        optadocument = self._get_doc()

        stats = {}
        for stat in optadocument.MatchData.find("Stat"):
            stats[stat.attrib["Type"]] = stat.text

        lineup_elms = optadocument.MatchData.iterchildren("TeamData")
        lineups = {}
        for team_elm in lineup_elms:
            # lineup attributes
            team_id = int(team_elm.attrib["TeamRef"][1:])
            lineups[team_id] = {
                "formation": team_elm.attrib["Formation"],
                "score": int(team_elm.attrib["Score"]),
                "side": team_elm.attrib["Side"],
                "players": {},
            }
            # substitutes
            subst_elms = team_elm.iterchildren("Substitution")
            subst = [subst_elm.attrib for subst_elm in subst_elms]
            # red_cards: minute of the dismissal per sent-off player
            booking_elms = team_elm.iterchildren("Booking")
            red_cards = {
                int(booking_elm.attrib["PlayerRef"][1:]): int(booking_elm.attrib["Min"])
                for booking_elm in booking_elms
                if "CardType" in booking_elm.attrib
                and booking_elm.attrib["CardType"] in ["Red", "SecondYellow"]
                and "PlayerRef" in booking_elm.attrib  # not defined if a coach receives a red card
            }
            # players
            player_elms = team_elm.PlayerLineUp.iterchildren("MatchPlayer")
            for player_elm in player_elms:
                player_id = int(player_elm.attrib["PlayerRef"][1:])
                # Minute the player came on: his substitution minute if he was
                # subbed on, full time if he was an unused sub, 0 if he started.
                # "Retired" substitutions (injured player, no replacement) are
                # ignored on purpose.
                sub_on = int(
                    next(
                        (
                            item["Time"]
                            for item in subst
                            if "Retired" not in item and item["SubOn"] == f"p{player_id}"
                        ),
                        stats["match_time"] if player_elm.attrib["Status"] == "Sub" else 0,
                    )
                )
                # Minute the player went off: his substitution minute, the
                # minute of his red card, or full time.
                sub_off = int(
                    next(
                        (item["Time"] for item in subst if item["SubOff"] == f"p{player_id}"),
                        stats["match_time"]
                        if player_id not in red_cards
                        else red_cards[player_id],
                    )
                )
                minutes_played = sub_off - sub_on
                lineups[team_id]["players"][player_id] = {
                    # Formation_Place 0 denotes a substitute.
                    "starting_position_id": int(player_elm.attrib["Formation_Place"]),
                    "starting_position_name": player_elm.attrib["Position"],
                    "jersey_number": int(player_elm.attrib["ShirtNumber"]),
                    "is_starter": int(player_elm.attrib["Formation_Place"]) != 0,
                    "minutes_played": minutes_played,
                }
        return lineups

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        optadocument = self._get_doc()
        game_id = int(optadocument.attrib["uID"][1:])
        lineups = self.extract_lineups()
        team_elms = list(optadocument.iterchildren("Team"))
        players = {}
        for team_elm in team_elms:
            team_id = int(team_elm.attrib["uID"][1:])
            for player_elm in team_elm.iterchildren("Player"):
                player_id = int(player_elm.attrib["uID"][1:])
                player = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": self._get_name(player_elm.PersonName),
                    "is_starter": lineups[team_id]["players"][player_id]["is_starter"],
                    "minutes_played": lineups[team_id]["players"][player_id]["minutes_played"],
                    "jersey_number": lineups[team_id]["players"][player_id]["jersey_number"],
                    # Fields required by the opta schema
                    "starting_position": lineups[team_id]["players"][player_id][
                        "starting_position_name"
                    ],
                    # Optional fields
                    # height="?",
                    # weight="?",
                    # age="?",
                }
                players[(game_id, player_id)] = player

        return players
|
data/opta/parsers/f9_json.py
ADDED
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""JSON parser for Opta F9 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any, Optional
|
5 |
+
|
6 |
+
from ...base import MissingDataError
|
7 |
+
from .base import OptaJSONParser, assertget
|
8 |
+
|
9 |
+
|
10 |
+
class F9JSONParser(OptaJSONParser):
|
11 |
+
"""Extract data from a Opta F9 data stream.
|
12 |
+
|
13 |
+
Parameters
|
14 |
+
----------
|
15 |
+
path : str
|
16 |
+
Path of the data file.
|
17 |
+
"""
|
18 |
+
|
19 |
+
    def _get_feed(self) -> dict[str, Any]:
        """Return the first node of the stream that contains an OptaFeed.

        Raises
        ------
        MissingDataError
            If no node in the stream contains an OptaFeed.
        """
        for node in self.root:
            if "OptaFeed" in node["data"].keys():
                return node
        raise MissingDataError
|
24 |
+
|
25 |
+
    def _get_doc(self) -> dict[str, Any]:
        """Return the first OptaDocument element of the feed."""
        f9 = self._get_feed()
        data = assertget(f9, "data")
        optafeed = assertget(data, "OptaFeed")
        # The F9 feed wraps its documents in a list; only the first one is used.
        optadocument = assertget(optafeed, "OptaDocument")[0]
        return optadocument
|
31 |
+
|
32 |
+
def _get_stats(self, obj: dict[str, Any]) -> dict[str, Any]:
|
33 |
+
if "Stat" not in obj:
|
34 |
+
return {}
|
35 |
+
|
36 |
+
stats = {}
|
37 |
+
statobj = obj["Stat"] if isinstance(obj["Stat"], list) else [obj["Stat"]]
|
38 |
+
for stat in statobj:
|
39 |
+
stats[stat["@attributes"]["Type"]] = stat["@value"]
|
40 |
+
return stats
|
41 |
+
|
42 |
+
def _get_name(self, obj: dict[str, Any]) -> Optional[str]:
|
43 |
+
if "Known" in obj and obj["Known"].strip():
|
44 |
+
return obj["Known"]
|
45 |
+
if "First" in obj and "Last" in obj and obj["Last"].strip() or obj["First"].strip():
|
46 |
+
return (obj["First"] + " " + obj["Last"]).strip()
|
47 |
+
return None
|
48 |
+
|
49 |
+
    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        attr = assertget(optadocument, "@attributes")
        matchdata = assertget(optadocument, "MatchData")
        competition = assertget(optadocument, "Competition")
        competitionstat = self._get_stats(competition)
        venue = assertget(optadocument, "Venue")
        matchofficial = assertget(matchdata, "MatchOfficial")
        matchinfo = assertget(matchdata, "MatchInfo")
        matchstat = self._get_stats(matchdata)
        teamdata = assertget(matchdata, "TeamData")
        # Final score per side ("Home"/"Away").
        scores = {}
        for t in teamdata:
            scores[t["@attributes"]["Side"]] = t["@attributes"]["Score"]

        # uIDs are prefixed with a type letter (e.g. "g1234"); strip it.
        game_id = int(assertget(attr, "uID")[1:])
        game_dict = {
            # Fields required by the base schema
            "game_id": game_id,
            "competition_id": int(assertget(assertget(competition, "@attributes"), "uID")[1:]),
            "season_id": assertget(competitionstat, "season_id"),
            "game_day": competitionstat["matchday"] if "matchday" in competitionstat else None,
            # Kick-off time includes a UTC offset; drop it for a naive datetime.
            "game_date": datetime.strptime(
                assertget(matchinfo, "Date"), "%Y%m%dT%H%M%S%z"
            ).replace(tzinfo=None),
            # home_team_id=see below
            # away_team_id=see below
            # Optional fields
            "home_score": int(scores["Home"]),
            "away_score": int(scores["Away"]),
            "duration": int(assertget(matchstat, "match_time")),
            "referee": self._get_name(matchofficial["OfficialName"])
            if "OfficialName" in matchofficial
            else None,
            "venue": venue["Name"] if "Name" in venue else None,
            "attendance": int(matchinfo["Attendance"]) if "Attendance" in matchinfo else None,
            # home_manager=see below
            # away_manager=see below
        }
        # Team IDs, scores and managers are stored per side on TeamData nodes.
        for team in teamdata:
            teamattr = assertget(team, "@attributes")
            side = assertget(teamattr, "Side")
            teamid = assertget(teamattr, "TeamRef")
            score = assertget(teamattr, "Score")
            manager = (
                self._get_name(team["TeamOfficial"]["PersonName"])
                if "TeamOfficial" in team
                else None
            )
            if side == "Home":
                game_dict["home_team_id"] = int(teamid[1:])
                game_dict["home_score"] = int(score)
                game_dict["home_manager"] = manager
            else:
                game_dict["away_team_id"] = int(teamid[1:])
                game_dict["away_score"] = int(score)
                game_dict["away_manager"] = manager
        return {game_id: game_dict}
|
115 |
+
|
116 |
+
    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        optadocument = self._get_doc()
        root_teams = assertget(optadocument, "Team")

        teams = {}
        for team in root_teams:
            # Only entries with a plain "id" key are actual teams.
            if "id" in team.keys():
                # NOTE(review): team.get("nameObj") may return None, in which
                # case nameobj.get("name") below would raise AttributeError —
                # confirm whether "nameObj" is guaranteed alongside "id".
                nameobj = team.get("nameObj")
                team_id = int(team["id"])
                teams[team_id] = {
                    # Fields required by the base schema
                    "team_id": team_id,
                    "team_name": nameobj.get("name"),
                }
        return teams
|
139 |
+
|
140 |
+
def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
    """Return a dictionary with all available players.

    Returns
    -------
    dict
        A mapping between (game ID, player ID) tuples and the information
        available about each player in the data stream.
    """
    document = self._get_doc()
    doc_attr = assertget(document, "@attributes")
    game_id = int(assertget(doc_attr, "uID")[1:])
    team_elements = assertget(document, "Team")
    lineups = self.extract_lineups()

    players: dict[tuple[int, int], dict[str, Any]] = {}
    for team in team_elements:
        team_id = int(team["@attributes"]["uID"].replace("t", ""))
        for player in team["Player"]:
            player_id = int(player["@attributes"]["uID"].replace("p", ""))

            assert "nameObj" in player["PersonName"]
            name_obj = player["PersonName"]["nameObj"]
            # Skip placeholder entries flagged as unknown players.
            if name_obj.get("is_unknown"):
                continue
            record = {
                # Fields required by the base schema
                "game_id": game_id,
                "team_id": team_id,
                "player_id": player_id,
                "player_name": self._get_name(player["PersonName"]),
            }
            # Enrich with lineup data (jersey, position, minutes) when the
            # player appears in the team's lineup.
            if player_id in lineups[team_id]["players"]:
                lineup_entry = lineups[team_id]["players"][player_id]
                record = {
                    **record,
                    "jersey_number": lineup_entry["jersey_number"],
                    "starting_position": lineup_entry["starting_position_name"],
                    "is_starter": lineup_entry["is_starter"],
                    "minutes_played": lineup_entry["minutes_played"],
                }
            players[(game_id, player_id)] = record
    return players
|
194 |
+
|
195 |
+
def extract_lineups(self) -> dict[int, dict[str, Any]]:
    """Return a dictionary with the lineup of each team.

    Raises
    ------
    MissingDataError
        If teams data is not available in the stream.

    Returns
    -------
    dict
        A mapping between team IDs and the information available about
        each team's lineup in the data stream.
    """
    optadocument = self._get_doc()
    attr = assertget(optadocument, "@attributes")

    try:
        rootf9 = optadocument["MatchData"]["TeamData"]
    except KeyError as e:
        raise MissingDataError from e
    # "Stat" may be a single dict or a list of dicts; normalize to a list.
    matchstats = optadocument["MatchData"]["Stat"]
    matchstats = [matchstats] if isinstance(matchstats, dict) else matchstats
    matchstatsdict = {stat["@attributes"]["Type"]: stat["@value"] for stat in matchstats}

    lineups: dict[int, dict[str, Any]] = {}
    for team in rootf9:
        # lineup attributes
        team_id = int(team["@attributes"]["TeamRef"].replace("t", ""))
        lineups[team_id] = {"players": {}}
        # substitutes
        subst = [s["@attributes"] for s in team["Substitution"]]
        # red cards: player ID -> minute the card was shown
        red_cards = {
            int(e["@attributes"]["PlayerRef"].replace("p", "")): e["@attributes"]["Time"]
            for e in team.get("Booking", [])
            if "CardType" in e["@attributes"]
            and e["@attributes"]["CardType"] in ["Red", "SecondYellow"]
            and "PlayerRef" in e["@attributes"]  # not defined if a coach receives a red card
        }
        for player in team["PlayerLineUp"]["MatchPlayer"]:
            # NOTE: rebinds `attr` (previously the document attributes).
            attr = player["@attributes"]
            player_id = int(attr["PlayerRef"].replace("p", ""))
            # Per-player stats keyed by stat type.
            playerstatsdict = {
                stat["@attributes"]["Type"]: stat["@value"] for stat in player["Stat"]
            }
            # Minute the player entered the pitch: substitution time if subbed
            # on; otherwise full time for unused subs and 0 for starters.
            sub_on = next(
                (
                    item["Time"]
                    for item in subst
                    if "Retired" not in item and item["SubOn"] == f"p{player_id}"
                ),
                matchstatsdict["match_time"] if attr["Status"] == "Sub" else 0,
            )
            # Minute the player left the pitch: substitution time if subbed
            # off; otherwise full time, or the red-card minute if sent off.
            sub_off = next(
                (item["Time"] for item in subst if item["SubOff"] == f"p{player_id}"),
                matchstatsdict["match_time"]
                if player_id not in red_cards
                else red_cards[player_id],
            )
            # assumes Time / match_time values are numeric — TODO confirm
            # against the feed; string values would make this concatenate
            # or raise rather than subtract.
            minutes_played = sub_off - sub_on
            lineups[team_id]["players"][player_id] = dict(
                jersey_number=attr["ShirtNumber"],
                starting_position_name=attr["Position"],
                starting_position_id=attr["position_id"],
                is_starter=attr["Status"] == "Start",
                minutes_played=minutes_played,
                **playerstatsdict,
            )
    return lineups
|
265 |
+
|
266 |
+
def extract_teamgamestats(self) -> list[dict[str, Any]]:
    """Return some aggregated statistics of each team.

    Raises
    ------
    MissingDataError
        If teams data is not available in the stream.

    Returns
    -------
    list(dict)
        A dictionary with aggregated team statistics for each team.
    """
    document = self._get_doc()
    doc_attr = assertget(document, "@attributes")
    game_id = int(assertget(doc_attr, "uID")[1:])

    try:
        team_elements = document["MatchData"]["TeamData"]
    except KeyError as err:
        raise MissingDataError from err

    stats_per_team = []
    for team in team_elements:
        team_attr = team["@attributes"]
        # dict(...) is used deliberately: it raises if a stat key collides
        # with one of the named fields instead of silently overwriting it.
        entry = dict(
            game_id=game_id,
            team_id=int(team_attr["TeamRef"].replace("t", "")),
            side=team_attr["Side"],
            score=team_attr["Score"],
            shootout_score=team_attr["ShootOutScore"],
            **self._get_stats(team),
        )
        stats_per_team.append(entry)
    return stats_per_team
|
data/opta/parsers/ma1_json.py
ADDED
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""JSON parser for Stats Perform MA1 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any, Optional
|
5 |
+
|
6 |
+
from ...base import MissingDataError
|
7 |
+
from .base import OptaJSONParser, assertget
|
8 |
+
|
9 |
+
|
10 |
+
class MA1JSONParser(OptaJSONParser):
    """Extract data from a Stats Perform MA1 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_matches(self) -> list[dict[str, Any]]:
        """Return the list of match objects contained in the feed."""
        # A single-match feed has "matchInfo" at the top level; a multi-match
        # feed wraps the matches in a "match" list.
        if "matchInfo" in self.root:
            return [self.root]
        if "match" in self.root:
            return self.root["match"]
        raise MissingDataError

    def _get_match_info(self, match: dict[str, Any]) -> dict[str, Any]:
        """Return the "matchInfo" section of a match or raise if absent."""
        if "matchInfo" in match:
            return match["matchInfo"]
        raise MissingDataError

    def _get_live_data(self, match: dict[str, Any]) -> dict[str, Any]:
        """Return the "liveData" section of a match, or an empty dict."""
        if "liveData" in match:
            return match["liveData"]
        return {}

    def _get_name(self, obj: dict[str, Any]) -> Optional[str]:
        """Return a display name from a person/official object, if any."""
        if "name" in obj:
            return assertget(obj, "name")
        if "firstName" in obj:
            return f"{assertget(obj, 'firstName')} {assertget(obj, 'lastName')}"
        return None

    @staticmethod
    def _extract_team_id(teams: list[dict[str, str]], side: str) -> Optional[str]:
        """Return the ID of the team playing on the given side.

        Raises MissingDataError when no contestant has that "position".
        """
        for team in teams:
            team_side = assertget(team, "position")
            if team_side == side:
                team_id = assertget(team, "id")
                return team_id
        raise MissingDataError

    def extract_competitions(self) -> dict[tuple[str, str], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competion ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        competitions = {}
        for match in self._get_matches():
            match_info = self._get_match_info(match)
            season = assertget(match_info, "tournamentCalendar")
            season_id = assertget(season, "id")
            competition = assertget(match_info, "competition")
            competition_id = assertget(competition, "id")
            competitions[(competition_id, season_id)] = {
                "season_id": season_id,
                "season_name": assertget(season, "name"),
                "competition_id": competition_id,
                "competition_name": assertget(competition, "name"),
            }
        return competitions

    def extract_games(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        games = {}
        for match in self._get_matches():
            match_info = self._get_match_info(match)
            game_id = assertget(match_info, "id")
            season = assertget(match_info, "tournamentCalendar")
            competition = assertget(match_info, "competition")
            contestant = assertget(match_info, "contestant")
            game_date = assertget(match_info, "date")
            game_time = assertget(match_info, "time")
            # Date and time carry trailing "Z" markers, matched literally below.
            game_datetime = f"{game_date} {game_time}"
            venue = assertget(match_info, "venue")
            games[game_id] = {
                # Fields required by the base schema
                "game_id": game_id,
                "competition_id": assertget(competition, "id"),
                "season_id": assertget(season, "id"),
                "game_day": int(match_info["week"]) if "week" in match_info else None,
                "game_date": datetime.strptime(game_datetime, "%Y-%m-%dZ %H:%M:%SZ"),
                "home_team_id": self._extract_team_id(contestant, "home"),
                "away_team_id": self._extract_team_id(contestant, "away"),
                # Optional fields
                # home_score=?,
                # away_score=?,
                # duration=?,
                # referee=?,
                "venue": venue["shortName"] if "shortName" in venue else None,
                # attendance=?,
                # home_manager=?,
                # away_manager=?,
            }
            # Fill in the optional fields from liveData when present.
            live_data = self._get_live_data(match)
            if "matchDetails" in live_data:
                match_details = assertget(live_data, "matchDetails")
                if "matchLengthMin" in match_details:
                    games[game_id]["duration"] = assertget(match_details, "matchLengthMin")
                if "scores" in match_details:
                    scores = assertget(match_details, "scores")
                    games[game_id]["home_score"] = assertget(scores, "total")["home"]
                    games[game_id]["away_score"] = assertget(scores, "total")["away"]
            if "matchDetailsExtra" in live_data:
                extra_match_details = assertget(live_data, "matchDetailsExtra")
                if "attendance" in extra_match_details:
                    games[game_id]["attendance"] = int(
                        assertget(extra_match_details, "attendance")
                    )
                if "matchOfficial" in extra_match_details:
                    # presumably type == "Main" identifies the referee — verify
                    # against the feed specification
                    for official in assertget(extra_match_details, "matchOfficial"):
                        if official["type"] == "Main":
                            games[game_id]["referee"] = self._get_name(official)
        return games

    def extract_teams(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        teams = {}
        for match in self._get_matches():
            match_info = self._get_match_info(match)
            contestants = assertget(match_info, "contestant")
            for contestant in contestants:
                team_id = assertget(contestant, "id")
                team = {
                    # Fields required by the base schema
                    "team_id": team_id,
                    "team_name": assertget(contestant, "name"),
                }
                teams[team_id] = team
        return teams

    def extract_players(self) -> dict[tuple[str, str], dict[str, Any]]:  # noqa: C901
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between player IDs and the information available about
            each player in the data stream.
        """
        players = {}
        subs = self.extract_substitutions()
        for match in self._get_matches():
            match_info = self._get_match_info(match)
            game_id = assertget(match_info, "id")
            live_data = self._get_live_data(match)
            if "lineUp" not in live_data:
                continue
            # Sending-off minute per player; "Y2C" and "RC" card types are
            # treated as red cards.
            red_cards = {
                e["playerId"]: e["timeMin"]
                for e in live_data.get("card", [])
                if "type" in e
                and e["type"] in ["Y2C", "RC"]
                and "playerId" in e  # not defined if a coach receives a red card
            }
            lineups = assertget(live_data, "lineUp")
            for lineup in lineups:
                team_id = assertget(lineup, "contestantId")
                players_in_lineup = assertget(lineup, "player")
                for individual in players_in_lineup:
                    player_id = assertget(individual, "playerId")
                    players[(game_id, player_id)] = {
                        # Fields required by the base schema
                        "game_id": game_id,
                        "team_id": team_id,
                        "player_id": player_id,
                        "player_name": self._get_name(individual),
                        "is_starter": assertget(individual, "position") != "Substitute",
                        # minutes_played="?",
                        "jersey_number": assertget(individual, "shirtNumber"),
                        # Fields required by the opta schema
                        "starting_position": assertget(individual, "position"),
                    }
                    # Minutes played can only be derived when both match
                    # length and substitution events are available.
                    if "matchDetails" in live_data and "substitute" in live_data:
                        match_details = assertget(live_data, "matchDetails")
                        if "matchLengthMin" not in match_details:
                            continue
                        # Determine when player entered the pitch
                        is_starter = assertget(individual, "position") != "Substitute"
                        sub_in = [
                            s
                            for s in subs.values()
                            if s["game_id"] == game_id and s["player_in_id"] == player_id
                        ]
                        if is_starter:
                            minute_start = 0
                        elif len(sub_in) == 1:
                            minute_start = sub_in[0]["minute"]
                        else:
                            # No (or ambiguous) sub-on event: entry time unknown.
                            minute_start = None
                        # Determine when player left the pitch
                        sub_out = [
                            s
                            for s in subs.values()
                            if s["game_id"] == game_id and s["player_out_id"] == player_id
                        ]
                        duration = assertget(match_details, "matchLengthMin")
                        minute_end = duration
                        if len(sub_out) == 1:
                            minute_end = sub_out[0]["minute"]
                        elif player_id in red_cards:
                            minute_end = red_cards[player_id]
                        # Determine time on the pitch
                        if is_starter or minute_start is not None:
                            players[(game_id, player_id)]["minutes_played"] = (
                                minute_end - minute_start
                            )
                        else:
                            players[(game_id, player_id)]["minutes_played"] = 0
        return players

    def extract_substitutions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each substitution in the data stream.
        """
        subs = {}
        for match in self._get_matches():
            match_info = self._get_match_info(match)
            game_id = assertget(match_info, "id")
            live_data = self._get_live_data(match)
            if "substitute" not in live_data:
                continue
            for e in assertget(live_data, "substitute"):
                # Keyed by the incoming player's ID; a later substitution with
                # the same playerOnId overwrites the earlier one.
                sub_id = assertget(e, "playerOnId")
                subs[(game_id, sub_id)] = {
                    "game_id": game_id,
                    "team_id": assertget(e, "contestantId"),
                    "period_id": int(assertget(e, "periodId")),
                    "minute": int(assertget(e, "timeMin")),
                    "player_in_id": assertget(e, "playerOnId"),
                    "player_out_id": assertget(e, "playerOffId"),
                }
        return subs
|
data/opta/parsers/ma3_json.py
ADDED
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""JSON parser for Stats Perform MA3 feeds."""
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Any, Optional
|
5 |
+
|
6 |
+
import pandas as pd
|
7 |
+
|
8 |
+
from ...base import MissingDataError
|
9 |
+
from .base import OptaJSONParser, _get_end_x, _get_end_y, assertget
|
10 |
+
|
11 |
+
|
12 |
+
class MA3JSONParser(OptaJSONParser):
|
13 |
+
"""Extract data from a Stats Perform MA3 data stream.
|
14 |
+
|
15 |
+
Parameters
|
16 |
+
----------
|
17 |
+
path : str
|
18 |
+
Path of the data file.
|
19 |
+
"""
|
20 |
+
|
21 |
+
_position_map = {
|
22 |
+
1: "Goalkeeper",
|
23 |
+
2: "Defender",
|
24 |
+
3: "Midfielder",
|
25 |
+
4: "Forward",
|
26 |
+
5: "Substitute",
|
27 |
+
}
|
28 |
+
|
29 |
+
def _get_match_info(self) -> dict[str, Any]:
|
30 |
+
if "matchInfo" in self.root:
|
31 |
+
return self.root["matchInfo"]
|
32 |
+
raise MissingDataError
|
33 |
+
|
34 |
+
def _get_live_data(self) -> dict[str, Any]:
|
35 |
+
if "liveData" in self.root:
|
36 |
+
return self.root["liveData"]
|
37 |
+
raise MissingDataError
|
38 |
+
|
39 |
+
def extract_competitions(self) -> dict[tuple[str, str], dict[str, Any]]:
|
40 |
+
"""Return a dictionary with all available competitions.
|
41 |
+
|
42 |
+
Returns
|
43 |
+
-------
|
44 |
+
dict
|
45 |
+
A mapping between competion IDs and the information available about
|
46 |
+
each competition in the data stream.
|
47 |
+
"""
|
48 |
+
match_info = self._get_match_info()
|
49 |
+
season = assertget(match_info, "tournamentCalendar")
|
50 |
+
competition = assertget(match_info, "competition")
|
51 |
+
competition_id = assertget(competition, "id")
|
52 |
+
season_id = assertget(season, "id")
|
53 |
+
season = {
|
54 |
+
# Fields required by the base schema
|
55 |
+
"season_id": season_id,
|
56 |
+
"season_name": assertget(season, "name"),
|
57 |
+
"competition_id": competition_id,
|
58 |
+
"competition_name": assertget(competition, "name"),
|
59 |
+
}
|
60 |
+
return {(competition_id, season_id): season}
|
61 |
+
|
62 |
+
def extract_games(self) -> dict[str, dict[str, Any]]:
|
63 |
+
"""Return a dictionary with all available games.
|
64 |
+
|
65 |
+
Returns
|
66 |
+
-------
|
67 |
+
dict
|
68 |
+
A mapping between game IDs and the information available about
|
69 |
+
each game in the data stream.
|
70 |
+
"""
|
71 |
+
match_info = self._get_match_info()
|
72 |
+
game_id = assertget(match_info, "id")
|
73 |
+
season = assertget(match_info, "tournamentCalendar")
|
74 |
+
competition = assertget(match_info, "competition")
|
75 |
+
contestant = assertget(match_info, "contestant")
|
76 |
+
game_date = assertget(match_info, "date")[0:10]
|
77 |
+
game_time = assertget(match_info, "time")[0:8]
|
78 |
+
game_datetime = f"{game_date}T{game_time}"
|
79 |
+
venue = assertget(match_info, "venue")
|
80 |
+
game_obj = {
|
81 |
+
"game_id": game_id,
|
82 |
+
"competition_id": assertget(competition, "id"),
|
83 |
+
"season_id": assertget(season, "id"),
|
84 |
+
"game_day": int(match_info["week"]) if "week" in match_info else None,
|
85 |
+
"game_date": datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"),
|
86 |
+
"home_team_id": self._extract_team_id(contestant, "home"),
|
87 |
+
"away_team_id": self._extract_team_id(contestant, "away"),
|
88 |
+
"venue": assertget(venue, "shortName"),
|
89 |
+
}
|
90 |
+
live_data = self._get_live_data()
|
91 |
+
if "matchDetails" in live_data:
|
92 |
+
match_details = assertget(live_data, "matchDetails")
|
93 |
+
if "matchLengthMin" in match_details:
|
94 |
+
game_obj["duration"] = assertget(match_details, "matchLengthMin")
|
95 |
+
if "scores" in match_details:
|
96 |
+
scores = assertget(match_details, "scores")
|
97 |
+
game_obj["home_score"] = assertget(scores, "total")["home"]
|
98 |
+
game_obj["away_score"] = assertget(scores, "total")["away"]
|
99 |
+
|
100 |
+
return {game_id: game_obj}
|
101 |
+
|
102 |
+
def extract_teams(self) -> dict[str, dict[str, Any]]:
|
103 |
+
"""Return a dictionary with all available teams.
|
104 |
+
|
105 |
+
Returns
|
106 |
+
-------
|
107 |
+
dict
|
108 |
+
A mapping between team IDs and the information available about
|
109 |
+
each team in the data stream.
|
110 |
+
"""
|
111 |
+
match_info = self._get_match_info()
|
112 |
+
contestants = assertget(match_info, "contestant")
|
113 |
+
teams = {}
|
114 |
+
for contestant in contestants:
|
115 |
+
team_id = assertget(contestant, "id")
|
116 |
+
team = {
|
117 |
+
# Fields required by the base schema
|
118 |
+
"team_id": team_id,
|
119 |
+
"team_name": assertget(contestant, "name"),
|
120 |
+
}
|
121 |
+
teams[team_id] = team
|
122 |
+
return teams
|
123 |
+
|
124 |
+
def extract_players(self) -> dict[tuple[str, str], dict[str, Any]]: # noqa: C901
|
125 |
+
"""Return a dictionary with all available players.
|
126 |
+
|
127 |
+
Returns
|
128 |
+
-------
|
129 |
+
dict
|
130 |
+
A mapping between (game ID, player ID) tuples and the information
|
131 |
+
available about each player in the data stream.
|
132 |
+
"""
|
133 |
+
match_info = self._get_match_info()
|
134 |
+
game_id = assertget(match_info, "id")
|
135 |
+
live_data = self._get_live_data()
|
136 |
+
events = assertget(live_data, "event")
|
137 |
+
|
138 |
+
game_duration = self._extract_duration()
|
139 |
+
playerid_to_name = {}
|
140 |
+
|
141 |
+
players_data: dict[str, list[Any]] = {
|
142 |
+
"starting_position_id": [],
|
143 |
+
"player_id": [],
|
144 |
+
"team_id": [],
|
145 |
+
"position_in_formation": [],
|
146 |
+
"jersey_number": [],
|
147 |
+
}
|
148 |
+
red_cards = {}
|
149 |
+
|
150 |
+
for event in events:
|
151 |
+
event_type = assertget(event, "typeId")
|
152 |
+
if event_type == 34:
|
153 |
+
team_id = assertget(event, "contestantId")
|
154 |
+
qualifiers = assertget(event, "qualifier")
|
155 |
+
for q in qualifiers:
|
156 |
+
qualifier_id = assertget(q, "qualifierId")
|
157 |
+
value = assertget(q, "value")
|
158 |
+
value = value.split(", ")
|
159 |
+
if qualifier_id == 30:
|
160 |
+
players_data["player_id"] += value
|
161 |
+
team = [team_id for _ in range(len(value))]
|
162 |
+
players_data["team_id"] += team
|
163 |
+
elif qualifier_id == 44:
|
164 |
+
value = [int(v) for v in value]
|
165 |
+
players_data["starting_position_id"] += value
|
166 |
+
elif qualifier_id == 131:
|
167 |
+
value = [int(v) for v in value]
|
168 |
+
players_data["position_in_formation"] += value
|
169 |
+
elif qualifier_id == 59:
|
170 |
+
value = [int(v) for v in value]
|
171 |
+
players_data["jersey_number"] += value
|
172 |
+
elif event_type == 17 and "playerId" in event:
|
173 |
+
qualifiers = assertget(event, "qualifier")
|
174 |
+
for q in qualifiers:
|
175 |
+
qualifier_id = assertget(q, "qualifierId")
|
176 |
+
if qualifier_id in [32, 33]:
|
177 |
+
red_cards[event["playerId"]] = event["timeMin"]
|
178 |
+
|
179 |
+
player_id = event.get("playerId")
|
180 |
+
if player_id is None:
|
181 |
+
continue
|
182 |
+
player_name = assertget(event, "playerName")
|
183 |
+
if player_id not in playerid_to_name:
|
184 |
+
playerid_to_name[player_id] = player_name
|
185 |
+
|
186 |
+
df_players_data = pd.DataFrame.from_dict(players_data) # type: ignore
|
187 |
+
|
188 |
+
substitutions = list(self.extract_substitutions().values())
|
189 |
+
substitutions_columns = ["player_id", "team_id", "minute_start", "minute_end"]
|
190 |
+
df_substitutions = pd.DataFrame(substitutions, columns=substitutions_columns)
|
191 |
+
df_substitutions = df_substitutions.groupby(["player_id", "team_id"]).max().reset_index()
|
192 |
+
df_substitutions["minute_start"] = df_substitutions["minute_start"].fillna(0)
|
193 |
+
df_substitutions["minute_end"] = df_substitutions["minute_end"].fillna(game_duration)
|
194 |
+
|
195 |
+
if df_substitutions.empty:
|
196 |
+
df_players_data["minute_start"] = 0
|
197 |
+
df_players_data["minute_end"] = game_duration
|
198 |
+
else:
|
199 |
+
df_players_data = df_players_data.merge(
|
200 |
+
df_substitutions, on=["team_id", "player_id"], how="left"
|
201 |
+
)
|
202 |
+
df_players_data["minute_end"] = df_players_data.apply(
|
203 |
+
lambda row: red_cards[row["player_id"]]
|
204 |
+
if row["player_id"] in red_cards
|
205 |
+
else row["minute_end"],
|
206 |
+
axis=1,
|
207 |
+
)
|
208 |
+
|
209 |
+
df_players_data["is_starter"] = df_players_data["position_in_formation"] > 0
|
210 |
+
df_players_data.loc[
|
211 |
+
df_players_data["is_starter"] & df_players_data["minute_start"].isnull(),
|
212 |
+
"minute_start",
|
213 |
+
] = 0
|
214 |
+
df_players_data.loc[
|
215 |
+
df_players_data["is_starter"] & df_players_data["minute_end"].isnull(), "minute_end"
|
216 |
+
] = game_duration
|
217 |
+
|
218 |
+
df_players_data["minutes_played"] = (
|
219 |
+
(df_players_data["minute_end"] - df_players_data["minute_start"]).fillna(0).astype(int)
|
220 |
+
)
|
221 |
+
|
222 |
+
players = {}
|
223 |
+
for _, player in df_players_data.iterrows():
|
224 |
+
if player.minutes_played > 0:
|
225 |
+
players[(game_id, player.player_id)] = {
|
226 |
+
# Fields required by the base schema
|
227 |
+
"game_id": game_id,
|
228 |
+
"team_id": player.team_id,
|
229 |
+
"player_id": player.player_id,
|
230 |
+
"player_name": playerid_to_name[player.player_id],
|
231 |
+
"is_starter": player.is_starter,
|
232 |
+
"minutes_played": player.minutes_played,
|
233 |
+
"jersey_number": player.jersey_number,
|
234 |
+
# Fields required by the opta schema
|
235 |
+
"starting_position": self._position_map.get(
|
236 |
+
player.starting_position_id, "Unknown"
|
237 |
+
),
|
238 |
+
}
|
239 |
+
return players
|
240 |
+
|
241 |
+
def extract_events(self) -> dict[tuple[str, int], dict[str, Any]]:
|
242 |
+
"""Return a dictionary with all available events.
|
243 |
+
|
244 |
+
Returns
|
245 |
+
-------
|
246 |
+
dict
|
247 |
+
A mapping between (game ID, event ID) tuples and the information
|
248 |
+
available about each event in the data stream.
|
249 |
+
"""
|
250 |
+
match_info = self._get_match_info()
|
251 |
+
live_data = self._get_live_data()
|
252 |
+
game_id = assertget(match_info, "id")
|
253 |
+
|
254 |
+
events = {}
|
255 |
+
for element in assertget(live_data, "event"):
|
256 |
+
timestamp_string = assertget(element, "timeStamp")
|
257 |
+
timestamp = self._convert_timestamp(timestamp_string)
|
258 |
+
|
259 |
+
qualifiers = {
|
260 |
+
int(q["qualifierId"]): q.get("value") for q in element.get("qualifier", [])
|
261 |
+
}
|
262 |
+
start_x = float(assertget(element, "x"))
|
263 |
+
start_y = float(assertget(element, "y"))
|
264 |
+
end_x = _get_end_x(qualifiers)
|
265 |
+
end_y = _get_end_y(qualifiers)
|
266 |
+
|
267 |
+
event_id = int(assertget(element, "id"))
|
268 |
+
event = {
|
269 |
+
# Fields required by the base schema
|
270 |
+
"game_id": game_id,
|
271 |
+
"event_id": event_id,
|
272 |
+
"period_id": int(assertget(element, "periodId")),
|
273 |
+
"team_id": assertget(element, "contestantId"),
|
274 |
+
"player_id": element.get("playerId"),
|
275 |
+
"type_id": int(assertget(element, "typeId")),
|
276 |
+
# Fields required by the opta schema
|
277 |
+
"timestamp": timestamp,
|
278 |
+
"minute": int(assertget(element, "timeMin")),
|
279 |
+
"second": int(assertget(element, "timeSec")),
|
280 |
+
"outcome": bool(int(element.get("outcome", 1))),
|
281 |
+
"start_x": start_x,
|
282 |
+
"start_y": start_y,
|
283 |
+
"end_x": end_x if end_x is not None else start_x,
|
284 |
+
"end_y": end_y if end_y is not None else start_y,
|
285 |
+
"qualifiers": qualifiers,
|
286 |
+
# Optional fields
|
287 |
+
"assist": bool(int(element.get("assist", 0))),
|
288 |
+
"keypass": bool(int(element.get("keyPass", 0))),
|
289 |
+
}
|
290 |
+
events[(game_id, event_id)] = event
|
291 |
+
return events
|
292 |
+
|
293 |
+
def extract_substitutions(self) -> dict[int, dict[str, Any]]:
|
294 |
+
"""Return a dictionary with all substitution events.
|
295 |
+
|
296 |
+
Returns
|
297 |
+
-------
|
298 |
+
dict
|
299 |
+
A mapping between player IDs and the information available about
|
300 |
+
each substitution in the data stream.
|
301 |
+
"""
|
302 |
+
live_data = self._get_live_data()
|
303 |
+
events = assertget(live_data, "event")
|
304 |
+
|
305 |
+
subs = {}
|
306 |
+
for e in events:
|
307 |
+
event_type = assertget(e, "typeId")
|
308 |
+
if event_type in (18, 19):
|
309 |
+
sub_id = assertget(e, "playerId")
|
310 |
+
substitution_data = {
|
311 |
+
"player_id": assertget(e, "playerId"),
|
312 |
+
"team_id": assertget(e, "contestantId"),
|
313 |
+
}
|
314 |
+
if event_type == 18:
|
315 |
+
substitution_data["minute_end"] = assertget(e, "timeMin")
|
316 |
+
else:
|
317 |
+
substitution_data["minute_start"] = assertget(e, "timeMin")
|
318 |
+
subs[sub_id] = substitution_data
|
319 |
+
return subs
|
320 |
+
|
321 |
+
def _extract_duration(self) -> int:
    """Return the game's duration in minutes, with a floor of 90."""
    live_data = self._get_live_data()
    all_events = assertget(live_data, "event")

    duration = 90
    for event in all_events:
        if assertget(event, "typeId") != 30:
            continue
        # todo: add 1st half time
        for qualifier in assertget(event, "qualifier"):
            if assertget(qualifier, "qualifierId") == 209:
                # Keep the largest clock value seen on a 209 qualifier.
                candidate = assertget(event, "timeMin")
                if candidate > duration:
                    duration = candidate

    return duration
|
340 |
+
|
341 |
+
@staticmethod
def _extract_team_id(teams: list[dict[str, str]], side: str) -> str:
    """Return the ID of the team playing on the given side.

    Parameters
    ----------
    teams : list(dict)
        Team records, each containing at least a 'position' and an 'id' field.
    side : str
        The side to look up (matched against each team's 'position' field).

    Returns
    -------
    str
        The ID of the matching team.

    Raises
    ------
    MissingDataError
        If no team plays on the given side.
    """
    # Note: the return annotation was narrowed from Optional[str] to str,
    # since this function never returns None (it raises instead).
    for team in teams:
        team_side = assertget(team, "position")
        if team_side == side:
            team_id = assertget(team, "id")
            return team_id
    # Raise with a message instead of a bare MissingDataError so the
    # failing side is visible in tracebacks.
    raise MissingDataError(f"No team with position '{side}' found")
|
349 |
+
|
350 |
+
@staticmethod
|
351 |
+
def _convert_timestamp(timestamp_string: str) -> datetime:
|
352 |
+
try:
|
353 |
+
return datetime.strptime(timestamp_string, "%Y-%m-%dT%H:%M:%S.%fZ")
|
354 |
+
except ValueError:
|
355 |
+
return datetime.strptime(timestamp_string, "%Y-%m-%dT%H:%M:%SZ")
|
data/opta/parsers/whoscored.py
ADDED
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""JSON parser for WhoScored feeds."""
|
2 |
+
|
3 |
+
import json # type: ignore
|
4 |
+
import re
|
5 |
+
from datetime import datetime, timedelta
|
6 |
+
from typing import Any, Optional
|
7 |
+
|
8 |
+
from ...base import MissingDataError
|
9 |
+
from .base import OptaParser, _get_end_x, _get_end_y, assertget
|
10 |
+
|
11 |
+
|
12 |
+
def _position_mapping(formation: str, x: float, y: float) -> str:
|
13 |
+
if x == 0 and y == 5:
|
14 |
+
return "GK"
|
15 |
+
return "Unknown"
|
16 |
+
|
17 |
+
|
18 |
+
class WhoScoredParser(OptaParser):
    """Extract data from a JSON data stream scraped from WhoScored.

    Parameters
    ----------
    path : str
        Path of the data file.
    competition_id : int
        ID of the competition to which the provided data file belongs. If
        None, this information is extracted from a field 'competition_id' in
        the JSON.
    season_id : int
        ID of the season to which the provided data file belongs. If None,
        this information is extracted from a field 'season_id' in the JSON.
    game_id : int
        ID of the game to which the provided data file belongs. If None, this
        information is extracted from a field 'game_id' in the JSON.
    """

    def __init__(  # noqa: C901
        self,
        path: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> None:
        with open(path, encoding="utf-8") as fh:
            self.root = json.load(fh)

        # Each ID falls back to a same-named field in the JSON; assertget
        # raises AssertionError when the field is absent, which is turned
        # into a MissingDataError with a recovery hint.
        if competition_id is None:
            try:
                competition_id = int(assertget(self.root, "competition_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the competition id. Add it to the
                    file path or include a field 'competition_id' in the
                    JSON."""
                ) from e
        self.competition_id = competition_id

        if season_id is None:
            try:
                season_id = int(assertget(self.root, "season_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the season id. Add it to the file
                    path or include a field 'season_id' in the JSON."""
                ) from e
        self.season_id = season_id

        if game_id is None:
            try:
                game_id = int(assertget(self.root, "game_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the game id. Add it to the file
                    path or include a field 'game_id' in the JSON."""
                ) from e
        self.game_id = game_id

    def _get_period_id(self, event: dict[str, Any]) -> int:
        """Return the numeric period ID of an event ('period' -> 'value')."""
        period = assertget(event, "period")
        period_id = int(assertget(period, "value"))
        return period_id

    def _get_period_milliseconds(self, event: dict[str, Any]) -> int:
        """Return the time of an event in milliseconds since its period start.

        Periods 16 and 14 are mapped to 0 (per the inline comments, these
        represent pre-match and post-game events).
        """
        # NOTE(review): periodMinuteLimits is indexed with str(period_id - 1),
        # so its keys are assumed to be stringified period IDs — verify
        # against an actual feed.
        period_minute_limits = assertget(self.root, "periodMinuteLimits")
        period_id = self._get_period_id(event)
        if period_id == 16:  # Pre-match
            return 0
        if period_id == 14:  # Post-game
            return 0
        minute = int(assertget(event, "minute"))
        period_minute = minute
        if period_id > 1:
            # Convert the match clock to a clock relative to the period start.
            period_minute = minute - period_minute_limits[str(period_id - 1)]
        period_second = period_minute * 60 + int(event.get("second", 0))
        return period_second * 1000

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        team_home = assertget(self.root, "home")
        team_away = assertget(self.root, "away")
        game_dict = {
            # Fields required by the base schema
            "game_id": self.game_id,
            "season_id": self.season_id,
            "competition_id": self.competition_id,
            "game_day": None,  # Cannot be determined from the data stream
            "game_date": datetime.strptime(
                assertget(self.root, "startTime"), "%Y-%m-%dT%H:%M:%S"
            ),  # Dates are UTC
            "home_team_id": int(assertget(team_home, "teamId")),
            "away_team_id": int(assertget(team_away, "teamId")),
            # Optional fields
            "home_score": int(assertget(assertget(self.root["home"], "scores"), "running")),
            "away_score": int(assertget(assertget(self.root["away"], "scores"), "running")),
            "duration": int(self.root.get("expandedMaxMinute"))
            if "expandedMaxMinute" in self.root
            else None,
            "referee": self.root.get("referee", {}).get("name"),
            "venue": self.root.get("venueName"),
            "attendance": int(self.root.get("attendance")) if "attendance" in self.root else None,
            "home_manager": team_home.get("managerName"),
            "away_manager": team_away.get("managerName"),
        }
        return {self.game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        teams = {}
        for side in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(side, "teamId"))
            teams[team_id] = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": assertget(side, "name"),
            }
        return teams

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        game_id = self.game_id
        # Minutes played, jersey numbers and positions are derived from the
        # per-player game stats computed below.
        player_gamestats = self.extract_playergamestats()
        players = {}
        for team in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(team, "teamId"))
            for p in team["players"]:
                player_id = int(assertget(p, "playerId"))
                players[(game_id, player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": assertget(p, "name"),
                    "is_starter": bool(p.get("isFirstEleven", False)),
                    "minutes_played": player_gamestats[(game_id, player_id)]["minutes_played"],
                    "jersey_number": player_gamestats[(game_id, player_id)]["jersey_number"],
                    # Fields required by the opta schema
                    "starting_position": player_gamestats[(game_id, player_id)]["position_code"],
                    # Optional fields
                    # WhoScored retrieves player details for the current date,
                    # not for the game date. Hence, we do not include this
                    # info.
                    # age=int(p["age"]),
                    # height=float(p.get("height", float("NaN"))),
                    # weight=float(p.get("weight", float("NaN"))),
                }
        return players

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        events = {}

        time_start_str = assertget(self.root, "startTime")
        time_start = datetime.strptime(time_start_str, "%Y-%m-%dT%H:%M:%S")
        for attr in self.root["events"]:
            # Some feeds name the event identifier 'id', others 'eventId'.
            event_id = int(assertget(attr, "id" if "id" in attr else "eventId"))
            eventtype = attr.get("type", {})
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            minute = int(assertget(attr, "expandedMinute"))
            second = int(attr.get("second", 0))
            qualifiers = {
                int(q["type"]["value"]): q.get("value", True) for q in attr.get("qualifiers", [])
            }
            # Fall back to end coordinates encoded in the qualifiers when the
            # event has no explicit endX/endY.
            end_x = attr.get("endX", _get_end_x(qualifiers))
            end_y = attr.get("endY", _get_end_y(qualifiers))
            events[(self.game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": self.game_id,
                "event_id": event_id,
                "period_id": self._get_period_id(attr),
                "team_id": int(assertget(attr, "teamId")),
                "player_id": int(attr.get("playerId")) if "playerId" in attr else None,
                "type_id": int(assertget(eventtype, "value")),
                # type_name=assertget(eventtype, "displayName"), # added in the opta loader
                # Fields required by the opta schema
                # Timestamp is not available in the data stream. The returned
                # timestamp is not accurate, but sufficient for compatibility
                # with the other Opta data streams.
                "timestamp": (time_start + timedelta(seconds=(minute * 60 + second))),
                "minute": minute,
                "second": second,
                "outcome": bool(attr["outcomeType"].get("value"))
                if "outcomeType" in attr
                else None,
                "start_x": start_x,
                "start_y": start_y,
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "related_player_id": int(attr.get("relatedPlayerId"))
                if "relatedPlayerId" in attr
                else None,
                "touch": bool(attr.get("isTouch", False)),
                "goal": bool(attr.get("isGoal", False)),
                "shot": bool(attr.get("isShot", False)),
                # assist=bool(attr.get('assist')) if "assist" in attr else None,
                # keypass=bool(attr.get('keypass')) if "keypass" in attr else None,
            }

        return events

    def extract_substitutions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each substitution in the data stream.
        """
        subs = {}
        # Only sub-on events (type 19) are scanned; the outgoing player is
        # taken from the event's relatedPlayerId.
        subonevents = [e for e in self.root["events"] if e["type"].get("value") == 19]
        for e in subonevents:
            sub_id = int(assertget(e, "playerId"))
            sub = {
                "game_id": self.game_id,
                "team_id": int(assertget(e, "teamId")),
                "period_id": self._get_period_id(e),
                "period_milliseconds": self._get_period_milliseconds(e),
                "player_in_id": int(assertget(e, "playerId")),
                "player_out_id": int(assertget(e, "relatedPlayerId")),
            }
            subs[(self.game_id, sub_id)] = sub
        return subs

    def extract_positions(self) -> dict[tuple[int, int, int], dict[str, Any]]:  # noqa: C901
        """Return a dictionary with each player's position during a game.

        Returns
        -------
        dict
            A mapping between (game ID, player ID, epoch ID) tuples and the
            information available about each player's position in the data stream.
        """
        positions = {}
        for t in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(t, "teamId"))
            for f in assertget(t, "formations"):
                fpositions = assertget(f, "formationPositions")
                playersIds = assertget(f, "playerIds")
                formation = assertget(f, "formationName")

                period_end_minutes = assertget(self.root, "periodEndMinutes")
                period_minute_limits = assertget(self.root, "periodMinuteLimits")
                start_minute = int(assertget(f, "startMinuteExpanded"))
                end_minute = int(assertget(f, "endMinuteExpanded"))
                # Find the first period whose end minute lies beyond the
                # formation's start minute; its key becomes the period ID.
                # NOTE(review): keys are iterated in sorted() order — if they
                # are strings, this is lexicographic; verify the key type.
                for period_id in sorted(period_end_minutes.keys()):
                    if period_end_minutes[period_id] > start_minute:
                        break
                period_id = int(period_id)
                period_minute = start_minute
                if period_id > 1:
                    period_minute = start_minute - period_minute_limits[str(period_id - 1)]

                # formationPositions and playerIds are parallel lists.
                for i, p in enumerate(fpositions):
                    player_id = int(playersIds[i])
                    x = float(assertget(p, "vertical"))
                    y = float(assertget(p, "horizontal"))
                    position_code = _position_mapping(formation, x, y)
                    positions[(self.game_id, player_id, start_minute)] = {
                        "game_id": self.game_id,
                        "team_id": team_id,
                        "player_id": player_id,
                        "period_id": period_id,
                        "period_milliseconds": (period_minute * 60 * 1000),
                        "start_milliseconds": (start_minute * 60 * 1000),
                        "end_milliseconds": (end_minute * 60 * 1000),
                        "formation_scheme": formation,
                        "player_position": position_code,
                        "player_position_x": x,
                        "player_position_y": y,
                    }
        return positions

    def extract_teamgamestats(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return some aggregated statistics of each team in a game.

        Returns
        -------
        list(dict)
            A dictionary with aggregated team statistics for each team.
        """
        teams_gamestats = {}
        teams = [self.root["home"], self.root["away"]]
        for team in teams:
            team_id = int(assertget(team, "teamId"))
            statsdict = {}
            for name in team["stats"]:
                # Each stat is a per-minute breakdown; sum it into a total.
                if isinstance(team["stats"][name], dict):
                    statsdict[_camel_to_snake(name)] = sum(team["stats"][name].values())

            scores = assertget(team, "scores")
            teams_gamestats[(self.game_id, team_id)] = dict(
                game_id=self.game_id,
                team_id=team_id,
                side=assertget(team, "field"),
                score=assertget(scores, "fulltime"),
                shootout_score=scores.get("penalty"),
                # NOTE(review): keys were snake_cased above, so the
                # capitalized "Success" suffix can never match and this
                # filter excludes nothing — verify the intended behavior
                # (compare the lowercase "success" filter used in
                # extract_playergamestats).
                **{k: statsdict[k] for k in statsdict if not k.endswith("Success")},
            )

        return teams_gamestats

    def extract_playergamestats(self) -> dict[tuple[int, int], dict[str, Any]]:  # noqa: C901
        """Return some aggregated statistics of each player in a game.

        Returns
        -------
        dict(dict)
            A dictionary with aggregated team statistics for each player.
        """
        players_gamestats = {}
        for team in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(team, "teamId"))
            # Minute at which each player was sent off, used to cap his
            # playing time below.
            red_cards = {
                e["playerId"]: e["expandedMinute"]
                for e in team.get("incidentEvents", [])
                if "cardType" in e
                and e["cardType"]["displayName"] in ["Red", "SecondYellow"]
                and "playerId" in e  # not defined if a coach receives a red card
            }
            for player in team["players"]:
                # Sum each per-minute stat breakdown into a total.
                statsdict = {
                    _camel_to_snake(name): sum(stat.values())
                    for name, stat in player["stats"].items()
                }
                stats = [k for k in statsdict if not k.endswith("success")]

                player_id = int(assertget(player, "playerId"))
                p = dict(
                    game_id=self.game_id,
                    team_id=team_id,
                    player_id=player_id,
                    is_starter=bool(player.get("isFirstEleven", False)),
                    position_code=player.get("position", None),
                    jersey_number=int(player.get("shirtNo", 0)),
                    mvp=bool(player.get("isManOfTheMatch", False)),
                    **{k: statsdict[k] for k in stats},
                )
                if "subbedInExpandedMinute" in player:
                    p["minute_start"] = player["subbedInExpandedMinute"]
                if "subbedOutExpandedMinute" in player:
                    p["minute_end"] = player["subbedOutExpandedMinute"]
                if player_id in red_cards:
                    p["minute_end"] = red_cards[player_id]

                # Did not play
                p["minutes_played"] = 0
                # Played the full game
                if p["is_starter"] and "minute_end" not in p:
                    p["minute_start"] = 0
                    p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = self.root["expandedMaxMinute"]
                # Started but substituted out
                elif p["is_starter"] and "minute_end" in p:
                    p["minute_start"] = 0
                    p["minutes_played"] = p["minute_end"]
                # Substituted in and played the remainder of the game
                elif "minute_start" in p and "minute_end" not in p:
                    p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = self.root["expandedMaxMinute"] - p["minute_start"]
                # Substituted in and out
                elif "minute_start" in p and "minute_end" in p:
                    p["minutes_played"] = p["minute_end"] - p["minute_start"]

                players_gamestats[(self.game_id, player_id)] = p
        return players_gamestats
|
417 |
+
|
418 |
+
|
419 |
+
def _camel_to_snake(name: str) -> str:
|
420 |
+
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
|
421 |
+
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
|
data/opta/schema.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""SPADL schema for Opta data."""
|
2 |
+
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
import pandera as pa
|
7 |
+
from pandera.typing import DateTime, Object, Series
|
8 |
+
|
9 |
+
from socceraction.data.schema import (
|
10 |
+
CompetitionSchema,
|
11 |
+
EventSchema,
|
12 |
+
GameSchema,
|
13 |
+
PlayerSchema,
|
14 |
+
TeamSchema,
|
15 |
+
)
|
16 |
+
|
17 |
+
|
18 |
+
class OptaCompetitionSchema(CompetitionSchema):
    """Definition of a dataframe containing a list of competitions and seasons.

    Adds no columns beyond the base :class:`CompetitionSchema`.
    """
|
20 |
+
|
21 |
+
|
22 |
+
class OptaGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games.

    All Opta-specific columns below are optional and nullable, since not
    every feed provides them.
    """

    home_score: Optional[Series[int]] = pa.Field(nullable=True)
    """The final score of the home team."""
    away_score: Optional[Series[int]] = pa.Field(nullable=True)
    """The final score of the away team."""
    duration: Optional[Series[int]] = pa.Field(nullable=True)
    """The total duration of the game in minutes."""
    referee: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the referee."""
    venue: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the stadium where the game was played."""
    attendance: Optional[Series[int]] = pa.Field(nullable=True)
    """The number of people who attended the game."""
    home_manager: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the manager of the home team."""
    away_manager: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the manager of the away team."""
|
41 |
+
|
42 |
+
|
43 |
+
class OptaPlayerSchema(PlayerSchema):
    """Definition of a dataframe containing the list of players of a game.

    Extends the base :class:`PlayerSchema` with the player's starting
    position.
    """

    starting_position: Series[str]
    """The starting position of the player."""
|
48 |
+
|
49 |
+
|
50 |
+
class OptaTeamSchema(TeamSchema):
    """Definition of a dataframe containing the list of teams of a game.

    Adds no columns beyond the base :class:`TeamSchema`.
    """
|
52 |
+
|
53 |
+
|
54 |
+
class OptaEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game."""

    timestamp: Series[DateTime]
    """Time in the match the event takes place, recorded to the millisecond."""
    minute: Series[int]
    """The minutes on the clock at the time of this event."""
    second: Series[int] = pa.Field(ge=0, le=59)
    """The second part of the timestamp."""
    outcome: Series[bool]
    """Whether the event had a successful outcome or not."""
    start_x: Series[float] = pa.Field(nullable=True)
    """The x coordinate of the location where the event started."""
    start_y: Series[float] = pa.Field(nullable=True)
    """The y coordinate of the location where the event started."""
    end_x: Series[float] = pa.Field(nullable=True)
    """The x coordinate of the location where the event ended."""
    end_y: Series[float] = pa.Field(nullable=True)
    """The y coordinate of the location where the event ended."""
    qualifiers: Series[Object]
    """A JSON object containing the Opta qualifiers of the event."""
    assist: Optional[Series[bool]]
    """Whether the event was an assist or not."""
    keypass: Optional[Series[bool]]
    """Whether the event was a keypass or not."""
    goal: Optional[Series[bool]]
    """Whether the event was a goal or not."""
    shot: Optional[Series[bool]]
    """Whether the event was a shot or not."""
    touch: Optional[Series[bool]]
    """Whether the event was a on-the-ball action or not."""
    related_player_id: Optional[Series[pd.Int64Dtype]] = pa.Field(nullable=True)
    """The ID of a second player that was involved in this event."""
|
data/schema.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Base schemas used by all event stream serializers.
|
2 |
+
|
3 |
+
Each serializer should create dataframes that contain at least the fields
|
4 |
+
included in these base schemas. Each serializer can add different additional
|
5 |
+
fields on top.
|
6 |
+
|
7 |
+
"""
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
import pandera as pa
|
11 |
+
from pandera.typing import DateTime, Object, Series
|
12 |
+
|
13 |
+
|
14 |
+
class CompetitionSchema(pa.SchemaModel):
    """Definition of a dataframe containing a list of competitions and seasons."""

    season_id: Series[Object] = pa.Field()
    """The unique identifier for the season."""
    season_name: Series[str] = pa.Field()
    """The name of the season."""
    competition_id: Series[Object] = pa.Field()
    """The unique identifier for the competition."""
    competition_name: Series[str] = pa.Field()
    """The name of the competition."""

    class Config:  # noqa: D106
        # Reject columns not declared above and coerce values to the
        # declared dtypes during validation.
        strict = True
        coerce = True
|
29 |
+
|
30 |
+
|
31 |
+
class GameSchema(pa.SchemaModel):
    """Definition of a dataframe containing a list of games."""

    game_id: Series[Object] = pa.Field()
    """The unique identifier for the game."""
    season_id: Series[Object] = pa.Field()
    """The unique identifier for the season."""
    competition_id: Series[Object] = pa.Field()
    """The unique identifier for the competition."""
    game_day: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    """Number corresponding to the weeks or rounds into the competition this game is."""
    game_date: Series[DateTime] = pa.Field()
    """The date when the game was played."""
    home_team_id: Series[Object] = pa.Field()
    """The unique identifier for the home team in this game."""
    away_team_id: Series[Object] = pa.Field()
    """The unique identifier for the away team in this game."""

    class Config:  # noqa: D106
        # Reject columns not declared above and coerce values to the
        # declared dtypes during validation.
        strict = True
        coerce = True
|
52 |
+
|
53 |
+
|
54 |
+
class TeamSchema(pa.SchemaModel):
    """Definition of a dataframe containing the list of teams of a game."""

    team_id: Series[Object] = pa.Field()
    """The unique identifier for the team."""
    team_name: Series[str] = pa.Field()
    """The name of the team."""

    class Config:  # noqa: D106
        # Reject columns not declared above and coerce values to the
        # declared dtypes during validation.
        strict = True
        coerce = True
|
65 |
+
|
66 |
+
|
67 |
+
class PlayerSchema(pa.SchemaModel):
    """Definition of a dataframe containing the list of players on the teamsheet of a game."""

    game_id: Series[Object] = pa.Field()
    """The unique identifier for the game."""
    team_id: Series[Object] = pa.Field()
    """The unique identifier for the player's team."""
    player_id: Series[Object] = pa.Field()
    """The unique identifier for the player."""
    player_name: Series[str] = pa.Field()
    """The name of the player."""
    is_starter: Series[bool] = pa.Field()
    """Whether the player is in the starting lineup."""
    minutes_played: Series[int] = pa.Field()
    """The number of minutes the player played in the game."""
    jersey_number: Series[int] = pa.Field()
    """The player's jersey number."""

    class Config:  # noqa: D106
        # Reject columns not declared above and coerce values to the
        # declared dtypes during validation.
        strict = True
        coerce = True
|
88 |
+
|
89 |
+
|
90 |
+
class EventSchema(pa.SchemaModel):
    """Definition of a dataframe containing event stream data of a game."""

    game_id: Series[Object] = pa.Field()
    """The unique identifier for the game."""
    event_id: Series[Object] = pa.Field()
    """The unique identifier for the event."""
    period_id: Series[int] = pa.Field()
    """The unique identifier for the part of the game in which the event took place."""
    team_id: Series[Object] = pa.Field(nullable=True)
    """The unique identifier for the team this event relates to."""
    player_id: Series[Object] = pa.Field(nullable=True)
    """The unique identifier for the player this event relates to."""
    type_id: Series[int] = pa.Field()
    """The unique identifier for the type of this event."""
    type_name: Series[str] = pa.Field()
    """The name of the type of this event."""

    class Config:  # noqa: D106
        # Reject columns not declared above and coerce values to the
        # declared dtypes during validation.
        strict = True
        coerce = True
|
data/statsbomb/__init__.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for loading StatsBomb event data."""
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
"StatsBombLoader",
|
5 |
+
"extract_player_games",
|
6 |
+
"StatsBombCompetitionSchema",
|
7 |
+
"StatsBombGameSchema",
|
8 |
+
"StatsBombPlayerSchema",
|
9 |
+
"StatsBombTeamSchema",
|
10 |
+
"StatsBombEventSchema",
|
11 |
+
]
|
12 |
+
|
13 |
+
from .loader import StatsBombLoader, extract_player_games
|
14 |
+
from .schema import (
|
15 |
+
StatsBombCompetitionSchema,
|
16 |
+
StatsBombEventSchema,
|
17 |
+
StatsBombGameSchema,
|
18 |
+
StatsBombPlayerSchema,
|
19 |
+
StatsBombTeamSchema,
|
20 |
+
)
|
data/statsbomb/loader.py
ADDED
@@ -0,0 +1,495 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements serializers for StatsBomb data."""
|
2 |
+
|
3 |
+
import os
|
4 |
+
from typing import Any, Optional, cast
|
5 |
+
|
6 |
+
import pandas as pd # type: ignore
|
7 |
+
from pandera.typing import DataFrame
|
8 |
+
|
9 |
+
try:
|
10 |
+
from statsbombpy import sb
|
11 |
+
except ImportError:
|
12 |
+
sb = None
|
13 |
+
|
14 |
+
from socceraction.data.base import (
|
15 |
+
EventDataLoader,
|
16 |
+
ParseError,
|
17 |
+
_expand_minute,
|
18 |
+
_localloadjson,
|
19 |
+
)
|
20 |
+
|
21 |
+
from .schema import (
|
22 |
+
StatsBombCompetitionSchema,
|
23 |
+
StatsBombEventSchema,
|
24 |
+
StatsBombGameSchema,
|
25 |
+
StatsBombPlayerSchema,
|
26 |
+
StatsBombTeamSchema,
|
27 |
+
)
|
28 |
+
|
29 |
+
|
30 |
+
class StatsBombLoader(EventDataLoader):
    """Load Statsbomb data either from a remote location or from a local folder.

    To load remote data, this loader uses the `statsbombpy
    <https://github.com/statsbomb/statsbombpy>`__ package. Data can be retrieved
    from the StatsBomb API and from the `Open Data GitHub repo
    <https://github.com/statsbomb/open-data/>`__.
    API access is for paying customers only. Authentication can be done by
    setting environment variables named ``SB_USERNAME`` and ``SB_PASSWORD`` to
    your login credentials. Alternatively, pass your login credentials using
    the ``creds`` parameter.
    StatsBomb's open data can be accessed without the need of authentication
    but its use is subject to a `user agreement
    <https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf>`__.

    To load local data, point ``root`` to the root folder of the data. This folder
    should use the same directory structure as used in the Open Data GitHub repo.

    Parameters
    ----------
    getter : str
        "remote" or "local"
    root : str, optional
        Root-path of the data. Only used when getter is "local".
    creds: dict, optional
        Login credentials in the format {"user": "", "passwd": ""}. Only used
        when getter is "remote".
    """

    def __init__(
        self,
        getter: str = "remote",
        root: Optional[str] = None,
        creds: Optional[dict[str, str]] = None,
    ) -> None:
        if getter == "remote":
            # statsbombpy is an optional dependency; ``sb`` is None when the
            # import at module level failed.
            if sb is None:
                raise ImportError(
                    """The 'statsbombpy' package is required. Install with 'pip install statsbombpy'."""
                )
            self._creds = creds or sb.DEFAULT_CREDS
            self._local = False
        elif getter == "local":
            if root is None:
                raise ValueError("""The 'root' parameter is required when loading local data.""")
            self._local = True
            self._root = root
        else:
            raise ValueError("Invalid getter specified")

    def competitions(self) -> DataFrame[StatsBombCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.statsbomb.StatsBombCompetitionSchema` for the schema.
        """
        # Columns selected from the raw feed, in schema order.
        cols = [
            "season_id",
            "competition_id",
            "competition_name",
            "country_name",
            "competition_gender",
            "season_name",
        ]
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "competitions.json")))
        else:
            obj = list(sb.competitions(fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of competitions")
        if len(obj) == 0:
            # Preserve the schema even when there is no data.
            return cast(DataFrame[StatsBombCompetitionSchema], pd.DataFrame(columns=cols))
        return cast(DataFrame[StatsBombCompetitionSchema], pd.DataFrame(obj)[cols])

    def games(self, competition_id: int, season_id: int) -> DataFrame[StatsBombGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.statsbomb.StatsBombGameSchema` for the schema.
        """
        cols = [
            "game_id",
            "season_id",
            "competition_id",
            "competition_stage",
            "game_day",
            "game_date",
            "home_team_id",
            "away_team_id",
            "home_score",
            "away_score",
            "venue",
            "referee",
        ]
        if self._local:
            obj = _localloadjson(
                str(os.path.join(self._root, "matches", f"{competition_id}", f"{season_id}.json"))
            )
        else:
            obj = list(
                sb.matches(competition_id, season_id, fmt="dict", creds=self._creds).values()
            )
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of games")
        if len(obj) == 0:
            return cast(DataFrame[StatsBombGameSchema], pd.DataFrame(columns=cols))
        gamesdf = pd.DataFrame(_flatten(m) for m in obj)
        # Some matches lack a kick-off time; default to noon so the
        # date + time concatenation below always parses.
        gamesdf["kick_off"] = gamesdf["kick_off"].fillna("12:00:00.000")
        gamesdf["match_date"] = pd.to_datetime(
            gamesdf[["match_date", "kick_off"]].agg(" ".join, axis=1)
        )
        gamesdf.rename(
            columns={
                "match_id": "game_id",
                "match_date": "game_date",
                "match_week": "game_day",
                "stadium_name": "venue",
                "referee_name": "referee",
                "competition_stage_name": "competition_stage",
            },
            inplace=True,
        )
        # Venue and referee are optional in the raw feed; make sure the
        # columns exist before selecting them.
        if "venue" not in gamesdf:
            gamesdf["venue"] = None
        if "referee" not in gamesdf:
            gamesdf["referee"] = None
        return cast(DataFrame[StatsBombGameSchema], gamesdf[cols])

    def _lineups(self, game_id: int) -> list[dict[str, Any]]:
        # Fetch the raw lineup feed of a game and validate its basic shape
        # (exactly one entry per team).
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "lineups", f"{game_id}.json")))
        else:
            obj = list(sb.lineups(game_id, fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of teams")
        if len(obj) != 2:
            raise ParseError("The retrieved data should contain two teams")
        return obj

    def teams(self, game_id: int) -> DataFrame[StatsBombTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError  # noqa: DAR402
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.statsbomb.StatsBombTeamSchema` for the schema.
        """
        cols = ["team_id", "team_name"]
        obj = self._lineups(game_id)
        return cast(DataFrame[StatsBombTeamSchema], pd.DataFrame(obj)[cols])

    def players(self, game_id: int) -> DataFrame[StatsBombPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError  # noqa: DAR402
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.statsbomb.StatsBombPlayerSchema` for the schema.
        """
        cols = [
            "game_id",
            "team_id",
            "player_id",
            "player_name",
            "nickname",
            "jersey_number",
            "is_starter",
            "starting_position_id",
            "starting_position_name",
            "minutes_played",
        ]

        obj = self._lineups(game_id)
        playersdf = pd.DataFrame(_flatten_id(p) for lineup in obj for p in lineup["lineup"])
        # Minutes played and starting positions are derived from the event
        # stream, then joined onto the lineup data.
        playergamesdf = extract_player_games(self.events(game_id))
        playersdf = pd.merge(
            playersdf,
            playergamesdf[
                ["player_id", "team_id", "position_id", "position_name", "minutes_played"]
            ],
            on="player_id",
        )
        playersdf["game_id"] = game_id
        # A position of 0 marks players who came on as substitutes.
        playersdf["position_name"] = playersdf["position_name"].replace(0, "Substitute")
        playersdf["position_id"] = playersdf["position_id"].fillna(0).astype(int)
        playersdf["is_starter"] = playersdf["position_id"] != 0
        playersdf.rename(
            columns={
                "player_nickname": "nickname",
                "country_name": "country",
                "position_id": "starting_position_id",
                "position_name": "starting_position_name",
            },
            inplace=True,
        )
        return cast(DataFrame[StatsBombPlayerSchema], playersdf[cols])

    def events(self, game_id: int, load_360: bool = False) -> DataFrame[StatsBombEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.
        load_360 : bool
            Whether to load the 360 data.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.statsbomb.StatsBombEventSchema` for the schema.
        """
        cols = [
            "game_id",
            "event_id",
            "period_id",
            "team_id",
            "player_id",
            "type_id",
            "type_name",
            "index",
            "timestamp",
            "minute",
            "second",
            "possession",
            "possession_team_id",
            "possession_team_name",
            "play_pattern_id",
            "play_pattern_name",
            "team_name",
            "duration",
            "extra",
            "related_events",
            "player_name",
            "position_id",
            "position_name",
            "location",
            "under_pressure",
            "counterpress",
        ]
        # Load the events
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "events", f"{game_id}.json")))
        else:
            obj = list(sb.events(game_id, fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of events")
        if len(obj) == 0:
            return cast(DataFrame[StatsBombEventSchema], pd.DataFrame(columns=cols))

        eventsdf = pd.DataFrame(_flatten_id(e) for e in obj)
        eventsdf["match_id"] = game_id
        eventsdf["timestamp"] = pd.to_timedelta(eventsdf["timestamp"])
        # Normalize missing values: absent related-event lists become empty
        # lists, absent boolean flags become False.
        eventsdf["related_events"] = eventsdf["related_events"].apply(
            lambda d: d if isinstance(d, list) else []
        )
        eventsdf["under_pressure"] = eventsdf["under_pressure"].fillna(False).astype(bool)
        eventsdf["counterpress"] = eventsdf["counterpress"].fillna(False).astype(bool)
        eventsdf.rename(
            columns={"id": "event_id", "period": "period_id", "match_id": "game_id"},
            inplace=True,
        )
        if not load_360:
            return cast(DataFrame[StatsBombEventSchema], eventsdf[cols])

        # Load the 360 data
        cols_360 = ["visible_area_360", "freeze_frame_360"]
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "three-sixty", f"{game_id}.json")))
        else:
            obj = sb.frames(game_id, fmt="dict", creds=self._creds)
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of frames")
        if len(obj) == 0:
            # No 360 frames for this game: keep the columns but fill with None.
            eventsdf["visible_area_360"] = None
            eventsdf["freeze_frame_360"] = None
            return cast(DataFrame[StatsBombEventSchema], eventsdf[cols + cols_360])
        framesdf = pd.DataFrame(obj).rename(
            columns={
                "event_uuid": "event_id",
                "visible_area": "visible_area_360",
                "freeze_frame": "freeze_frame_360",
            },
        )[["event_id", "visible_area_360", "freeze_frame_360"]]
        # Left join: events without a matching 360 frame get NaN frame columns.
        return cast(
            DataFrame[StatsBombEventSchema],
            pd.merge(eventsdf, framesdf, on="event_id", how="left")[cols + cols_360],
        )
|
369 |
+
|
370 |
+
|
371 |
+
def extract_player_games(events: pd.DataFrame) -> pd.DataFrame:
    """Extract player games [player_id, game_id, minutes_played] from statsbomb match events.

    Parameters
    ----------
    events : pd.DataFrame
        DataFrame containing StatsBomb events of a single game.

    Returns
    -------
    player_games : pd.DataFrame
        A DataFrame with the number of minutes played by each player during the game.
    """
    # get the nominal duration of each period
    periods = pd.DataFrame(
        [
            {"period_id": 1, "minute": 45},
            {"period_id": 2, "minute": 45},
            {"period_id": 3, "minute": 15},
            {"period_id": 4, "minute": 15},
            # Shoot-outs should not contribute to minutes played
            # {"period_id": 5, "minute": 0},
        ]
    ).set_index("period_id")
    # Actual length of each played period: the clock minute of each "Half End"
    # event minus the cumulative nominal length of the preceding periods.
    periods_minutes = (
        events.loc[events.type_name == "Half End", ["period_id", "minute"]]
        .drop_duplicates()
        .set_index("period_id")
        .sort_index()
        .subtract(periods.cumsum().shift(1).fillna(0))
        .minute.dropna()
        .astype(int)
        .tolist()
    )
    # get duration of entire match
    game_minutes = sum(periods_minutes)

    game_id = events.game_id.mode().values[0]
    players = {}
    # Events in which a player was sent off (straight red or second yellow),
    # recorded either as a foul or as bad behaviour.
    red_cards = events[
        events.apply(
            lambda x: any(
                e in x.extra
                and "card" in x.extra[e]
                and x.extra[e]["card"]["name"] in ["Second Yellow", "Red Card"]
                for e in ["foul_committed", "bad_behaviour"]
            ),
            axis=1,
        )
    ]
    # stats for starting XI: assume a full game unless sent off
    for startxi in events[events.type_name == "Starting XI"].itertuples():
        team_id, team_name = startxi.team_id, startxi.team_name
        for player in startxi.extra["tactics"]["lineup"]:
            player = _flatten_id(player)
            player = {
                **player,
                **{
                    "game_id": game_id,
                    "team_id": team_id,
                    "team_name": team_name,
                    "minutes_played": game_minutes,
                },
            }
            player_red_card = red_cards[red_cards.player_id == player["player_id"]]
            if len(player_red_card) > 0:
                # A sent-off starter only played up to the red card.
                red_card_minute = player_red_card.iloc[0].minute
                player["minutes_played"] = _expand_minute(red_card_minute, periods_minutes)
            players[player["player_id"]] = player
    # stats for substitutions: the replacement plays the remainder of the
    # game (or until sent off); the replaced player's minutes are capped.
    for substitution in events[events.type_name == "Substitution"].itertuples():
        exp_sub_minute = _expand_minute(substitution.minute, periods_minutes)
        replacement = {
            "player_id": substitution.extra["substitution"]["replacement"]["id"],
            "player_name": substitution.extra["substitution"]["replacement"]["name"],
            "minutes_played": game_minutes - exp_sub_minute,
            "team_id": substitution.team_id,
            "game_id": game_id,
            "team_name": substitution.team_name,
        }
        player_red_card = red_cards[red_cards.player_id == replacement["player_id"]]
        if len(player_red_card) > 0:
            red_card_minute = player_red_card.iloc[0].minute
            replacement["minutes_played"] = (
                _expand_minute(red_card_minute, periods_minutes) - exp_sub_minute
            )
        players[replacement["player_id"]] = replacement
        players[substitution.player_id]["minutes_played"] = exp_sub_minute
    pg = pd.DataFrame(players.values()).fillna(0)
    # ID columns may have become floats through NaN handling; cast back to int.
    for col in pg.columns:
        if "_id" in col:
            pg[col] = pg[col].astype(int)  # pylint: disable=E1136,E1137
    return pg
|
465 |
+
|
466 |
+
|
467 |
+
def _flatten_id(d: dict[str, dict[str, Any]]) -> dict[str, Any]:
|
468 |
+
newd = {}
|
469 |
+
extra = {}
|
470 |
+
for k, v in d.items():
|
471 |
+
if isinstance(v, dict):
|
472 |
+
if "id" in v and "name" in v:
|
473 |
+
newd[k + "_id"] = v["id"]
|
474 |
+
newd[k + "_name"] = v["name"]
|
475 |
+
else:
|
476 |
+
extra[k] = v
|
477 |
+
else:
|
478 |
+
newd[k] = v
|
479 |
+
newd["extra"] = extra
|
480 |
+
return newd
|
481 |
+
|
482 |
+
|
483 |
+
def _flatten(d: dict[str, dict[str, Any]]) -> dict[str, Any]:
|
484 |
+
newd = {}
|
485 |
+
for k, v in d.items():
|
486 |
+
if isinstance(v, dict):
|
487 |
+
if "id" in v and "name" in v:
|
488 |
+
newd[k + "_id"] = v["id"]
|
489 |
+
newd[k + "_name"] = v["name"]
|
490 |
+
newd[k + "_extra"] = {l: w for (l, w) in v.items() if l in ("id", "name")}
|
491 |
+
else:
|
492 |
+
newd = {**newd, **_flatten(v)}
|
493 |
+
else:
|
494 |
+
newd[k] = v
|
495 |
+
return newd
|
data/statsbomb/schema.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""SPADL schema for StatsBomb data."""
|
2 |
+
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
import pandera as pa
|
6 |
+
from pandera.typing import Object, Series, Timedelta
|
7 |
+
|
8 |
+
from socceraction.data.schema import (
|
9 |
+
CompetitionSchema,
|
10 |
+
EventSchema,
|
11 |
+
GameSchema,
|
12 |
+
PlayerSchema,
|
13 |
+
TeamSchema,
|
14 |
+
)
|
15 |
+
|
16 |
+
|
17 |
+
class StatsBombCompetitionSchema(CompetitionSchema):
    """Definition of a dataframe containing a list of competitions and seasons.

    Extends the generic :class:`CompetitionSchema` with StatsBomb-specific
    columns.
    """

    country_name: Series[str]
    """The name of the country the competition relates to."""
    competition_gender: Series[str]
    """The gender of the players competing in the competition."""
|
24 |
+
|
25 |
+
|
26 |
+
class StatsBombGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games.

    Extends the generic :class:`GameSchema` with StatsBomb-specific columns.
    """

    competition_stage: Series[str]
    """The name of the phase of the competition this game is in."""
    home_score: Series[int]
    """The final score of the home team."""
    away_score: Series[int]
    """The final score of the away team."""
    venue: Series[str] = pa.Field(nullable=True)
    """The name of the stadium where the game was played."""
    referee: Series[str] = pa.Field(nullable=True)
    """The name of the referee."""
|
39 |
+
|
40 |
+
|
41 |
+
class StatsBombPlayerSchema(PlayerSchema):
    """Definition of a dataframe containing the list of players of a game.

    Extends the generic :class:`PlayerSchema` with StatsBomb-specific columns.
    """

    nickname: Series[str] = pa.Field(nullable=True)
    """The nickname of the player on the team."""
    starting_position_id: Series[int]
    """The unique identifier for the starting position of the player on the team."""
    starting_position_name: Series[str]
    """The name of the starting position of the player on the team."""
|
50 |
+
|
51 |
+
|
52 |
+
class StatsBombTeamSchema(TeamSchema):
    """Definition of a dataframe containing the list of teams of a game."""
|
54 |
+
|
55 |
+
|
56 |
+
class StatsBombEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game.

    Extends the generic :class:`EventSchema` with StatsBomb-specific columns.
    The two ``*_360`` columns are optional and only present when the loader
    is asked to merge StatsBomb 360 freeze-frame data into the events.
    """

    index: Series[int]
    """Sequence notation for the ordering of events within each match."""
    timestamp: Series[Timedelta]
    """Time in the match the event takes place, recorded to the millisecond."""
    minute: Series[int]
    """The minutes on the clock at the time of this event."""
    second: Series[int] = pa.Field(ge=0, le=59)
    """The second part of the timestamp."""
    possession: Series[int]
    """Indicates the current unique possession in the game."""
    possession_team_id: Series[int]
    """The ID of the team that started this possession in control of the ball."""
    possession_team_name: Series[str]
    """The name of the team that started this possession in control of the ball."""
    play_pattern_id: Series[int]
    """The ID of the play pattern relevant to this event."""
    play_pattern_name: Series[str]
    """The name of the play pattern relevant to this event."""
    team_name: Series[str]
    """The name of the team this event relates to."""
    duration: Series[float] = pa.Field(nullable=True)
    """If relevant, the length in seconds the event lasted."""
    extra: Series[Object]
    """A JSON string containing type-specific information."""
    related_events: Series[Object]
    """A comma separated list of the IDs of related events."""
    player_name: Series[str] = pa.Field(nullable=True)
    """The name of the player this event relates to."""
    position_id: Series[float] = pa.Field(nullable=True)
    """The ID of the position the player was in at the time of this event."""
    position_name: Series[str] = pa.Field(nullable=True)
    """The name of the position the player was in at the time of this event."""
    location: Series[Object] = pa.Field(nullable=True)
    """Array containing the x and y coordinates of the event."""
    under_pressure: Series[bool] = pa.Field(nullable=True)
    """Whether the action was performed while being pressured by an opponent."""
    counterpress: Series[bool] = pa.Field(nullable=True)
    """Pressing actions within 5 seconds of an open play turnover."""
    visible_area_360: Optional[Series[Object]] = pa.Field(nullable=True)
    """An array of coordinates describing the polygon visible to the camera / in the 360 frame."""
    freeze_frame_360: Optional[Series[Object]] = pa.Field(nullable=True)
    """An array of freeze frame objects."""
|
data/wyscout/__init__.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for loading Wyscout event data."""
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
"PublicWyscoutLoader",
|
5 |
+
"WyscoutLoader",
|
6 |
+
"WyscoutCompetitionSchema",
|
7 |
+
"WyscoutGameSchema",
|
8 |
+
"WyscoutPlayerSchema",
|
9 |
+
"WyscoutTeamSchema",
|
10 |
+
"WyscoutEventSchema",
|
11 |
+
]
|
12 |
+
|
13 |
+
from .loader import PublicWyscoutLoader, WyscoutLoader
|
14 |
+
from .schema import (
|
15 |
+
WyscoutCompetitionSchema,
|
16 |
+
WyscoutEventSchema,
|
17 |
+
WyscoutGameSchema,
|
18 |
+
WyscoutPlayerSchema,
|
19 |
+
WyscoutTeamSchema,
|
20 |
+
)
|
data/wyscout/loader.py
ADDED
@@ -0,0 +1,849 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implements serializers for Wyscout data."""
|
2 |
+
|
3 |
+
import glob
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import warnings
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import Any, Callable, Optional, Union, cast
|
9 |
+
from urllib.error import HTTPError
|
10 |
+
from urllib.parse import urlparse
|
11 |
+
from urllib.request import urlopen, urlretrieve
|
12 |
+
from zipfile import ZipFile, is_zipfile
|
13 |
+
|
14 |
+
import pandas as pd # type: ignore
|
15 |
+
from pandera.typing import DataFrame
|
16 |
+
|
17 |
+
from ..base import (
|
18 |
+
EventDataLoader,
|
19 |
+
JSONType,
|
20 |
+
MissingDataError,
|
21 |
+
ParseError,
|
22 |
+
_auth_remoteloadjson,
|
23 |
+
_expand_minute,
|
24 |
+
_has_auth,
|
25 |
+
_localloadjson,
|
26 |
+
_remoteloadjson,
|
27 |
+
)
|
28 |
+
from .schema import (
|
29 |
+
WyscoutCompetitionSchema,
|
30 |
+
WyscoutEventSchema,
|
31 |
+
WyscoutGameSchema,
|
32 |
+
WyscoutPlayerSchema,
|
33 |
+
WyscoutTeamSchema,
|
34 |
+
)
|
35 |
+
|
36 |
+
|
37 |
+
class PublicWyscoutLoader(EventDataLoader):
|
38 |
+
"""
|
39 |
+
Load the public Wyscout dataset.
|
40 |
+
|
41 |
+
This dataset is a public release of event stream data, collected by Wyscout
|
42 |
+
(https://wyscout.com/) containing all matches of the 2017/18 season of the
|
43 |
+
top-5 European leagues (La Liga, Serie A, Bundesliga, Premier League, Ligue
|
44 |
+
1), the FIFA World Cup 2018, and UEFA Euro Cup 2016. For a detailed
|
45 |
+
description, see Pappalardo et al. [1]_.
|
46 |
+
|
47 |
+
Parameters
|
48 |
+
----------
|
49 |
+
root : str
|
50 |
+
Path where a local copy of the dataset is stored or where the
|
51 |
+
downloaded dataset should be stored.
|
52 |
+
download : bool
|
53 |
+
Whether to force a redownload of the data.
|
54 |
+
|
55 |
+
References
|
56 |
+
----------
|
57 |
+
.. [1] Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of
|
58 |
+
spatio-temporal match events in soccer competitions. Sci Data 6, 236
|
59 |
+
(2019). https://doi.org/10.1038/s41597-019-0247-7
|
60 |
+
"""
|
61 |
+
|
62 |
+
    def __init__(self, root: Optional[str] = None, download: bool = False) -> None:
        """Initialize the loader, downloading the dataset if needed.

        Parameters
        ----------
        root : str, optional
            Local directory containing (or receiving) the dataset. Defaults
            to a ``wyscout_data`` folder in the current working directory.
        download : bool
            Whether to force a (re)download of the data.
        """
        if root is None:
            self.root = os.path.join(os.getcwd(), "wyscout_data")
            os.makedirs(self.root, exist_ok=True)
        else:
            self.root = root

        # All data is read from local JSON files after download.
        self.get = _localloadjson

        # Download when explicitly requested or when the folder is empty.
        if download or len(os.listdir(self.root)) == 0:
            self._download_repo()

        # Static index mapping (competition_id, season_id) pairs of the
        # public dataset to the JSON files that contain their data.
        self._index = pd.DataFrame(
            [
                {
                    "competition_id": 524,
                    "season_id": 181248,
                    "season_name": "2017/2018",
                    "db_matches": "matches_Italy.json",
                    "db_events": "events_Italy.json",
                },
                {
                    "competition_id": 364,
                    "season_id": 181150,
                    "season_name": "2017/2018",
                    "db_matches": "matches_England.json",
                    "db_events": "events_England.json",
                },
                {
                    "competition_id": 795,
                    "season_id": 181144,
                    "season_name": "2017/2018",
                    "db_matches": "matches_Spain.json",
                    "db_events": "events_Spain.json",
                },
                {
                    "competition_id": 412,
                    "season_id": 181189,
                    "season_name": "2017/2018",
                    "db_matches": "matches_France.json",
                    "db_events": "events_France.json",
                },
                {
                    "competition_id": 426,
                    "season_id": 181137,
                    "season_name": "2017/2018",
                    "db_matches": "matches_Germany.json",
                    "db_events": "events_Germany.json",
                },
                {
                    "competition_id": 102,
                    "season_id": 9291,
                    "season_name": "2016",
                    "db_matches": "matches_European_Championship.json",
                    "db_events": "events_European_Championship.json",
                },
                {
                    "competition_id": 28,
                    "season_id": 10078,
                    "season_name": "2018",
                    "db_matches": "matches_World_Cup.json",
                    "db_events": "events_World_Cup.json",
                },
            ]
        ).set_index(["competition_id", "season_id"])
        self._match_index = self._create_match_index().set_index("match_id")
        # Lazy per-game cache of parsed event data.
        self._cache: Optional[dict[str, Any]] = None
|
129 |
+
|
130 |
+
def _download_repo(self) -> None:
|
131 |
+
dataset_urls = {
|
132 |
+
"competitions": "https://ndownloader.figshare.com/files/15073685",
|
133 |
+
"teams": "https://ndownloader.figshare.com/files/15073697",
|
134 |
+
"players": "https://ndownloader.figshare.com/files/15073721",
|
135 |
+
"matches": "https://ndownloader.figshare.com/files/14464622",
|
136 |
+
"events": "https://ndownloader.figshare.com/files/14464685",
|
137 |
+
}
|
138 |
+
# download and unzip Wyscout open data
|
139 |
+
for url in dataset_urls.values():
|
140 |
+
url_obj = urlopen(url).geturl()
|
141 |
+
path = Path(urlparse(url_obj).path)
|
142 |
+
file_name = os.path.join(self.root, path.name)
|
143 |
+
file_local, _ = urlretrieve(url_obj, file_name)
|
144 |
+
if is_zipfile(file_local):
|
145 |
+
with ZipFile(file_local) as zip_file:
|
146 |
+
zip_file.extractall(self.root)
|
147 |
+
|
148 |
+
def _create_match_index(self) -> pd.DataFrame:
|
149 |
+
df_matches = pd.concat(
|
150 |
+
[pd.DataFrame(self.get(path)) for path in glob.iglob(f"{self.root}/matches_*.json")]
|
151 |
+
)
|
152 |
+
df_matches.rename(
|
153 |
+
columns={
|
154 |
+
"wyId": "match_id",
|
155 |
+
"competitionId": "competition_id",
|
156 |
+
"seasonId": "season_id",
|
157 |
+
},
|
158 |
+
inplace=True,
|
159 |
+
)
|
160 |
+
return pd.merge(
|
161 |
+
df_matches[["match_id", "competition_id", "season_id"]],
|
162 |
+
self._index,
|
163 |
+
on=["competition_id", "season_id"],
|
164 |
+
how="left",
|
165 |
+
)
|
166 |
+
|
167 |
+
def competitions(self) -> DataFrame[WyscoutCompetitionSchema]:
|
168 |
+
"""Return a dataframe with all available competitions and seasons.
|
169 |
+
|
170 |
+
Returns
|
171 |
+
-------
|
172 |
+
pd.DataFrame
|
173 |
+
A dataframe containing all available competitions and seasons. See
|
174 |
+
:class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema.
|
175 |
+
"""
|
176 |
+
path = os.path.join(self.root, "competitions.json")
|
177 |
+
df_competitions = pd.DataFrame(self.get(path))
|
178 |
+
df_competitions.rename(
|
179 |
+
columns={"wyId": "competition_id", "name": "competition_name"}, inplace=True
|
180 |
+
)
|
181 |
+
df_competitions["country_name"] = df_competitions.apply(
|
182 |
+
lambda x: x.area["name"] if x.area["name"] != "" else "International", axis=1
|
183 |
+
)
|
184 |
+
df_competitions["competition_gender"] = "male"
|
185 |
+
df_competitions = pd.merge(
|
186 |
+
df_competitions,
|
187 |
+
self._index.reset_index()[["competition_id", "season_id", "season_name"]],
|
188 |
+
on="competition_id",
|
189 |
+
how="left",
|
190 |
+
)
|
191 |
+
return cast(
|
192 |
+
DataFrame[WyscoutCompetitionSchema],
|
193 |
+
df_competitions.reset_index()[
|
194 |
+
[
|
195 |
+
"competition_id",
|
196 |
+
"season_id",
|
197 |
+
"country_name",
|
198 |
+
"competition_name",
|
199 |
+
"competition_gender",
|
200 |
+
"season_name",
|
201 |
+
]
|
202 |
+
],
|
203 |
+
)
|
204 |
+
|
205 |
+
def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]:
|
206 |
+
"""Return a dataframe with all available games in a season.
|
207 |
+
|
208 |
+
Parameters
|
209 |
+
----------
|
210 |
+
competition_id : int
|
211 |
+
The ID of the competition.
|
212 |
+
season_id : int
|
213 |
+
The ID of the season.
|
214 |
+
|
215 |
+
Returns
|
216 |
+
-------
|
217 |
+
pd.DataFrame
|
218 |
+
A dataframe containing all available games. See
|
219 |
+
:class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema.
|
220 |
+
"""
|
221 |
+
path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_matches"])
|
222 |
+
df_matches = pd.DataFrame(self.get(path))
|
223 |
+
return cast(DataFrame[WyscoutGameSchema], _convert_games(df_matches))
|
224 |
+
|
225 |
+
def _lineups(self, game_id: int) -> list[dict[str, Any]]:
|
226 |
+
competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]]
|
227 |
+
path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_matches"])
|
228 |
+
df_matches = pd.DataFrame(self.get(path)).set_index("wyId")
|
229 |
+
return list(df_matches.at[game_id, "teamsData"].values())
|
230 |
+
|
231 |
+
def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]:
|
232 |
+
"""Return a dataframe with both teams that participated in a game.
|
233 |
+
|
234 |
+
Parameters
|
235 |
+
----------
|
236 |
+
game_id : int
|
237 |
+
The ID of the game.
|
238 |
+
|
239 |
+
Returns
|
240 |
+
-------
|
241 |
+
pd.DataFrame
|
242 |
+
A dataframe containing both teams. See
|
243 |
+
:class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema.
|
244 |
+
"""
|
245 |
+
path = os.path.join(self.root, "teams.json")
|
246 |
+
df_teams = pd.DataFrame(self.get(path)).set_index("wyId")
|
247 |
+
df_teams_match_id = pd.DataFrame(self._lineups(game_id))["teamId"]
|
248 |
+
df_teams_match = df_teams.loc[df_teams_match_id].reset_index()
|
249 |
+
return cast(DataFrame[WyscoutTeamSchema], _convert_teams(df_teams_match))
|
250 |
+
|
251 |
+
    def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Combines the lineup and bench entries from the match file with the
        global ``players.json`` metadata, then computes minutes played from
        the season's event stream.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema.
        """
        path = os.path.join(self.root, "players.json")
        df_players = pd.DataFrame(self.get(path)).set_index("wyId")
        lineups = self._lineups(game_id)
        players_match = []
        for team in lineups:
            playerlist = team["formation"]["lineup"]
            # NOTE(review): the raw feed appears to store the string "null"
            # when there were no substitutions — confirm against the data.
            if team["formation"]["substitutions"] != "null":
                # Add each substitute (found on the bench) to the player list.
                for p in team["formation"]["substitutions"]:
                    try:
                        playerlist.append(
                            next(
                                item
                                for item in team["formation"]["bench"]
                                if item["playerId"] == p["playerIn"]
                            )
                        )
                    except StopIteration:
                        # Substitute missing from the bench: warn and continue
                        # rather than failing the whole game.
                        warnings.warn(
                            f'A player with ID={p["playerIn"]} was substituted '
                            f'in the {p["minute"]}th minute of game {game_id}, but '
                            "could not be found on the bench."
                        )
            df = pd.DataFrame(playerlist)
            df["side"] = team["side"]
            df["team_id"] = team["teamId"]
            players_match.append(df)
        # Join per-game lineup rows with the global player metadata.
        df_players_match = (
            pd.concat(players_match)
            .rename(columns={"playerId": "wyId"})
            .set_index("wyId")
            .join(df_players, how="left")
        )
        df_players_match.reset_index(inplace=True)
        # Player names are stored with unicode escape sequences in the raw data.
        for c in ["shortName", "lastName", "firstName"]:
            df_players_match[c] = df_players_match[c].apply(
                lambda x: x.encode().decode("unicode-escape")
            )
        df_players_match = _convert_players(df_players_match)

        # get minutes played
        competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]]
        path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_events"])
        if self._cache is not None and self._cache["path"] == path:
            df_events = self._cache["events"]
        else:
            df_events = pd.DataFrame(self.get(path)).set_index("matchId")
            # avoid that this large json file has to be parsed again for
            # each game when loading a batch of games from the same season
            self._cache = {"path": path, "events": df_events}
        match_events = df_events.loc[game_id].reset_index().to_dict("records")
        mp = _get_minutes_played(lineups, match_events)
        # Right merge: keep exactly the players for whom minutes were computed.
        df_players_match = pd.merge(df_players_match, mp, on="player_id", how="right")
        df_players_match["minutes_played"] = df_players_match.minutes_played.fillna(0)
        df_players_match["game_id"] = game_id
        return cast(DataFrame[WyscoutPlayerSchema], df_players_match)
|
320 |
+
|
321 |
+
def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]:
|
322 |
+
"""Return a dataframe with the event stream of a game.
|
323 |
+
|
324 |
+
Parameters
|
325 |
+
----------
|
326 |
+
game_id : int
|
327 |
+
The ID of the game.
|
328 |
+
|
329 |
+
Returns
|
330 |
+
-------
|
331 |
+
pd.DataFrame
|
332 |
+
A dataframe containing the event stream. See
|
333 |
+
:class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema.
|
334 |
+
"""
|
335 |
+
competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]]
|
336 |
+
path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_events"])
|
337 |
+
if self._cache is not None and self._cache["path"] == path:
|
338 |
+
df_events = self._cache["events"]
|
339 |
+
else:
|
340 |
+
df_events = pd.DataFrame(self.get(path)).set_index("matchId")
|
341 |
+
# avoid that this large json file has to be parsed again for
|
342 |
+
# each game when loading a batch of games from the same season
|
343 |
+
self._cache = {"path": path, "events": df_events}
|
344 |
+
return cast(
|
345 |
+
DataFrame[WyscoutEventSchema], _convert_events(df_events.loc[game_id].reset_index())
|
346 |
+
)
|
347 |
+
|
348 |
+
|
349 |
+
class WyscoutLoader(EventDataLoader):
    """Load event data either from a remote location or from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    getter : str or callable, default: "remote"
        "remote", "local" or a function that returns loads JSON data from a path.
    feeds : dict(str, str)
        Glob pattern for each feed that should be parsed. The default feeds for
        a "remote" getter are::

            {
                'competitions': 'competitions',
                'seasons': 'competitions/{season_id}/seasons',
                'games': 'seasons/{season_id}/matches',
                'events': 'matches/{game_id}/events?fetch=teams,players,match,substitutions'
            }

        The default feeds for a "local" getter are::

            {
                'competitions': 'competitions.json',
                'seasons': 'seasons_{competition_id}.json',
                'games': 'matches_{season_id}.json',
                'events': 'matches/events_{game_id}.json',
            }

    creds: dict, optional
        Login credentials in the format {"user": "", "passwd": ""}. Only used
        when getter is "remote".
    """

    # Base URL of the Wyscout REST API; used as the default root path.
    _wyscout_api: str = "https://apirest.wyscout.com/v2/"

    def __init__(
        self,
        root: str = _wyscout_api,
        getter: Union[str, Callable[[str], JSONType]] = "remote",
        feeds: Optional[dict[str, str]] = None,
        creds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root

        # Init credentials: fall back to environment variables when not given.
        if creds is None:
            creds = {
                "user": os.environ.get("WY_USERNAME", ""),
                "passwd": os.environ.get("WY_PASSWORD", ""),
            }

        # Init getter: self.get maps a path/URL to parsed JSON.
        if getter == "remote":
            self.get = _remoteloadjson
            if _has_auth(creds):
                _auth_remoteloadjson(creds["user"], creds["passwd"])
        elif getter == "local":
            self.get = _localloadjson
        else:
            # A user-supplied callable.
            self.get = getter  # type: ignore

        # Set up feeds
        if feeds is not None:
            self.feeds = feeds
        elif getter == "remote":
            self.feeds = {
                "seasons": "competitions/{competition_id}/seasons?fetch=competition",
                "games": "seasons/{season_id}/matches",
                "events": "matches/{game_id}/events?fetch=teams,players,match,coaches,referees,formations,substitutions",  # noqa: B950
            }
        elif getter == "local":
            self.feeds = {
                "competitions": "competitions.json",
                "seasons": "seasons_{competition_id}.json",
                "games": "matches_{season_id}.json",
                "events": "matches/events_{game_id}.json",
            }
        else:
            raise ValueError("No feeds specified.")

    def _get_file_or_url(
        self,
        feed: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> list[str]:
        """Resolve a feed's pattern to concrete file paths or a single URL.

        Unspecified IDs become ``*`` wildcards; if any wildcard remains, the
        pattern is globbed against ``self.root``.

        Raises
        ------
        MissingDataError
            If the glob pattern matches no files.
        """
        competition_id_glob = "*" if competition_id is None else competition_id
        season_id_glob = "*" if season_id is None else season_id
        game_id_glob = "*" if game_id is None else game_id
        glob_pattern = self.feeds[feed].format(
            competition_id=competition_id_glob, season_id=season_id_glob, game_id=game_id_glob
        )
        if "*" in glob_pattern:
            files = glob.glob(os.path.join(self.root, glob_pattern))
            if len(files) == 0:
                raise MissingDataError
            return files
        return [glob_pattern]

    def competitions(
        self, competition_id: Optional[int] = None
    ) -> DataFrame[WyscoutCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Parameters
        ----------
        competition_id : int, optional
            The ID of the competition.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema.
        """
        # Get all competitions
        if "competitions" in self.feeds:
            competitions_url = self._get_file_or_url("competitions")[0]
            path = os.path.join(self.root, competitions_url)
            obj = self.get(path)
            if not isinstance(obj, dict) or "competitions" not in obj:
                raise ParseError(f"{path} should contain a list of competitions")
            seasons_urls = [
                self._get_file_or_url("seasons", competition_id=c["wyId"])[0]
                for c in obj["competitions"]
            ]
        else:
            seasons_urls = self._get_file_or_url("seasons", competition_id=competition_id)
        # Get seasons in each competition
        competitions = []
        seasons = []
        for seasons_url in seasons_urls:
            try:
                path = os.path.join(self.root, seasons_url)
                obj = self.get(path)
                if not isinstance(obj, dict) or "competition" not in obj or "seasons" not in obj:
                    raise ParseError(
                        f"{path} should contain a list of competition and list of seasons"
                    )
                competitions.append(obj["competition"])
                seasons.extend([s["season"] for s in obj["seasons"]])
            except FileNotFoundError:
                # Missing season files are skipped with a warning.
                warnings.warn(f"File not found: {seasons_url}")
        df_competitions = _convert_competitions(pd.DataFrame(competitions))
        df_seasons = _convert_seasons(pd.DataFrame(seasons))
        # Merge into a single dataframe
        return cast(
            DataFrame[WyscoutCompetitionSchema],
            pd.merge(df_competitions, df_seasons, on="competition_id"),
        )

    def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema.
        """
        # Get all games
        if "games" in self.feeds:
            games_url = self._get_file_or_url(
                "games", competition_id=competition_id, season_id=season_id
            )[0]
            path = os.path.join(self.root, games_url)
            obj = self.get(path)
            if not isinstance(obj, dict) or "matches" not in obj:
                raise ParseError(f"{path} should contain a list of matches")
            gamedetails_urls = [
                self._get_file_or_url(
                    "events",
                    competition_id=competition_id,
                    season_id=season_id,
                    game_id=g["matchId"],
                )[0]
                for g in obj["matches"]
            ]
        else:
            # No games feed: fall back to globbing the per-game event files.
            gamedetails_urls = self._get_file_or_url(
                "events", competition_id=competition_id, season_id=season_id
            )
        games = []
        for gamedetails_url in gamedetails_urls:
            try:
                path = os.path.join(self.root, gamedetails_url)
                obj = self.get(path)
                if not isinstance(obj, dict) or "match" not in obj:
                    raise ParseError(f"{path} should contain a match")
                games.append(obj["match"])
            except FileNotFoundError:
                warnings.warn(f"File not found: {gamedetails_url}")
            except HTTPError:
                warnings.warn(f"Resource not found: {gamedetails_url}")
        df_games = _convert_games(pd.DataFrame(games))
        return cast(DataFrame[WyscoutGameSchema], df_games)

    def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        if not isinstance(obj, dict) or "teams" not in obj:
            raise ParseError(f"{path} should contain a list of matches")
        teams = [t["team"] for t in obj["teams"].values() if t.get("team")]
        df_teams = _convert_teams(pd.DataFrame(teams))
        return cast(DataFrame[WyscoutTeamSchema], df_teams)

    def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        if not isinstance(obj, dict) or "players" not in obj:
            raise ParseError(f"{path} should contain a list of players")
        players = [
            player["player"]
            for team in obj["players"].values()
            for player in team
            if player.get("player")
        ]
        df_players = _convert_players(pd.DataFrame(players).drop_duplicates("wyId"))
        # Right merge: keep exactly the players for whom minutes were computed.
        df_players = pd.merge(
            df_players,
            _get_minutes_played(obj["match"]["teamsData"], obj["events"]),
            on="player_id",
            how="right",
        )
        df_players["minutes_played"] = df_players.minutes_played.fillna(0)
        df_players["game_id"] = game_id
        return cast(DataFrame[WyscoutPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        if not isinstance(obj, dict) or "events" not in obj:
            raise ParseError(f"{path} should contain a list of events")
        df_events = _convert_events(pd.DataFrame(obj["events"]))
        return cast(DataFrame[WyscoutEventSchema], df_events)
|
660 |
+
|
661 |
+
|
662 |
+
def _convert_competitions(competitions: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw Wyscout competition dataframe to the SPADL column layout."""
    # Wyscout leaves the area name empty for international competitions.
    competitions["country_name"] = competitions.apply(
        lambda row: row.area["name"] if row.area["name"] != "" else "International", axis=1
    )
    renamed = competitions.rename(
        columns={
            "wyId": "competition_id",
            "name": "competition_name",
            "gender": "competition_gender",
        }
    )
    return renamed[["competition_id", "competition_name", "country_name", "competition_gender"]]
|
674 |
+
|
675 |
+
|
676 |
+
def _convert_seasons(seasons: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw Wyscout season dataframe to the SPADL column layout."""
    renamed = seasons.rename(
        columns={
            "wyId": "season_id",
            "name": "season_name",
            "competitionId": "competition_id",
        }
    )
    return renamed[["season_id", "season_name", "competition_id"]]
|
685 |
+
|
686 |
+
|
687 |
+
def _convert_games(matches: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw Wyscout match dataframe to the SPADL column layout."""
    renamed = matches.rename(
        columns={
            "wyId": "game_id",
            "dateutc": "game_date",
            "competitionId": "competition_id",
            "seasonId": "season_id",
            "gameweek": "game_day",
        }
    )
    games = renamed[["game_id", "competition_id", "season_id", "game_date", "game_day"]]
    games["game_date"] = pd.to_datetime(games["game_date"])
    # ``teamsData`` maps team IDs to per-team info, including which side
    # ("home"/"away") the team played on.
    games["home_team_id"] = matches.teamsData.apply(lambda td: _get_team_id(td, "home"))
    games["away_team_id"] = matches.teamsData.apply(lambda td: _get_team_id(td, "away"))
    return games
|
701 |
+
|
702 |
+
|
703 |
+
def _get_team_id(teamsData: dict[int, Any], side: str) -> int:
    """Return the ID of the team that played on the given side.

    Parameters
    ----------
    teamsData : dict
        Mapping of team IDs to the raw per-team match data; each value has a
        "side" entry.
    side : str
        The side to look up ("home" or "away").

    Returns
    -------
    int
        The matching team's ID.

    Raises
    ------
    ValueError
        If no team in ``teamsData`` played on the requested side.
    """
    for team_id, data in teamsData.items():
        if data["side"] == side:
            return int(team_id)
    # An empty ValueError gives no clue which game/side failed; include both.
    raise ValueError(f"No team playing on the '{side}' side found in teamsData.")
|
708 |
+
|
709 |
+
|
710 |
+
def _convert_players(players: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw Wyscout player dataframe to the SPADL column layout."""
    column_map = {
        "wyId": "player_id",
        "shortName": "nickname",
        "firstName": "firstname",
        "lastName": "lastname",
        "birthDate": "birth_date",
    }
    converted = players.rename(columns=column_map)[
        ["player_id", "nickname", "firstname", "lastname", "birth_date"]
    ]
    # The full name is first and last name joined by a space.
    converted["player_name"] = converted[["firstname", "lastname"]].agg(" ".join, axis=1)
    converted["birth_date"] = pd.to_datetime(converted["birth_date"])
    return converted
|
723 |
+
|
724 |
+
|
725 |
+
def _convert_teams(teams: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw Wyscout team dataframe to the SPADL column layout."""
    column_map = {
        "wyId": "team_id",
        "name": "team_name_short",
        "officialName": "team_name",
    }
    return teams.rename(columns=column_map)[["team_id", "team_name_short", "team_name"]]
|
733 |
+
|
734 |
+
|
735 |
+
def _convert_events(raw_events: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw Wyscout event dataframe to the SPADL column layout.

    Note the column shuffle below: Wyscout's camelCase ``eventId`` becomes
    ``event_id`` after snake-casing and is the source of ``type_id``; it is
    then deleted so that the original ``id`` column can be renamed to
    ``event_id`` at the end.
    """
    eventmapping = {
        "id": "event_id",
        "match_id": "game_id",
        "event_name": "type_name",
        "sub_event_name": "subtype_name",
    }
    cols = [
        "event_id",
        "game_id",
        "period_id",
        "milliseconds",
        "team_id",
        "player_id",
        "type_id",
        "type_name",
        "subtype_id",
        "subtype_name",
        "positions",
        "tags",
    ]
    events = raw_events.copy()
    # Camel case to snake case column names
    pattern = re.compile(r"(?<!^)(?=[A-Z])")
    events.columns = [pattern.sub("_", c).lower() for c in events.columns]
    # Numeric type/subtype IDs; missing or non-numeric values become 0.
    events["type_id"] = (
        pd.to_numeric(
            events["event_id"] if "event_id" in events.columns else None, errors="coerce"
        )
        .fillna(0)
        .astype(int)
    )
    del events["event_id"]
    events["subtype_id"] = (
        pd.to_numeric(
            events["sub_event_id"] if "sub_event_id" in events.columns else None, errors="coerce"
        )
        .fillna(0)
        .astype(int)
    )
    del events["sub_event_id"]
    # Map period labels ("1H", "2H", ...) to numeric period IDs.
    events["period_id"] = events.match_period.apply(lambda x: wyscout_periods[x])
    # Wyscout timestamps are in seconds since the start of the period.
    events["milliseconds"] = events.event_sec * 1000
    return events.rename(columns=eventmapping)[cols]
|
780 |
+
|
781 |
+
|
782 |
+
def _get_minutes_played(
    teamsData: list[dict[str, Any]], events: list[dict[str, Any]]
) -> pd.DataFrame:
    """Compute minutes played per player from lineups, substitutions and red cards.

    Returns a dataframe with one row per player containing ``team_id``,
    ``player_id``, ``jersey_number``, ``minutes_played`` and ``is_starter``.
    """
    # get duration of each period: the largest event timestamp (in seconds)
    # seen in a period approximates that period's length.
    periods_ts = {i: [0] for i in range(6)}
    for e in events:
        period_id = wyscout_periods[e["matchPeriod"]]
        periods_ts[period_id].append(e["eventSec"])
    # NOTE(review): only periods 0-4 are summed here although periods_ts has
    # six slots — confirm whether period 5 (penalties) is excluded on purpose.
    periods_duration = [
        round(max(periods_ts[i]) / 60) for i in range(5) if max(periods_ts[i]) != 0
    ]
    # get duration of entire match
    duration = sum(periods_duration)

    # get stats for each player
    playergames: dict[int, dict[str, Any]] = {}
    if isinstance(teamsData, dict):
        # Some feeds provide teamsData as a dict keyed by team ID.
        teamsData = list(teamsData.values())
    for teamData in teamsData:
        formation = teamData.get("formation", {})
        substitutions = formation.get("substitutions", [])
        # Minute at which each sent-off player left the pitch.
        red_cards = {
            player["playerId"]: _expand_minute(int(player["redCards"]), periods_duration)
            for key in ["bench", "lineup"]
            for player in formation.get(key, [])
            if player["redCards"] != "0"
        }
        # Starters play the full match unless sent off.
        pg = {
            player["playerId"]: {
                "team_id": teamData["teamId"],
                "player_id": player["playerId"],
                "jersey_number": player.get("shirtNumber", 0),
                "minutes_played": red_cards.get(player["playerId"], duration),
                "is_starter": True,
            }
            for player in formation.get("lineup", [])
        }

        # correct minutes played for substituted players
        if substitutions != "null":
            for substitution in substitutions:
                expanded_minute_sub = _expand_minute(substitution["minute"], periods_duration)
                substitute = {
                    "team_id": teamData["teamId"],
                    "player_id": substitution["playerIn"],
                    "jersey_number": next(
                        (
                            p.get("shirtNumber", 0)
                            for p in formation.get("bench", [])
                            if p["playerId"] == substitution["playerIn"]
                        ),
                        0,
                    ),
                    "minutes_played": duration - expanded_minute_sub,
                    "is_starter": False,
                }
                # A substitute who was later sent off only played until the
                # red card, not until the end of the match.
                if substitution["playerIn"] in red_cards:
                    substitute["minutes_played"] = (
                        red_cards[substitution["playerIn"]] - expanded_minute_sub
                    )
                pg[substitution["playerIn"]] = substitute
                pg[substitution["playerOut"]]["minutes_played"] = expanded_minute_sub

        playergames = {**playergames, **pg}
    return pd.DataFrame(playergames.values())
|
847 |
+
|
848 |
+
|
849 |
+
# Mapping of Wyscout period labels to numeric period IDs:
# 1H/2H are the regular halves, E1/E2 extra time, P the penalty shootout.
wyscout_periods = {"1H": 1, "2H": 2, "E1": 3, "E2": 4, "P": 5}
|
data/wyscout/schema.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""SPADL schema for Wyscout data."""
|
2 |
+
|
3 |
+
import pandera as pa
|
4 |
+
from pandera.typing import DateTime, Object, Series
|
5 |
+
|
6 |
+
from socceraction.data.schema import (
|
7 |
+
CompetitionSchema,
|
8 |
+
EventSchema,
|
9 |
+
GameSchema,
|
10 |
+
PlayerSchema,
|
11 |
+
TeamSchema,
|
12 |
+
)
|
13 |
+
|
14 |
+
|
15 |
+
class WyscoutCompetitionSchema(CompetitionSchema):
    """Definition of a dataframe containing a list of competitions and seasons."""

    # Country the competition takes place in; "International" when the raw
    # area name is empty.
    country_name: Series[str]
    # Gender of the competition's players (e.g. "male").
    competition_gender: Series[str]
|
20 |
+
|
21 |
+
|
22 |
+
class WyscoutGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games."""

    # No Wyscout-specific columns; inherits all fields from GameSchema.
|
24 |
+
|
25 |
+
|
26 |
+
class WyscoutPlayerSchema(PlayerSchema):
    """Definition of a dataframe containing the list of players of a game."""

    firstname: Series[str]
    lastname: Series[str]
    # Wyscout "shortName"; may be missing for some players.
    nickname: Series[str] = pa.Field(nullable=True)
    birth_date: Series[DateTime] = pa.Field(nullable=True)
|
33 |
+
|
34 |
+
|
35 |
+
class WyscoutTeamSchema(TeamSchema):
    """Definition of a dataframe containing the list of teams of a game."""

    # Short display name of the team (Wyscout "name").
    team_name_short: Series[str]
|
39 |
+
|
40 |
+
|
41 |
+
class WyscoutEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game."""

    # Time elapsed in the current period, in milliseconds.
    milliseconds: Series[float]
    subtype_id: Series[int]
    subtype_name: Series[str]
    # Raw Wyscout payloads; their inner structure is not validated here.
    positions: Series[Object]
    tags: Series[Object]
|
docs/_static/custom.css
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Brand link color. */
a {
  color: #26b079;
}

/* Highlight top-level TOC entries on hover/click. */
.toctree-l1 a:active,
.toctree-l1 a:hover {
  background-color: #676767;
}

.sidebar-logo {
  max-width: 100%;
}

/* Sidebar sizing for wide screens. */
.sidebar-drawer {
  width: calc(50% - 25em);
  min-width: 22em;
}

.sidebar-drawer .sidebar-container {
  width: 23em;
}

li.toctree-l2 {
  font-size: 80%;
}

/* Narrow screens: fixed-width sidebar, hidden off-canvas by default. */
@media (max-width: 67em) {
  .sidebar-drawer {
    width: 22em;
    left: -22em;
  }
  .sidebar-drawer .sidebar-container {
    width: 22em;
  }
  li.toctree-l2 {
    font-size: 75%;
  }
}

/* autosummary table text */
article .align-center,
article .align-default {
  text-align: left;
}

/* Rendered pandas DataFrame tables. */
table.dataframe {
  font-size: 80%;
  margin-left: 0;
  margin-right: 0;
}
|
docs/_static/decroos19.bibtex
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@inproceedings{Decroos2019actions,
|
2 |
+
author = {Decroos, Tom and Bransen, Lotte and Van Haaren, Jan and Davis, Jesse},
|
3 |
+
title = {Actions Speak Louder Than Goals: Valuing Player Actions in Soccer},
|
4 |
+
booktitle = {Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
|
5 |
+
series = {KDD '19},
|
6 |
+
year = {2019},
|
7 |
+
isbn = {978-1-4503-6201-6},
|
8 |
+
location = {Anchorage, AK, USA},
|
9 |
+
pages = {1851--1861},
|
10 |
+
numpages = {11},
|
11 |
+
url = {http://doi.acm.org/10.1145/3292500.3330758},
|
12 |
+
doi = {10.1145/3292500.3330758},
|
13 |
+
acmid = {3330758},
|
14 |
+
publisher = {ACM},
|
15 |
+
address = {New York, NY, USA},
|
16 |
+
keywords = {event stream data, probabilistic classification, soccer match data, sports analytics, valuing actions},
|
17 |
+
}
|
docs/_static/favicon.png
ADDED
![]() |
docs/_static/logo.ai
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8f4add7db11daf2a6f1c77c8d26f84c0a227bea40e9b607f2930d15b75ae99e
|
3 |
+
size 153178
|
docs/_static/logo.png
ADDED
![]() |
docs/_static/logo_white.png
ADDED
![]() |
docs/_static/vanroy20.bibtex
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@conference{vanroy2020,
|
2 |
+
author = {Van Roy, Maaike and Robberechts, Pieter and Decroos, Tom and Davis, Jesse},
|
3 |
+
title = {Valuing On-the-Ball Actions in Soccer: A Critical Comparison of xT and VAEP},
|
4 |
+
series = {AITS},
|
5 |
+
booktitle = {Proceedings of the {AAAI}-20 Workshop on Artifical Intelligence in Team Sports},
|
6 |
+
publisher = {AI in Team Sports Organising Committee},
|
7 |
+
month = {dec},
|
8 |
+
year = {2020},
|
9 |
+
abstract = {Objectively quantifying a soccer player's contributions within a match is a challenging and crucial task in soccer analytics. Many of the currently available metrics focus on measuring the quality of shots and assists only, although these represent less than 1% of all on-the-ball actions. Most recently, several approaches were proposed to bridge this gap. By valuing how actions increase or decrease the likelihood of yielding a goal, these models are effective tools for quantifying the performances of players for all sorts of actions. However, we lack an understanding of their differences, both conceptually and in practice. Therefore, this paper critically compares two such models: expected threat (xT) and valuing actions by estimating probabilities (VAEP). Both approaches exhibit variety in their design choices, that leads to different top player rankings and major differences in how they value specific actions.},
|
10 |
+
keywords = {soccer},
|
11 |
+
}
|
docs/_templates/class.rst
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{{ fullname | escape | underline}}
|
2 |
+
|
3 |
+
.. currentmodule:: {{ module }}
|
4 |
+
|
5 |
+
.. autoclass:: {{ objname }}
|
6 |
+
|
7 |
+
{% block attributes %}
|
8 |
+
{% if attributes %}
|
9 |
+
.. rubric:: Attributes
|
10 |
+
|
11 |
+
.. autosummary::
|
12 |
+
:nosignatures:
|
13 |
+
|
14 |
+
{% for item in attributes %}
|
15 |
+
~{{ name }}.{{ item }}
|
16 |
+
{%- endfor %}
|
17 |
+
|
18 |
+
{% endif %}
|
19 |
+
{% endblock %}
|
20 |
+
|
21 |
+
{% block methods %}
|
22 |
+
{% if methods %}
|
23 |
+
.. rubric:: Methods
|
24 |
+
|
25 |
+
.. autosummary::
|
26 |
+
:nosignatures:
|
27 |
+
:toctree: methods
|
28 |
+
|
29 |
+
{% for item in methods %}
|
30 |
+
{%- if item not in inherited_members %}
|
31 |
+
~{{ name }}.{{ item }}
|
32 |
+
{%- endif %}
|
33 |
+
{%- endfor %}
|
34 |
+
{% endif %}
|
35 |
+
|
36 |
+
{%- if members and '__call__' in members %}
|
37 |
+
~{{ name }}.__call__
|
38 |
+
{%- endif %}
|
39 |
+
|
40 |
+
{% endblock %}
|
docs/_templates/module.rst
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.. empty
|
2 |
+
|
3 |
+
{{ fullname | escape | underline }}
|
4 |
+
|
5 |
+
.. currentmodule:: {{ fullname }}
|
6 |
+
|
7 |
+
.. automodule:: {{ fullname }}
|
8 |
+
|
9 |
+
{% block classes %}
|
10 |
+
|
11 |
+
{% for item in classes %}
|
12 |
+
.. autoclass:: {{ item }}
|
13 |
+
:members:
|
14 |
+
:member-order: bysource
|
15 |
+
:show-inheritance:
|
16 |
+
:exclude-members:
|
17 |
+
{%- endfor %}
|
18 |
+
|
19 |
+
{% endblock %}
|
20 |
+
|
21 |
+
{% block functions %}
|
22 |
+
|
23 |
+
{% for item in functions %}
|
24 |
+
.. autofunction:: {{ item }}
|
25 |
+
{%- endfor %}
|
26 |
+
|
27 |
+
{% endblock %}
|
docs/_templates/schema.rst
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{{ fullname | escape | underline}}
|
2 |
+
|
3 |
+
.. currentmodule:: {{ module }}
|
4 |
+
|
5 |
+
.. autoclass:: {{ objname }}
|
6 |
+
|
7 |
+
{% block attributes %}
|
8 |
+
{% if attributes %}
|
9 |
+
.. rubric:: Attributes
|
10 |
+
|
11 |
+
.. autosummary::
|
12 |
+
:nosignatures:
|
13 |
+
|
14 |
+
{% for item in attributes %}
|
15 |
+
~{{ name }}.{{ item }}
|
16 |
+
{%- endfor %}
|
17 |
+
|
18 |
+
{% endif %}
|
19 |
+
{% endblock %}
|
docs/actions_bra-bel.png
ADDED
![]() |