scfive committed on
Commit
d6ea71e
·
verified ·
1 Parent(s): 902259f

Upload 203 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. CONTRIBUTING.rst +205 -0
  3. LICENSE.rst +22 -0
  4. Makefile +50 -0
  5. README.md +66 -12
  6. __init__.py +13 -0
  7. atomic/__init__.py +1 -0
  8. atomic/spadl/__init__.py +15 -0
  9. atomic/spadl/base.py +236 -0
  10. atomic/spadl/config.py +48 -0
  11. atomic/spadl/schema.py +32 -0
  12. atomic/spadl/utils.py +65 -0
  13. atomic/vaep/__init__.py +6 -0
  14. atomic/vaep/base.py +80 -0
  15. atomic/vaep/features.py +269 -0
  16. atomic/vaep/formula.py +142 -0
  17. atomic/vaep/labels.py +108 -0
  18. data/__init__.py +9 -0
  19. data/base.py +211 -0
  20. data/opta/__init__.py +19 -0
  21. data/opta/loader.py +478 -0
  22. data/opta/parsers/__init__.py +23 -0
  23. data/opta/parsers/base.py +179 -0
  24. data/opta/parsers/f1_json.py +103 -0
  25. data/opta/parsers/f24_json.py +123 -0
  26. data/opta/parsers/f24_xml.py +108 -0
  27. data/opta/parsers/f7_xml.py +250 -0
  28. data/opta/parsers/f9_json.py +302 -0
  29. data/opta/parsers/ma1_json.py +264 -0
  30. data/opta/parsers/ma3_json.py +355 -0
  31. data/opta/parsers/whoscored.py +421 -0
  32. data/opta/schema.py +86 -0
  33. data/schema.py +110 -0
  34. data/statsbomb/__init__.py +20 -0
  35. data/statsbomb/loader.py +495 -0
  36. data/statsbomb/schema.py +100 -0
  37. data/wyscout/__init__.py +20 -0
  38. data/wyscout/loader.py +849 -0
  39. data/wyscout/schema.py +48 -0
  40. docs/_static/custom.css +50 -0
  41. docs/_static/decroos19.bibtex +17 -0
  42. docs/_static/favicon.png +0 -0
  43. docs/_static/logo.ai +3 -0
  44. docs/_static/logo.png +0 -0
  45. docs/_static/logo_white.png +0 -0
  46. docs/_static/vanroy20.bibtex +11 -0
  47. docs/_templates/class.rst +40 -0
  48. docs/_templates/module.rst +27 -0
  49. docs/_templates/schema.rst +19 -0
  50. docs/actions_bra-bel.png +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/_static/logo.ai filter=lfs diff=lfs merge=lfs -text
37
+ docs/documentation/valuing_actions/action_changes_gamestate.png filter=lfs diff=lfs merge=lfs -text
38
+ docs/documentation/valuing_actions/action.gif filter=lfs diff=lfs merge=lfs -text
39
+ docs/documentation/valuing_actions/default_xt_grid.png filter=lfs diff=lfs merge=lfs -text
CONTRIBUTING.rst ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Contributor guide
2
+ =================
3
+
4
+ This document lays out guidelines and advice for contributing to this project.
5
+ If you're thinking of contributing, please start by reading this document and
6
+ getting a feel for how contributing to this project works. If you have any
7
+ questions, feel free to reach out to `Pieter Robberechts`_, the primary maintainer.
8
+
9
+ .. _Pieter Robberechts: http://www.cs.kuleuven.be/cgi-bin/e-post.pl?epost=Pieter.Robberechts
10
+
11
+ The guide is split into sections based on the type of contribution you're
12
+ thinking of making.
13
+
14
+
15
+ .. _bug-reports:
16
+
17
+ Bug reports
18
+ -----------
19
+
20
+ Bug reports are hugely important! Before you raise one, though, please check
21
+ through the `GitHub issues`_, **both open and closed**, to confirm that the bug
22
+ hasn't been reported before.
23
+
24
+ When filing an issue, make sure to answer these questions:
25
+
26
+ - Which Python version are you using?
27
+ - Which version of socceraction are you using?
28
+ - What did you do?
29
+ - What did you expect to see?
30
+ - What did you see instead?
31
+
32
+ The best way to get your bug fixed is to provide a test case,
33
+ and/or steps to reproduce the issue.
34
+
35
+ .. _GitHub issues: https://github.com/ML-KULeuven/socceraction/issues
36
+
37
+
38
+ Feature requests
39
+ ----------------
40
+
41
+ Socceraction is not actively developed. Its primary use is to enable
42
+ reproducibility of our research. If you believe there is a feature missing,
43
+ feel free to raise a feature request on the `Issue Tracker`_, but please do be
44
+ aware that the overwhelming likelihood is that your feature request will not
45
+ be accepted.
46
+
47
+ .. _Issue tracker: https://github.com/ML-KULeuven/socceraction/issues
48
+
49
+
50
+ Documentation contributions
51
+ ---------------------------
52
+
53
+ Documentation improvements are always welcome! The documentation files live in
54
+ the ``docs/`` directory of the codebase. They're written in
55
+ `reStructuredText`_, and use `Sphinx`_ to generate the full suite of
56
+ documentation.
57
+
58
+ You do not have to set up a development environment to make small changes to
59
+ the docs. Instead, you can `edit files directly on GitHub`_ and suggest changes.
60
+
61
+ When contributing documentation, please do your best to follow the style of the
62
+ documentation files. This means a soft-limit of 79 characters wide in your text
63
+ files and a semiformal, yet friendly and approachable, prose style.
64
+
65
+ When presenting Python code, use double-quoted strings (``"hello"`` instead of
66
+ ``'hello'``).
67
+
68
+ .. _reStructuredText: http://docutils.sourceforge.net/rst.html
69
+ .. _Sphinx: http://sphinx-doc.org/index.html
70
+ .. _edit files directly on GitHub: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files
71
+
72
+
73
+ Code contributions
74
+ ------------------
75
+
76
+ If you intend to contribute code, do not feel the need to sit on your
77
+ contribution until it is perfectly polished and complete. It helps everyone
78
+ involved for you to seek feedback as early as you possibly can. Submitting an
79
+ early, unfinished version of your contribution for feedback can save you from
80
+ putting a lot of work into a contribution that is not suitable for the
81
+ project.
82
+
83
+ Setting up your development environment
84
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
85
+
86
+ You need Python 3.9+ and the following tools:
87
+
88
+ - Poetry_
89
+ - Nox_
90
+ - nox-poetry_
91
+
92
+ Install the package with development requirements:
93
+
94
+ .. code:: console
95
+
96
+ $ poetry install
97
+ $ poetry self add poetry-plugin-export
98
+
99
+ You can now run an interactive Python session.
100
+
101
+ .. code:: console
102
+
103
+ $ poetry run python
104
+
105
+ .. _Poetry: https://python-poetry.org/
106
+ .. _Nox: https://nox.thea.codes/
107
+ .. _nox-poetry: https://nox-poetry.readthedocs.io/
108
+
109
+ Steps for submitting code
110
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
111
+
112
+ When contributing code, you'll want to follow this checklist:
113
+
114
+ 1. Fork the repository on GitHub.
115
+ 2. Run the tests to confirm they all pass on your system. If they don't, you'll
116
+ need to investigate why they fail. If you're unable to diagnose this
117
+ yourself, raise it as a bug report.
118
+ 3. Write tests that demonstrate your bug or feature. Ensure that they fail.
119
+ 4. Make your change.
120
+ 5. Run the entire test suite again, confirming that all tests pass *including
121
+ the ones you just added*.
122
+ 6. Make sure your code follows the code style discussed below.
123
+ 7. Send a GitHub Pull Request to the main repository's ``master`` branch.
124
+ GitHub Pull Requests are the expected method of code collaboration on this
125
+ project.
126
+
127
+ Testing the project
128
+ ~~~~~~~~~~~~~~~~~~~
129
+
130
+ Download the test data:
131
+
132
+ .. code:: console
133
+
134
+ $ poetry run python tests/datasets/download.py
135
+
136
+ Run the full test suite:
137
+
138
+ .. code:: console
139
+
140
+ $ nox
141
+
142
+ List the available Nox sessions:
143
+
144
+ .. code:: console
145
+
146
+ $ nox --list-sessions
147
+
148
+ You can also run a specific Nox session.
149
+ For example, invoke the unit test suite like this:
150
+
151
+ .. code:: console
152
+
153
+ $ nox --session=tests
154
+
155
+ Unit tests are located in the ``tests`` directory,
156
+ and are written using the pytest_ testing framework.
157
+
158
+ .. _pytest: https://pytest.readthedocs.io/
159
+
160
+ Code style
161
+ ~~~~~~~~~~~~
162
+
163
+ The socceraction codebase uses the `PEP 8`_ code style. In addition, we have
164
+ a few guidelines:
165
+
166
+ - Line-length can exceed 79 characters, to 100, when convenient.
167
+ - Line-length can exceed 100 characters, when doing otherwise would be *terribly* inconvenient.
168
+ - Always use double-quoted strings (e.g. ``"soccer"``), unless a double-quote occurs within the string.
169
+
170
+ To ensure all code conforms to this format, you can format the code using the
171
+ pre-commit hooks.
172
+
173
+ .. code:: console
174
+
175
+ $ nox --session=pre-commit
176
+
177
+ Docstrings are to follow the `numpydoc guidelines`_.
178
+
179
+ .. _PEP 8: https://pep8.org/
180
+ .. _numpydoc guidelines: https://numpydoc.readthedocs.io/en/latest/format.html
181
+
182
+ Submitting changes
183
+ ~~~~~~~~~~~~~~~~~~
184
+
185
+ Open a `pull request`_ to submit changes to this project.
186
+
187
+ Your pull request needs to meet the following guidelines for acceptance:
188
+
189
+ - The Nox test suite must pass without errors and warnings.
190
+ - Include unit tests.
191
+ - If your changes add functionality, update the documentation accordingly.
192
+
193
+ Feel free to submit early, though. We can always iterate on this.
194
+
195
+ To run linting and code formatting checks before committing your change, you
196
+ can install pre-commit as a Git hook by running the following command:
197
+
198
+ .. code:: console
199
+
200
+ $ nox --session=pre-commit -- install
201
+
202
+ It is recommended to open an issue before starting work on anything.
203
+
204
+ .. _pull request: https://github.com/ML-KULeuven/socceraction/pulls
205
+ .. github-only
LICENSE.rst ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+ ===========
3
+
4
+ Copyright (c) 2019 KU Leuven Machine Learning Research Group
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ **The software is provided "as is", without warranty of any kind, express or
17
+ implied, including but not limited to the warranties of merchantability,
18
+ fitness for a particular purpose and noninfringement. In no event shall the
19
+ authors or copyright holders be liable for any claim, damages or other
20
+ liability, whether in an action of contract, tort or otherwise, arising from,
21
+ out of or in connection with the software or the use or other dealings in the
22
+ software.**
Makefile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: init test lint pretty notebooks precommit_install bump_major bump_minor bump_patch clean
2
+
3
+ BIN = .venv/bin/
4
+ CODE = socceraction
5
+
6
+ init:
7
+ python3 -m venv .venv
8
+ poetry install
9
+
10
+ tests/datasets/statsbomb/:
11
+ $(BIN)python tests/datasets/download.py --download-statsbomb --convert-statsbomb
12
+
13
+ tests/datasets/wyscout_public/:
14
+ $(BIN)python tests/datasets/download.py --download-wyscout --convert-wyscout
15
+
16
+ tests/datasets/spadl/:
17
+ $(BIN)python tests/datasets/download.py --spadl
18
+
19
+ test: tests/datasets/statsbomb/ tests/datasets/wyscout_public/ tests/datasets/spadl/
20
+ nox -rs tests -- $(args)
21
+
22
+ mypy:
23
+ nox -rs mypy -- $(args)
24
+
25
+ lint:
26
+ nox -rs lint -- $(args)
27
+
28
+ pretty:
29
+ nox -rs pre-commit -- $(args)
30
+
31
+ notebooks:
32
+ $(BIN)python -m nbconvert --execute --inplace --config=default.json public-notebooks/*.ipynb
33
+
34
+ precommit_install:
35
+ nox -rs pre-commit -- install
36
+
37
+ bump_major:
38
+ $(BIN)bumpversion major
39
+
40
+ bump_minor:
41
+ $(BIN)bumpversion minor
42
+
43
+ bump_patch:
44
+ $(BIN)bumpversion patch
45
+
46
+ clean:
47
+ find . -type f -name "*.py[co]" -delete
48
+ find . -type d -name "__pycache__" -delete
49
+ rm -rf tests/datasets/wyscout_public
50
+ rm -rf tests/datasets/statsbomb
README.md CHANGED
@@ -1,12 +1,66 @@
1
- ---
2
- title: Socr
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.42.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <img src="https://socceraction.readthedocs.io/en/latest/_static/logo_white.png" height="200">
3
+ <p>
4
+ <b>Convert soccer event stream data to the SPADL format<br/>and value on-the-ball player actions</b>
5
+ </p>
6
+ <br/>
7
+
8
+ [![PyPi](https://img.shields.io/pypi/v/socceraction.svg)](https://pypi.org/project/socceraction)
9
+ [![Python Version: 3.9+](https://img.shields.io/badge/Python-3.9+-blue.svg)](https://pypi.org/project/socceraction)
10
+ [![Downloads](https://img.shields.io/pypi/dm/socceraction.svg)](https://pypistats.org/packages/socceraction)
11
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://en.wikipedia.org/wiki/MIT_License)
12
+
13
+ [![Build Status](https://github.com/ML-KULeuven/socceraction/workflows/CI/badge.svg)](https://github.com/ML-KULeuven/socceraction/actions?workflow=CI)
14
+ [![Read the Docs](https://img.shields.io/readthedocs/socceraction/latest.svg?label=Read%20the%20Docs)](https://socceraction.readthedocs.io)
15
+ [![Code coverage](https://codecov.io/gh/ML-KULeuven/socceraction/branch/master/graph/badge.svg)](https://codecov.io/gh/ML-KULeuven/socceraction)
16
+
17
+ <br/>
18
+ <br/>
19
+ </div>
20
+
21
+ Socceraction is a Python package for objectively quantifying the impact of the individual actions performed by soccer players using event stream data. The general idea is to assign a value to each on-the-ball action based on the action's impact on the game outcome, while accounting for the context in which the action happened. The video below gives a quick two-minute introduction to action values.
22
+
23
+ <div align="center">
24
+
25
+ https://user-images.githubusercontent.com/2175271/136857714-1d2c8706-7f2f-449d-818f-0e67fbb75400.mp4
26
+
27
+ </div>
28
+
29
+ ## Features
30
+
31
+ Socceraction contains the following components:
32
+
33
+ - A set of API clients for **loading event stream data** from StatsBomb, Opta, Wyscout, Stats Perform and WhoScored as Pandas DataFrames using a unified data model. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/data/index.html)
34
+ - Converters for each of these providers' proprietary data formats to the **SPADL** and **atomic-SPADL** formats, which are unified and expressive languages for on-the-ball player actions. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/spadl/index.html)
35
+ - An implementation of the **Expected Threat (xT)** possession value framework. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/valuing_actions/xT.html)
36
+ - An implementation of the **VAEP** and **Atomic-VAEP** possession value frameworks. [Read more »](https://socceraction.readthedocs.io/en/latest/documentation/valuing_actions/vaep.html)
37
+
38
+ ## Installation / Getting started
39
+
40
+ The recommended way to install `socceraction` is to simply use pip. The latest version officially supports Python 3.9 - 3.12.
41
+
42
+ ```sh
43
+ $ pip install socceraction
44
+ ```
45
+
46
+ The folder [`public-notebooks`](https://github.com/ML-KULeuven/socceraction/tree/master/public-notebooks) provides a demo of the full pipeline from raw StatsBomb event stream data to action values and player ratings. More detailed installation/usage instructions can be found in the [Documentation](https://socceraction.readthedocs.io/en/latest/).
47
+
48
+ ## Contributing
49
+
50
+ All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. However, be aware that socceraction is not actively developed. Its primary use is to enable reproducibility of our research. If you believe there is a feature missing, feel free to raise a feature request, but please do be aware that the overwhelming likelihood is that your feature request will not be accepted.
51
+ To learn more on how to contribute, see the [Contributor Guide](https://socceraction.readthedocs.io/en/latest/development/developer_guide.html).
52
+
53
+ ## Research
54
+
55
+ If you make use of this package in your research, please consider citing the following papers:
56
+
57
+ - Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis. **Actions speak louder than goals: Valuing player actions in soccer.** In Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1851-1861. 2019. <br/>[ [pdf](http://doi.acm.org/10.1145/3292500.3330758) | [bibtex](https://github.com/ML-KULeuven/socceraction/blob/master/docs/_static/decroos19.bibtex) ]
58
+
59
+ - Maaike Van Roy, Pieter Robberechts, Tom Decroos, and Jesse Davis. **Valuing on-the-ball actions in soccer: a critical comparison of XT and VAEP.** In Proceedings of the AAAI-20 Workshop on Artificial Intelligence in Team Sports. AI in Team Sports Organising Committee, 2020. <br/>[ [pdf](https://limo.libis.be/primo-explore/fulldisplay?docid=LIRIAS2913207&context=L&vid=KULeuven&search_scope=ALL_CONTENT&tab=all_content_tab&lang=en_US) | [bibtex](https://github.com/ML-KULeuven/socceraction/blob/master/docs/_static/vanroy20.bibtex) ]
60
+
61
+ The Expected Threat (xT) framework was originally introduced by Karun Singh on his [blog](https://karun.in/blog/expected-threat.html) in 2019.
62
+
63
+ ## License
64
+
65
+ Distributed under the terms of the [MIT license](https://opensource.org/licenses/MIT),
66
+ socceraction is free and open source software. Although not strictly required, we appreciate it if you include a link to this repo or cite our research in your work if you make use of socceraction.
__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SoccerAction
3
+ ~~~~~~~~~~~~
4
+
5
+ SoccerAction is a Python package for objectively quantifying the impact of the
6
+ individual actions performed by soccer players using event stream data.
7
+
8
+ Full documentation is at <https://ml-kuleuven.github.io/socceraction/>.
9
+ :copyright: (c) 2020 by DTAI KU Leuven.
10
+ :license: MIT, see LICENSE for more details.
11
+ """
12
+
13
+ __version__ = "1.5.3"
atomic/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Implements Atomic-SPADL and the Atomic-VAEP framework."""
atomic/spadl/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implementation of the Atomic-SPADL language."""
2
+
3
+ __all__ = [
4
+ "convert_to_atomic",
5
+ "AtomicSPADLSchema",
6
+ "bodyparts_df",
7
+ "actiontypes_df",
8
+ "add_names",
9
+ "play_left_to_right",
10
+ ]
11
+
12
+ from .base import convert_to_atomic
13
+ from .config import actiontypes_df, bodyparts_df
14
+ from .schema import AtomicSPADLSchema
15
+ from .utils import add_names, play_left_to_right
atomic/spadl/base.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements a converter for regular SPADL actions to atomic actions."""
2
+
3
+ from typing import cast
4
+
5
+ import pandas as pd
6
+ from pandera.typing import DataFrame
7
+
8
+ import socceraction.spadl.config as _spadl
9
+ from socceraction.spadl.base import _add_dribbles
10
+ from socceraction.spadl.schema import SPADLSchema
11
+
12
+ from . import config as _atomicspadl
13
+ from .schema import AtomicSPADLSchema
14
+
15
+
16
+ def convert_to_atomic(actions: DataFrame[SPADLSchema]) -> DataFrame[AtomicSPADLSchema]:
17
+ """Convert regular SPADL actions to atomic actions.
18
+
19
+ Parameters
20
+ ----------
21
+ actions : pd.DataFrame
22
+ A SPADL dataframe.
23
+
24
+ Returns
25
+ -------
26
+ pd.DataFrame
27
+ The Atomic-SPADL dataframe.
28
+ """
29
+ atomic_actions = cast(pd.DataFrame, actions.copy())
30
+ atomic_actions = _extra_from_passes(atomic_actions)
31
+ atomic_actions = _add_dribbles(atomic_actions) # for some reason this adds more dribbles
32
+ atomic_actions = _extra_from_shots(atomic_actions)
33
+ atomic_actions = _extra_from_fouls(atomic_actions)
34
+ atomic_actions = _convert_columns(atomic_actions)
35
+ atomic_actions = _simplify(atomic_actions)
36
+ return cast(DataFrame[AtomicSPADLSchema], atomic_actions)
37
+
38
+
39
+ def _extra_from_passes(actions: pd.DataFrame) -> pd.DataFrame:
40
+ next_actions = actions.shift(-1)
41
+ same_team = actions.team_id == next_actions.team_id
42
+
43
+ passlike = [
44
+ "pass",
45
+ "cross",
46
+ "throw_in",
47
+ "freekick_short",
48
+ "freekick_crossed",
49
+ "corner_crossed",
50
+ "corner_short",
51
+ "clearance",
52
+ "goalkick",
53
+ ]
54
+ pass_ids = [_spadl.actiontypes.index(ty) for ty in passlike]
55
+
56
+ interceptionlike = [
57
+ "interception",
58
+ "tackle",
59
+ "keeper_punch",
60
+ "keeper_save",
61
+ "keeper_claim",
62
+ "keeper_pick_up",
63
+ ]
64
+ interception_ids = [_spadl.actiontypes.index(ty) for ty in interceptionlike]
65
+
66
+ samegame = actions.game_id == next_actions.game_id
67
+ sameperiod = actions.period_id == next_actions.period_id
68
+ # samephase = next_actions.time_seconds - actions.time_seconds < max_pass_duration
69
+ extra_idx = (
70
+ actions.type_id.isin(pass_ids)
71
+ & samegame
72
+ & sameperiod # & samephase
73
+ & ~next_actions.type_id.isin(interception_ids)
74
+ )
75
+
76
+ prev = actions[extra_idx]
77
+ nex = next_actions[extra_idx]
78
+
79
+ extra = pd.DataFrame()
80
+ extra["game_id"] = prev.game_id
81
+ extra["original_event_id"] = prev.original_event_id
82
+ extra["period_id"] = prev.period_id
83
+ extra["action_id"] = prev.action_id + 0.1
84
+ extra["time_seconds"] = (prev.time_seconds + nex.time_seconds) / 2
85
+ extra["start_x"] = prev.end_x
86
+ extra["start_y"] = prev.end_y
87
+ extra["end_x"] = prev.end_x
88
+ extra["end_y"] = prev.end_y
89
+ extra["bodypart_id"] = _atomicspadl.bodyparts.index("foot")
90
+ extra["result_id"] = -1
91
+
92
+ offside = prev.result_id == _spadl.results.index("offside")
93
+ out = ((nex.type_id == _atomicspadl.actiontypes.index("goalkick")) & (~same_team)) | (
94
+ nex.type_id == _atomicspadl.actiontypes.index("throw_in")
95
+ )
96
+ ar = _atomicspadl.actiontypes
97
+ extra["type_id"] = -1
98
+ extra["type_id"] = (
99
+ extra.type_id.mask(same_team, ar.index("receival"))
100
+ .mask(~same_team, ar.index("interception"))
101
+ .mask(out, ar.index("out"))
102
+ .mask(offside, ar.index("offside"))
103
+ )
104
+ is_interception = extra["type_id"] == ar.index("interception")
105
+ extra["team_id"] = prev.team_id.mask(is_interception, nex.team_id)
106
+ extra["player_id"] = nex.player_id.mask(out | offside, prev.player_id).astype(
107
+ prev.player_id.dtype
108
+ )
109
+
110
+ actions = pd.concat([actions, extra], ignore_index=True, sort=False)
111
+ actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)
112
+ actions["action_id"] = range(len(actions))
113
+ return actions
114
+
115
+
116
+ def _extra_from_shots(actions: pd.DataFrame) -> pd.DataFrame:
117
+ next_actions = actions.shift(-1)
118
+
119
+ shotlike = ["shot", "shot_freekick", "shot_penalty"]
120
+ shot_ids = [_spadl.actiontypes.index(ty) for ty in shotlike]
121
+
122
+ samegame = actions.game_id == next_actions.game_id
123
+ sameperiod = actions.period_id == next_actions.period_id
124
+
125
+ shot = actions.type_id.isin(shot_ids)
126
+ goal = shot & (actions.result_id == _spadl.results.index("success"))
127
+ owngoal = actions.result_id == _spadl.results.index("owngoal")
128
+ next_corner_goalkick = next_actions.type_id.isin(
129
+ [
130
+ _atomicspadl.actiontypes.index("corner_crossed"),
131
+ _atomicspadl.actiontypes.index("corner_short"),
132
+ _atomicspadl.actiontypes.index("goalkick"),
133
+ ]
134
+ )
135
+ out = shot & next_corner_goalkick & samegame & sameperiod
136
+
137
+ extra_idx = goal | owngoal | out
138
+ prev = actions[extra_idx]
139
+ # nex = next_actions[extra_idx]
140
+
141
+ extra = pd.DataFrame()
142
+ extra["game_id"] = prev.game_id
143
+ extra["original_event_id"] = prev.original_event_id
144
+ extra["period_id"] = prev.period_id
145
+ extra["action_id"] = prev.action_id + 0.1
146
+ extra["time_seconds"] = prev.time_seconds # + nex.time_seconds) / 2
147
+ extra["start_x"] = prev.end_x
148
+ extra["start_y"] = prev.end_y
149
+ extra["end_x"] = prev.end_x
150
+ extra["end_y"] = prev.end_y
151
+ extra["bodypart_id"] = prev.bodypart_id
152
+ extra["result_id"] = -1
153
+ extra["team_id"] = prev.team_id
154
+ extra["player_id"] = prev.player_id
155
+
156
+ ar = _atomicspadl.actiontypes
157
+ extra["type_id"] = -1
158
+ extra["type_id"] = (
159
+ extra.type_id.mask(out, ar.index("out"))
160
+ .mask(goal, ar.index("goal"))
161
+ .mask(owngoal, ar.index("owngoal"))
162
+ )
163
+ actions = pd.concat([actions, extra], ignore_index=True, sort=False)
164
+ actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)
165
+ actions["action_id"] = range(len(actions))
166
+ return actions
167
+
168
+
169
+ def _extra_from_fouls(actions: pd.DataFrame) -> pd.DataFrame:
170
+ yellow = actions.result_id == _spadl.results.index("yellow_card")
171
+ red = actions.result_id == _spadl.results.index("red_card")
172
+
173
+ prev = actions[yellow | red]
174
+ extra = pd.DataFrame()
175
+ extra["game_id"] = prev.game_id
176
+ extra["original_event_id"] = prev.original_event_id
177
+ extra["period_id"] = prev.period_id
178
+ extra["action_id"] = prev.action_id + 0.1
179
+ extra["time_seconds"] = prev.time_seconds # + nex.time_seconds) / 2
180
+ extra["start_x"] = prev.end_x
181
+ extra["start_y"] = prev.end_y
182
+ extra["end_x"] = prev.end_x
183
+ extra["end_y"] = prev.end_y
184
+ extra["bodypart_id"] = prev.bodypart_id
185
+ extra["result_id"] = -1
186
+ extra["team_id"] = prev.team_id
187
+ extra["player_id"] = prev.player_id
188
+
189
+ ar = _atomicspadl.actiontypes
190
+ extra["type_id"] = -1
191
+ extra["type_id"] = extra.type_id.mask(yellow, ar.index("yellow_card")).mask(
192
+ red, ar.index("red_card")
193
+ )
194
+ actions = pd.concat([actions, extra], ignore_index=True, sort=False)
195
+ actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)
196
+ actions["action_id"] = range(len(actions))
197
+ return actions
198
+
199
+
200
+ def _convert_columns(actions: pd.DataFrame) -> pd.DataFrame:
201
+ actions["x"] = actions.start_x
202
+ actions["y"] = actions.start_y
203
+ actions["dx"] = actions.end_x - actions.start_x
204
+ actions["dy"] = actions.end_y - actions.start_y
205
+ return actions[
206
+ [
207
+ "game_id",
208
+ "original_event_id",
209
+ "action_id",
210
+ "period_id",
211
+ "time_seconds",
212
+ "team_id",
213
+ "player_id",
214
+ "x",
215
+ "y",
216
+ "dx",
217
+ "dy",
218
+ "type_id",
219
+ "bodypart_id",
220
+ ]
221
+ ]
222
+
223
+
224
+ def _simplify(actions: pd.DataFrame) -> pd.DataFrame:
225
+ a = actions
226
+ ar = _atomicspadl.actiontypes
227
+
228
+ cornerlike = ["corner_crossed", "corner_short"]
229
+ corner_ids = [_spadl.actiontypes.index(ty) for ty in cornerlike]
230
+
231
+ freekicklike = ["freekick_crossed", "freekick_short", "shot_freekick"]
232
+ freekick_ids = [_spadl.actiontypes.index(ty) for ty in freekicklike]
233
+
234
+ a["type_id"] = a.type_id.mask(a.type_id.isin(corner_ids), ar.index("corner"))
235
+ a["type_id"] = a.type_id.mask(a.type_id.isin(freekick_ids), ar.index("freekick"))
236
+ return a
atomic/spadl/config.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration of the Atomic-SPADL language.
2
+
3
+ Attributes
4
+ ----------
5
+ field_length : float
6
+ The length of a pitch (in meters).
7
+ field_width : float
8
+ The width of a pitch (in meters).
9
+ bodyparts : list(str)
10
+ The bodyparts used in the Atomic-SPADL language.
11
+ actiontypes : list(str)
12
+ The action types used in the Atomic-SPADL language.
13
+
14
+ """
15
+
16
+ import pandas as pd
17
+
18
+ import socceraction.spadl.config as _spadl
19
+
20
+ field_length = _spadl.field_length
21
+ field_width = _spadl.field_width
22
+
23
+ bodyparts = _spadl.bodyparts
24
+ bodyparts_df = _spadl.bodyparts_df
25
+
26
+ actiontypes = _spadl.actiontypes + [
27
+ "receival",
28
+ "interception",
29
+ "out",
30
+ "offside",
31
+ "goal",
32
+ "owngoal",
33
+ "yellow_card",
34
+ "red_card",
35
+ "corner",
36
+ "freekick",
37
+ ]
38
+
39
+
40
+ def actiontypes_df() -> pd.DataFrame:
41
+ """Return a dataframe with the type id and type name of each Atomic-SPADL action type.
42
+
43
+ Returns
44
+ -------
45
+ pd.DataFrame
46
+ The 'type_id' and 'type_name' of each Atomic-SPADL action type.
47
+ """
48
+ return pd.DataFrame(list(enumerate(actiontypes)), columns=["type_id", "type_name"])
atomic/spadl/schema.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Schema for Atomic-SPADL actions."""
2
+
3
+ from typing import Any, Optional
4
+
5
+ import pandera as pa
6
+ from pandera.typing import Series
7
+
8
+ from . import config as spadlconfig
9
+
10
+
11
+ class AtomicSPADLSchema(pa.SchemaModel):
12
+ """Definition of an Atomic-SPADL dataframe."""
13
+
14
+ game_id: Series[Any] = pa.Field()
15
+ original_event_id: Series[Any] = pa.Field(nullable=True)
16
+ action_id: Series[int] = pa.Field()
17
+ period_id: Series[int] = pa.Field(ge=1, le=5)
18
+ time_seconds: Series[float] = pa.Field(ge=0)
19
+ team_id: Series[Any] = pa.Field()
20
+ player_id: Series[Any] = pa.Field()
21
+ x: Series[float] = pa.Field(ge=0, le=spadlconfig.field_length)
22
+ y: Series[float] = pa.Field(ge=0, le=spadlconfig.field_width)
23
+ dx: Series[float] = pa.Field(ge=-spadlconfig.field_length, le=spadlconfig.field_length)
24
+ dy: Series[float] = pa.Field(ge=-spadlconfig.field_width, le=spadlconfig.field_width)
25
+ bodypart_id: Series[int] = pa.Field(isin=spadlconfig.bodyparts_df().bodypart_id)
26
+ bodypart_name: Optional[Series[str]] = pa.Field(isin=spadlconfig.bodyparts_df().bodypart_name)
27
+ type_id: Series[int] = pa.Field(isin=spadlconfig.actiontypes_df().type_id)
28
+ type_name: Optional[Series[str]] = pa.Field(isin=spadlconfig.actiontypes_df().type_name)
29
+
30
+ class Config: # noqa: D106
31
+ strict = True
32
+ coerce = True
atomic/spadl/utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility functions for working with Atomic-SPADL dataframes."""
2
+
3
+ from typing import cast
4
+
5
+ from pandera.typing import DataFrame
6
+
7
+ from . import config as spadlconfig
8
+ from .schema import AtomicSPADLSchema
9
+
10
+
11
def add_names(actions: DataFrame[AtomicSPADLSchema]) -> DataFrame[AtomicSPADLSchema]:
    """Add the type name and bodypart name to an Atomic-SPADL dataframe.

    Any existing 'type_name' and 'bodypart_name' columns are discarded and
    rebuilt by left-merging the action type and body part lookup tables.

    Parameters
    ----------
    actions : pd.DataFrame
        An Atomic-SPADL dataframe.

    Returns
    -------
    pd.DataFrame
        The original dataframe with a 'type_name' and 'bodypart_name' column
        appended. The index of the input dataframe is preserved.
    """
    without_names = actions.drop(columns=["type_name", "bodypart_name"], errors="ignore")
    with_types = without_names.merge(spadlconfig.actiontypes_df(), how="left")
    with_bodyparts = with_types.merge(spadlconfig.bodyparts_df(), how="left")
    # merge() resets the index, so restore the caller's index afterwards
    named = with_bodyparts.set_index(actions.index)
    return cast(DataFrame[AtomicSPADLSchema], named)
32
+
33
+
34
def play_left_to_right(
    actions: DataFrame[AtomicSPADLSchema], home_team_id: int
) -> DataFrame[AtomicSPADLSchema]:
    """Perform all actions in the same playing direction.

    The location and displacement of every action that was not executed by
    the home team are mirrored, such that all actions are performed as if the
    team that executes the action plays from left to right.

    Parameters
    ----------
    actions : pd.DataFrame
        The Atomic-SPADL actions of a game.
    home_team_id : int
        The ID of the home team.

    Returns
    -------
    pd.DataFrame
        A copy of the actions, all performed left to right.

    See Also
    --------
    socceraction.atomic.vaep.features.play_left_to_right : For transforming gamestates.
    """
    is_away = actions.team_id != home_team_id
    away_actions = actions[is_away]

    flipped = actions.copy()
    # mirror the location around the centre of the field
    flipped.loc[is_away, "x"] = spadlconfig.field_length - away_actions["x"].values
    flipped.loc[is_away, "y"] = spadlconfig.field_width - away_actions["y"].values
    # a mirrored action moves in the opposite direction
    flipped.loc[is_away, "dx"] = -away_actions["dx"].values
    flipped.loc[is_away, "dy"] = -away_actions["dy"].values
    return flipped
atomic/vaep/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Implements the Atomic-VAEP framework."""
2
+
3
+ from . import features, formula, labels
4
+ from .base import AtomicVAEP
5
+
6
+ __all__ = ["AtomicVAEP", "features", "labels", "formula"]
atomic/vaep/base.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the Atomic-VAEP framework.
2
+
3
+ Attributes
4
+ ----------
5
+ xfns_default : list(callable)
6
+ The default VAEP features.
7
+
8
+ """
9
+
10
+ from typing import Optional
11
+
12
+ import socceraction.atomic.spadl as spadlcfg
13
+ from socceraction.vaep.base import VAEP
14
+
15
+ from . import features as fs
16
+ from . import formula as vaep
17
+ from . import labels as lab
18
+
19
# Feature transformers that AtomicVAEP falls back to when it is constructed
# without an explicit ``xfns`` argument.
xfns_default = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.time,
    fs.team,
    fs.time_delta,
    fs.location,
    fs.polar,
    fs.movement_polar,
    fs.direction,
    fs.goalscore,
]
33
+
34
+
35
class AtomicVAEP(VAEP):
    """
    An implementation of the VAEP framework for atomic actions.

    In contrast to the original VAEP framework [1]_ this extension
    distinguishes the contribution of the player who initiates the action
    (e.g., gives the pass) and the player who completes the action (e.g.,
    receives the pass) [2]_.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.atomic.vaep.features`)
        used to describe the game states. Uses
        :attr:`~socceraction.atomic.vaep.base.xfns_default` if None.
    nb_prev_actions : int, default=3
        Number of previous actions used to describe the game state.

    See Also
    --------
    :class:`socceraction.vaep.VAEP` : Implementation of the original VAEP framework.

    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
        "Actions speak louder than goals: Valuing player actions in soccer." In
        Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
        Discovery & Data Mining, pp. 1851-1861. 2019.
    .. [2] Tom Decroos, Pieter Robberechts and Jesse Davis.
        "Introducing Atomic-SPADL: A New Way to Represent Event Stream Data".
        DTAI Sports Analytics Blog. https://dtai.cs.kuleuven.be/sports/blog/introducing-atomic-spadl:-a-new-way-to-represent-event-stream-data  # noqa
        May 2020.
    """

    # Swap in the atomic counterparts of the modules used by the base class.
    _spadlcfg = spadlcfg
    _lab = lab
    _fs = fs
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        if xfns is None:
            xfns = xfns_default
        super().__init__(xfns, nb_prev_actions)
atomic/vaep/features.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the feature transformers of the Atomic-VAEP framework."""
2
+
3
+ from typing import Any, Callable, Union
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from pandera.typing import DataFrame
8
+
9
+ import socceraction.atomic.spadl.config as atomicspadl
10
+ from socceraction.atomic.spadl import AtomicSPADLSchema
11
+ from socceraction.spadl import SPADLSchema
12
+ from socceraction.vaep.features import (
13
+ actiontype,
14
+ bodypart,
15
+ bodypart_detailed,
16
+ bodypart_detailed_onehot,
17
+ bodypart_onehot,
18
+ gamestates,
19
+ player_possession_time,
20
+ simple,
21
+ speed,
22
+ team,
23
+ time,
24
+ time_delta,
25
+ )
26
+
27
+ __all__ = [
28
+ "feature_column_names",
29
+ "play_left_to_right",
30
+ "gamestates",
31
+ "actiontype",
32
+ "actiontype_onehot",
33
+ "bodypart",
34
+ "bodypart_detailed",
35
+ "bodypart_onehot",
36
+ "bodypart_detailed_onehot",
37
+ "team",
38
+ "time",
39
+ "time_delta",
40
+ "speed",
41
+ "location",
42
+ "polar",
43
+ "movement_polar",
44
+ "direction",
45
+ "goalscore",
46
+ "player_possession_time",
47
+ ]
48
+
49
# A dataframe of actions, in either the SPADL or the Atomic-SPADL format.
Actions = Union[DataFrame[SPADLSchema], DataFrame[AtomicSPADLSchema]]
# A game state is represented as a list of action dataframes.
GameStates = list[Actions]
# A dataframe of computed features.
Features = DataFrame[Any]
# A feature transformer maps game states to a dataframe of features.
FeatureTransfomer = Callable[[GameStates], Features]
53
+
54
+
55
def feature_column_names(fs: list[FeatureTransfomer], nb_prev_actions: int = 3) -> list[str]:
    """Return the names of the features generated by a list of transformers.

    The names are obtained by running the transformers on a small dummy
    Atomic-SPADL dataframe and collecting the resulting column names.

    Parameters
    ----------
    fs : list(callable)
        A list of feature transformers.
    nb_prev_actions : int, default=3  # noqa: DAR103
        The number of previous actions included in the game state.

    Returns
    -------
    list(str)
        The name of each generated feature.
    """
    spadlcolumns = [
        "game_id",
        "original_event_id",
        "action_id",
        "period_id",
        "time_seconds",
        "team_id",
        "player_id",
        "x",
        "y",
        "dx",
        "dy",
        "bodypart_id",
        "bodypart_name",
        "type_id",
        "type_name",
    ]
    # ten all-zero rows are enough to run every transformer once
    dummy_actions = pd.DataFrame(np.zeros((10, len(spadlcolumns))), columns=spadlcolumns)
    # the "*_name" columns must hold strings rather than floats
    for name_column in (c for c in spadlcolumns if "name" in c):
        dummy_actions[name_column] = dummy_actions[name_column].astype(str)

    states = gamestates(dummy_actions, nb_prev_actions)  # type: ignore
    feature_frames = [transformer(states) for transformer in fs]
    return list(pd.concat(feature_frames, axis=1).columns)
93
+
94
+
95
def play_left_to_right(gamestates: GameStates, home_team_id: int) -> GameStates:
    """Perform all actions in the same playing direction.

    This changes the location and displacement of each action, such that all
    actions are performed as if the team plays from left to right. Whether a
    row is mirrored is decided by the team that executes the action in the
    first dataframe of the game states. The dataframes are modified in place.

    Parameters
    ----------
    gamestates : GameStates
        The game states of a game.
    home_team_id : int
        The ID of the home team.

    Returns
    -------
    list(pd.DataFrame)
        The game states with all actions performed left to right.
    """
    away_idx = gamestates[0].team_id != home_team_id
    for state in gamestates:
        away_rows = state[away_idx]
        # mirror the location around the centre of the field
        state.loc[away_idx, "x"] = atomicspadl.field_length - away_rows["x"].values
        state.loc[away_idx, "y"] = atomicspadl.field_width - away_rows["y"].values
        # a mirrored action moves in the opposite direction
        state.loc[away_idx, "dx"] = -away_rows["dx"].values
        state.loc[away_idx, "dy"] = -away_rows["dy"].values
    return gamestates
121
+
122
+
123
@simple
def actiontype_onehot(actions: Actions) -> Features:
    """Get the one-hot-encoded type of each action.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        A one-hot encoding of each action's type, with one boolean
        'actiontype_<type name>' column per Atomic-SPADL action type.
    """
    type_ids = actions["type_id"]
    onehot = {
        f"actiontype_{type_name}": type_ids == type_id
        for type_id, type_name in enumerate(atomicspadl.actiontypes)
    }
    return pd.DataFrame(onehot, index=actions.index)
142
+
143
+
144
@simple
def location(actions: Actions) -> Features:
    """Get the location of each action.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The 'x' and 'y' location of each action.
    """
    coordinate_columns = ["x", "y"]
    return actions[coordinate_columns]
159
+
160
+
161
+ _goal_x = atomicspadl.field_length
162
+ _goal_y = atomicspadl.field_width / 2
163
+
164
+
165
@simple
def polar(actions: Actions) -> Features:
    """Get the polar coordinates of each action's location.

    The center of the opponent's goal is used as the origin.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The 'dist_to_goal' and 'angle_to_goal' of each action.
    """
    goal_dx = (_goal_x - actions["x"]).abs().values
    goal_dy = (_goal_y - actions["y"]).abs().values
    distance = np.sqrt(goal_dx**2 + goal_dy**2)
    # dividing by zero is expected on the goal line; a 0/0 angle (NaN) becomes 0
    with np.errstate(divide="ignore", invalid="ignore"):
        angle = np.nan_to_num(np.arctan(goal_dy / goal_dx))
    return pd.DataFrame(
        {"dist_to_goal": distance, "angle_to_goal": angle},
        index=actions.index,
    )
188
+
189
+
190
@simple
def movement_polar(actions: Actions) -> Features:
    """Get the distance covered and direction of each action.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The distance covered ('mov_d') and direction ('mov_angle') of each action.
    """
    delta_x, delta_y = actions.dx, actions.dy
    movement = pd.DataFrame(index=actions.index)
    movement["mov_d"] = np.sqrt(delta_x**2 + delta_y**2)
    with np.errstate(divide="ignore", invalid="ignore"):
        movement["mov_angle"] = np.arctan2(delta_y, delta_x)
        no_vertical_movement = delta_y == 0
        movement.loc[no_vertical_movement, "mov_angle"] = 0  # fix float errors
    return movement
210
+
211
+
212
@simple
def direction(actions: Actions) -> Features:
    """Get the direction of the action as components of the unit vector.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The x-component ('dx') and y-component ('dy') of the unit
        vector of each action.
    """
    length = np.sqrt(actions.dx**2 + actions.dy**2)
    has_movement = length > 0

    unit = pd.DataFrame(index=actions.index)
    for component in ("dx", "dy"):
        # Normalising hides the end location and keeps only the direction of
        # the ball. Rows without movement keep their original value, which
        # also avoids using the result of a division by zero.
        unit[component] = actions[component].mask(has_movement, actions[component] / length)
    return unit
236
+
237
+
238
def goalscore(gamestates: GameStates) -> Features:
    """Get the number of goals scored by each team before the action.

    The score is derived from the 'goal' and 'owngoal' actions in the first
    dataframe of the game states; the action itself is not counted.

    Parameters
    ----------
    gamestates : GameStates
        The gamestates of a game.

    Returns
    -------
    Features
        The number of goals scored by the team performing the last action of the
        game state ('goalscore_team'), by the opponent ('goalscore_opponent'),
        and the goal difference between both teams ('goalscore_diff').
    """
    current = gamestates[0]
    # the team of the very first action acts as reference team "A"
    reference_team = current["team_id"].values[0]
    is_team_a = current["team_id"] == reference_team
    is_team_b = ~is_team_a

    is_goal = current.type_name == "goal"
    is_owngoal = current["type_name"].str.contains("owngoal")

    # an own goal counts for the other team
    scored_by_a = (is_goal & is_team_a) | (is_owngoal & is_team_b)
    scored_by_b = (is_goal & is_team_b) | (is_owngoal & is_team_a)
    # running totals, excluding the current action
    score_a = scored_by_a.cumsum() - scored_by_a
    score_b = scored_by_b.cumsum() - scored_by_b

    scoredf = pd.DataFrame(index=current.index)
    scoredf["goalscore_team"] = (score_a * is_team_a) + (score_b * is_team_b)
    scoredf["goalscore_opponent"] = (score_b * is_team_a) + (score_a * is_team_b)
    scoredf["goalscore_diff"] = scoredf["goalscore_team"] - scoredf["goalscore_opponent"]
    return scoredf
atomic/vaep/formula.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the formula of the Atomic-VAEP framework."""
2
+
3
+ import pandas as pd
4
+ from pandera.typing import DataFrame, Series
5
+
6
+ from socceraction.atomic.spadl import AtomicSPADLSchema
7
+
8
+
9
+ def _prev(x: pd.Series) -> pd.Series:
10
+ prev_x = x.shift(1)
11
+ prev_x[:1] = x.values[0]
12
+ return prev_x
13
+
14
+
15
def offensive_value(
    actions: DataFrame[AtomicSPADLSchema], scores: Series[float], concedes: Series[float]
) -> Series[float]:
    r"""Compute the offensive value of each action.

    VAEP defines the *offensive value* of an action as the change in scoring
    probability before and after the action.

    .. math::

      \Delta P_{score}(a_{i}, t) = P^{k}_{score}(S_i, t) - P^{k}_{score}(S_{i-1}, t)

    where :math:`P_{score}(S_i, t)` is the probability that team :math:`t`
    which possesses the ball in state :math:`S_i` will score in the next 10
    actions.

    Parameters
    ----------
    actions : pd.DataFrame
        Atomic-SPADL actions.
    scores : pd.Series
        The probability of scoring from each corresponding game state.
    concedes : pd.Series
        The probability of conceding from each corresponding game state.

    Returns
    -------
    pd.Series
        The offensive value of each action.
    """
    same_team_as_before = _prev(actions.team_id) == actions.team_id
    # When possession changed, the previous team's probability of conceding
    # is the current team's previous probability of scoring.
    from_own_possession = _prev(scores) * same_team_as_before
    from_opponent_possession = _prev(concedes) * (~same_team_as_before)
    prev_scores = from_own_possession + from_opponent_possession

    # if the previous action was a goal, the odds of scoring are now 0
    after_goal = _prev(actions.type_name).isin(["goal", "owngoal"])
    prev_scores[after_goal] = 0

    return scores - prev_scores
59
+
60
+
61
def defensive_value(
    actions: DataFrame[AtomicSPADLSchema], scores: Series[float], concedes: Series[float]
) -> Series[float]:
    r"""Compute the defensive value of each action.

    VAEP defines the *defensive value* of an action as the change in conceding
    probability.

    .. math::

      \Delta P_{concede}(a_{i}, t) = P^{k}_{concede}(S_i, t) - P^{k}_{concede}(S_{i-1}, t)

    where :math:`P_{concede}(S_i, t)` is the probability that team :math:`t`
    which possesses the ball in state :math:`S_i` will concede in the next 10
    actions.

    Parameters
    ----------
    actions : pd.DataFrame
        Atomic-SPADL actions.
    scores : pd.Series
        The probability of scoring from each corresponding game state.
    concedes : pd.Series
        The probability of conceding from each corresponding game state.

    Returns
    -------
    pd.Series
        The defensive value of each action, i.e. the negated change in
        conceding probability.
    """
    same_team_as_before = _prev(actions.team_id) == actions.team_id
    # When possession changed, the previous team's probability of scoring
    # is the current team's previous probability of conceding.
    from_own_possession = _prev(concedes) * same_team_as_before
    from_opponent_possession = _prev(scores) * (~same_team_as_before)
    prev_concedes = from_own_possession + from_opponent_possession

    # if the previous action was a goal, the odds of conceding are now 0
    after_goal = _prev(actions.type_name).isin(["goal", "owngoal"])
    prev_concedes[after_goal] = 0

    return -(concedes - prev_concedes)
105
+
106
+
107
def value(
    actions: DataFrame[AtomicSPADLSchema], Pscores: Series[float], Pconcedes: Series[float]
) -> pd.DataFrame:
    r"""Compute the offensive, defensive and VAEP value of each action.

    The total VAEP value of an action is the change in scoring probability
    minus the change in conceding probability.

    .. math::

      V_{VAEP}(a_i) = \Delta P_{score}(a_{i}, t) - \Delta P_{concede}(a_{i}, t)

    Because the defensive value is already the negated change in conceding
    probability, the VAEP value is computed as the sum of the offensive and
    the defensive value.

    Parameters
    ----------
    actions : pd.DataFrame
        Atomic-SPADL actions.
    Pscores : pd.Series
        The probability of scoring from each corresponding game state.
    Pconcedes : pd.Series
        The probability of conceding from each corresponding game state.

    Returns
    -------
    pd.DataFrame
        The 'offensive_value', 'defensive_value' and 'vaep_value' of each action.

    See Also
    --------
    :func:`~socceraction.atomic.vaep.formula.offensive_value`: The offensive value
    :func:`~socceraction.atomic.vaep.formula.defensive_value`: The defensive value
    """
    offensive = offensive_value(actions, Pscores, Pconcedes)
    defensive = defensive_value(actions, Pscores, Pconcedes)

    values = pd.DataFrame()
    values["offensive_value"] = offensive
    values["defensive_value"] = defensive
    values["vaep_value"] = values["offensive_value"] + values["defensive_value"]
    return values
atomic/vaep/labels.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the label transformers of the Atomic-VAEP framework."""
2
+
3
+ import pandas as pd
4
+ from pandera.typing import DataFrame
5
+
6
+ import socceraction.atomic.spadl.config as atomicspadl
7
+ from socceraction.atomic.spadl import AtomicSPADLSchema
8
+
9
+
10
def scores(actions: DataFrame[AtomicSPADLSchema], nr_actions: int = 10) -> pd.DataFrame:
    """Determine whether the team possessing the ball scored a goal within the next x actions.

    The current action and the ``nr_actions - 1`` actions that follow it are
    inspected. Near the end of the dataframe, the look-ahead window is padded
    with the last action.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game.
    nr_actions : int, default=10  # noqa: DAR103
        Number of actions after the current action to consider.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column 'scores' and a row for each action set to
        True if a goal was scored by the team possessing the ball within the
        next x actions; otherwise False. An empty input yields an empty
        dataframe.
    """
    goals = actions["type_id"] == atomicspadl.actiontypes.index("goal")
    owngoals = actions["type_id"] == atomicspadl.actiontypes.index("owngoal")
    team = actions["team_id"]

    # a goal action itself counts as scoring
    res = goals.copy()
    if len(actions) > 0:
        last_team, last_goal, last_owngoal = team.iloc[-1], goals.iloc[-1], owngoals.iloc[-1]
        for i in range(1, nr_actions):
            # Look i actions ahead. Rows without such an action are padded with
            # the last action; fill_value keeps the boolean dtype intact.
            future_team = team.shift(-i, fill_value=last_team)
            future_goal = goals.shift(-i, fill_value=last_goal)
            future_owngoal = owngoals.shift(-i, fill_value=last_owngoal)
            same_team = future_team == team
            # a goal by the same team or an own goal by the opponent
            res = res | (future_goal & same_team) | (future_owngoal & ~same_team)

    # Name the series explicitly: passing a differently named series together
    # with ``columns=`` to the DataFrame constructor would select a
    # non-existing column and produce only NaN values.
    return res.rename("scores").to_frame()
47
+
48
+
49
def concedes(actions: DataFrame[AtomicSPADLSchema], nr_actions: int = 10) -> pd.DataFrame:
    """Determine whether the team possessing the ball conceded a goal within the next x actions.

    The current action and the ``nr_actions - 1`` actions that follow it are
    inspected. Near the end of the dataframe, the look-ahead window is padded
    with the last action.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game.
    nr_actions : int, default=10  # noqa: DAR103
        Number of actions after the current action to consider.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column 'concedes' and a row for each action set to
        True if a goal was conceded by the team possessing the ball within the
        next x actions; otherwise False. An empty input yields an empty
        dataframe.
    """
    goals = actions["type_id"] == atomicspadl.actiontypes.index("goal")
    owngoals = actions["type_id"] == atomicspadl.actiontypes.index("owngoal")
    team = actions["team_id"]

    # an own goal action itself counts as conceding
    res = owngoals.copy()
    if len(actions) > 0:
        last_team, last_goal, last_owngoal = team.iloc[-1], goals.iloc[-1], owngoals.iloc[-1]
        for i in range(1, nr_actions):
            # Look i actions ahead. Rows without such an action are padded with
            # the last action; fill_value keeps the boolean dtype intact.
            future_team = team.shift(-i, fill_value=last_team)
            future_goal = goals.shift(-i, fill_value=last_goal)
            future_owngoal = owngoals.shift(-i, fill_value=last_owngoal)
            same_team = future_team == team
            # a goal by the opponent or an own goal by the same team
            res = res | (future_goal & ~same_team) | (future_owngoal & same_team)

    # Name the series explicitly: passing a differently named series together
    # with ``columns=`` to the DataFrame constructor would select a
    # non-existing column and produce only NaN values.
    return res.rename("concedes").to_frame()
86
+
87
+
88
def goal_from_shot(actions: DataFrame[AtomicSPADLSchema]) -> pd.DataFrame:
    """Determine whether a goal was scored from the current action.

    An action is labeled positively when it is a 'shot' that is directly
    followed by a 'goal' action. This label can be used to train an xG model.

    Parameters
    ----------
    actions : pd.DataFrame
        The actions of a game.

    Returns
    -------
    pd.DataFrame
        A dataframe with a column 'goal' and a row for each action set to
        True if a goal was scored from the current action; otherwise False.
    """
    shot_id = atomicspadl.actiontypes.index("shot")
    goal_id = atomicspadl.actiontypes.index("goal")

    is_shot = actions["type_id"] == shot_id
    next_is_goal = actions["type_id"].shift(-1) == goal_id
    return (is_shot & next_is_goal).rename("goal").to_frame()
data/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """Implements serializers for the event data of various providers."""
2
+
3
+ __all__ = [
4
+ "opta",
5
+ "statsbomb",
6
+ "wyscout",
7
+ ]
8
+
9
+ from . import opta, statsbomb, wyscout
data/base.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base class and utility functions for all event stream data serializers.
2
+
3
+ A serializer should extend the 'EventDataLoader' class to (down)load event
4
+ stream data.
5
+ """
6
+
7
+ import base64
8
+ import json
9
+ import warnings
10
+ from abc import ABC, abstractmethod
11
+ from typing import Any, Union
12
+ from urllib import request
13
+
14
+ from pandera.typing import DataFrame
15
+
16
+ JSONType = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
17
+
18
+
19
class ParseError(Exception):
    """Exception raised when a file is not correctly formatted.

    Intended to signal that the input data could be read but does not have
    the expected structure.
    """
21
+
22
+
23
class MissingDataError(Exception):
    """Exception raised when a field is missing in the input data.

    Intended to signal that the input data is well-formed but lacks a
    required field.
    """
25
+
26
+
27
class NoAuthWarning(UserWarning):
    """Warning raised when no user credentials are provided.

    Issued by ``_has_auth`` when the 'user' or 'passwd' entry is missing or
    empty, in which case only public data can be accessed.
    """
29
+
30
+
31
+ def _remoteloadjson(path: str) -> JSONType:
32
+ """Load JSON data from a URL.
33
+
34
+ Parameters
35
+ ----------
36
+ path : str
37
+ URL of the data source.
38
+
39
+ Returns
40
+ -------
41
+ JSONType
42
+ A dictionary with the loaded JSON data.
43
+ """
44
+ return json.loads(request.urlopen(path).read())
45
+
46
+
47
+ def _auth_remoteloadjson(user: str, passwd: str) -> None:
48
+ """Add a Authorization header to all requests.
49
+
50
+ Parameters
51
+ ----------
52
+ user : str
53
+ Username.
54
+ passwd : str
55
+ Password.
56
+ """
57
+ auth = base64.b64encode(f"{user}:{passwd}".encode())
58
+ opener = request.build_opener()
59
+ opener.addheaders = [("Authorization", f"Basic {auth.decode()}")]
60
+ request.install_opener(opener)
61
+
62
+
63
+ def _localloadjson(path: str) -> JSONType:
64
+ """Load a dictionary from a JSON's filepath.
65
+
66
+ Parameters
67
+ ----------
68
+ path : str
69
+ JSON's filepath.
70
+
71
+ Returns
72
+ -------
73
+ JSONType
74
+ A dictionary with the data loaded.
75
+ """
76
+ with open(path, encoding="utf-8") as fh:
77
+ return json.load(fh)
78
+
79
+
80
+ def _has_auth(creds: dict[str, str]) -> bool:
81
+ """Check if user credentials are provided.
82
+
83
+ Parameters
84
+ ----------
85
+ creds : dict
86
+ A dictionary with user credentials. It should contain "user" and
87
+ "passwd" keys.
88
+
89
+ Returns
90
+ -------
91
+ bool
92
+ True if user credentials are provided, False otherwise.
93
+ """
94
+ if creds.get("user") in [None, ""] or creds.get("passwd") in [None, ""]:
95
+ warnings.warn("Credentials were not supplied. Public data access only.", NoAuthWarning)
96
+ return False
97
+ return True
98
+
99
+
100
+ def _expand_minute(minute: int, periods_duration: list[int]) -> int:
101
+ """Expand a timestamp with injury time of previous periods.
102
+
103
+ Parameters
104
+ ----------
105
+ minute : int
106
+ Timestamp in minutes.
107
+ periods_duration : List[int]
108
+ Total duration of each period in minutes.
109
+
110
+ Returns
111
+ -------
112
+ int
113
+ Timestamp expanded with injury time.
114
+ """
115
+ expanded_minute = minute
116
+ periods_regular = [45, 45, 15, 15, 0]
117
+ for period in range(len(periods_duration) - 1):
118
+ if minute > sum(periods_regular[: period + 1]):
119
+ expanded_minute += periods_duration[period] - periods_regular[period]
120
+ else:
121
+ break
122
+ return expanded_minute
123
+
124
+
125
class EventDataLoader(ABC):
    """Load event data either from a remote location or from a local folder.

    This abstract base class defines the interface that each provider-specific
    loader has to implement.

    Parameters
    ----------
    root : str
        Root-path of the data.
    getter : str
        "remote" or "local"
    """

    @abstractmethod
    def competitions(self) -> DataFrame[Any]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.base.CompetitionSchema` for the schema.
        """

    @abstractmethod
    def games(self, competition_id: int, season_id: int) -> DataFrame[Any]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.base.GameSchema` for the schema.
        """

    @abstractmethod
    def teams(self, game_id: int) -> DataFrame[Any]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.base.TeamSchema` for the schema.
        """

    @abstractmethod
    def players(self, game_id: int) -> DataFrame[Any]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.base.PlayerSchema` for the schema.
        """

    @abstractmethod
    def events(self, game_id: int) -> DataFrame[Any]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.base.EventSchema` for the schema.
        """
data/opta/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for loading Opta event data."""
2
+
3
+ __all__ = [
4
+ "OptaLoader",
5
+ "OptaCompetitionSchema",
6
+ "OptaGameSchema",
7
+ "OptaPlayerSchema",
8
+ "OptaTeamSchema",
9
+ "OptaEventSchema",
10
+ ]
11
+
12
+ from .loader import OptaLoader
13
+ from .schema import (
14
+ OptaCompetitionSchema,
15
+ OptaEventSchema,
16
+ OptaGameSchema,
17
+ OptaPlayerSchema,
18
+ OptaTeamSchema,
19
+ )
data/opta/loader.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements serializers for Opta data."""
2
+
3
+ import copy
4
+ import datetime
5
+ import glob
6
+ import os
7
+ import re
8
+ import warnings
9
+ from collections.abc import Mapping
10
+ from pathlib import Path
11
+ from typing import Any, Optional, Union, cast
12
+
13
+ import pandas as pd # type: ignore
14
+ from pandera.typing import DataFrame
15
+
16
+ from socceraction.data.base import EventDataLoader
17
+
18
+ from .parsers import (
19
+ F1JSONParser,
20
+ F7XMLParser,
21
+ F9JSONParser,
22
+ F24JSONParser,
23
+ F24XMLParser,
24
+ MA1JSONParser,
25
+ MA3JSONParser,
26
+ OptaParser,
27
+ WhoScoredParser,
28
+ )
29
+ from .schema import (
30
+ OptaCompetitionSchema,
31
+ OptaEventSchema,
32
+ OptaGameSchema,
33
+ OptaPlayerSchema,
34
+ OptaTeamSchema,
35
+ )
36
+
37
# Registries that map a feed name to the parser class handling that feed.
# NOTE(review): presumably the loader picks one of these registries based on
# the configured parser type — confirm against OptaLoader.

# Parsers for feeds in JSON format.
_jsonparsers = {
    "f1": F1JSONParser,
    "f9": F9JSONParser,
    "f24": F24JSONParser,
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Parsers for feeds in XML format.
_xmlparsers = {
    "f7": F7XMLParser,
    "f24": F24XMLParser,
}

# Parsers for the MA1 and MA3 JSON feeds.
_statsperformparsers = {
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Parser for WhoScored data.
_whoscoredparsers = {
    "whoscored": WhoScoredParser,
}
58
+
59
+ _eventtypesdf = pd.DataFrame(
60
+ [
61
+ (1, "pass"),
62
+ (2, "offside pass"),
63
+ (3, "take on"),
64
+ (4, "foul"),
65
+ (5, "out"),
66
+ (6, "corner awarded"),
67
+ (7, "tackle"),
68
+ (8, "interception"),
69
+ (9, "turnover"),
70
+ (10, "save"),
71
+ (11, "claim"),
72
+ (12, "clearance"),
73
+ (13, "miss"),
74
+ (14, "post"),
75
+ (15, "attempt saved"),
76
+ (16, "goal"),
77
+ (17, "card"),
78
+ (18, "player off"),
79
+ (19, "player on"),
80
+ (20, "player retired"),
81
+ (21, "player returns"),
82
+ (22, "player becomes goalkeeper"),
83
+ (23, "goalkeeper becomes player"),
84
+ (24, "condition change"),
85
+ (25, "official change"),
86
+ (26, "unknown26"),
87
+ (27, "start delay"),
88
+ (28, "end delay"),
89
+ (29, "unknown29"),
90
+ (30, "end"),
91
+ (31, "unknown31"),
92
+ (32, "start"),
93
+ (33, "unknown33"),
94
+ (34, "team set up"),
95
+ (35, "player changed position"),
96
+ (36, "player changed jersey number"),
97
+ (37, "collection end"),
98
+ (38, "temp_goal"),
99
+ (39, "temp_attempt"),
100
+ (40, "formation change"),
101
+ (41, "punch"),
102
+ (42, "good skill"),
103
+ (43, "deleted event"),
104
+ (44, "aerial"),
105
+ (45, "challenge"),
106
+ (46, "unknown46"),
107
+ (47, "rescinded card"),
108
+ (48, "unknown46"),
109
+ (49, "ball recovery"),
110
+ (50, "dispossessed"),
111
+ (51, "error"),
112
+ (52, "keeper pick-up"),
113
+ (53, "cross not claimed"),
114
+ (54, "smother"),
115
+ (55, "offside provoked"),
116
+ (56, "shield ball opp"),
117
+ (57, "foul throw in"),
118
+ (58, "penalty faced"),
119
+ (59, "keeper sweeper"),
120
+ (60, "chance missed"),
121
+ (61, "ball touch"),
122
+ (62, "unknown62"),
123
+ (63, "temp_save"),
124
+ (64, "resume"),
125
+ (65, "contentious referee decision"),
126
+ (66, "possession data"),
127
+ (67, "50/50"),
128
+ (68, "referee drop ball"),
129
+ (69, "failed to block"),
130
+ (70, "injury time announcement"),
131
+ (71, "coach setup"),
132
+ (72, "caught offside"),
133
+ (73, "other ball contact"),
134
+ (74, "blocked pass"),
135
+ (75, "delayed start"),
136
+ (76, "early end"),
137
+ (77, "player off pitch"),
138
+ (78, "temp card"),
139
+ (79, "coverage interruption"),
140
+ (80, "drop of ball"),
141
+ (81, "obstacle"),
142
+ (83, "attempted tackle"),
143
+ (84, "deleted after review"),
144
+ (10000, "offside given"), # Seems specific to WhoScored
145
+ ],
146
+ columns=["type_id", "type_name"],
147
+ )
148
+
149
+
150
+ def _deepupdate(target: dict[Any, Any], src: dict[Any, Any]) -> None:
151
+ """Deep update target dict with src.
152
+
153
+ For each k,v in src: if k doesn't exist in target, it is deep copied from
154
+ src to target. Otherwise, if v is a list, target[k] is extended with
155
+ src[k]. If v is a set, target[k] is updated with v, If v is a dict,
156
+ recursively deep-update it.
157
+
158
+ Parameters
159
+ ----------
160
+ target: dict
161
+ The original dictionary which is updated.
162
+ src: dict
163
+ The dictionary with which `target` is updated.
164
+
165
+ Examples
166
+ --------
167
+ >>> t = {'name': 'ferry', 'hobbies': ['programming', 'sci-fi']}
168
+ >>> deepupdate(t, {'hobbies': ['gaming']})
169
+ >>> print(t)
170
+ {'name': 'ferry', 'hobbies': ['programming', 'sci-fi', 'gaming']}
171
+ """
172
+ for k, v in src.items():
173
+ if isinstance(v, list):
174
+ if k not in target:
175
+ target[k] = copy.deepcopy(v)
176
+ else:
177
+ target[k].extend(v)
178
+ elif isinstance(v, dict):
179
+ if k not in target:
180
+ target[k] = copy.deepcopy(v)
181
+ else:
182
+ _deepupdate(target[k], v)
183
+ elif isinstance(v, set):
184
+ if k not in target:
185
+ target[k] = v.copy()
186
+ else:
187
+ target[k].update(v.copy())
188
+ else:
189
+ target[k] = copy.copy(v)
190
+
191
+
192
+ def _extract_ids_from_path(path: str, pattern: str) -> dict[str, Union[str, int]]:
193
+ regex = re.compile(
194
+ ".+?"
195
+ + re.escape(pattern)
196
+ .replace(r"\{competition_id\}", r"(?P<competition_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)")
197
+ .replace(r"\{season_id\}", r"(?P<season_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)")
198
+ .replace(r"\{game_id\}", r"(?P<game_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)")
199
+ )
200
+ m = re.match(regex, path)
201
+ if m is None:
202
+ raise ValueError(f"The filepath {path} does not match the format {pattern}.")
203
+ ids = m.groupdict()
204
+ return {k: int(v) if v.isdigit() else v for k, v in ids.items()}
205
+
206
+
207
class OptaLoader(EventDataLoader):
    """Load Opta data feeds from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    parser : str or dict
        Either 'xml', 'json', 'statsperform', 'whoscored' or a dict with
        a custom parser for each feed. The default xml parser supports F7 and
        F24 feeds; the default json parser supports F1, F9 and F24 feeds, the
        StatsPerform parser supports MA1 and MA3 feeds. Custom parsers can be
        specified as::

            {
                'feed1_name': Feed1Parser
                'feed2_name': Feed2Parser
            }

        where Feed1Parser and Feed2Parser are classes implementing
        :class:`~socceraction.spadl.opta.OptaParser` and 'feed1_name' and
        'feed2_name' are a unique ID for each feed that matches to the keys in
        `feeds`.
    feeds : dict
        Glob pattern describing from which files the data from a specific game
        can be retrieved. For example, if files are named::

            f7-1-2021-17362.xml
            f24-1-2021-17362.xml

        use::

            feeds = {
                'f7':  "f7-{competition_id}-{season_id}-{game_id}.xml",
                'f24': "f24-{competition_id}-{season_id}-{game_id}.xml"
            }

        Feeds for which no parser is available trigger a warning and are
        skipped when loading data.

    Raises
    ------
    ValueError
        If an invalid parser is provided.
    """

    def __init__(
        self,
        root: str,
        parser: Union[str, Mapping[str, type[OptaParser]]] = "xml",
        feeds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root
        # For each built-in parser name: the available parser classes and
        # the feed patterns that are used when `feeds` is not given.
        builtin = {
            "json": (
                _jsonparsers,
                {
                    "f1": "f1-{competition_id}-{season_id}.json",
                    "f9": "f9-{competition_id}-{season_id}-{game_id}.json",
                    "f24": "f24-{competition_id}-{season_id}-{game_id}.json",
                },
            ),
            "xml": (
                _xmlparsers,
                {
                    "f7": "f7-{competition_id}-{season_id}-{game_id}.xml",
                    "f24": "f24-{competition_id}-{season_id}-{game_id}.xml",
                },
            ),
            "statsperform": (
                _statsperformparsers,
                {
                    "ma1": "ma1-{competition_id}-{season_id}.json",
                    "ma3": "ma3-{competition_id}-{season_id}-{game_id}.json",
                },
            ),
            "whoscored": (
                _whoscoredparsers,
                {
                    "whoscored": "{competition_id}-{season_id}-{game_id}.json",
                },
            ),
        }
        available: Mapping[str, type[OptaParser]]
        if isinstance(parser, str):
            if parser not in builtin:
                raise ValueError("Invalid parser provided.")
            available, default_feeds = builtin[parser]
            if feeds is None:
                feeds = default_feeds
        elif isinstance(parser, Mapping):
            # Any mapping is accepted, as promised by the type annotation.
            if feeds is None:
                raise ValueError("You must specify a feed for each parser.")
            available = parser
        else:
            raise ValueError("Invalid parser provided.")
        self.parsers = self._get_parsers_for_feeds(available, feeds)
        self.feeds = {k: str(Path(v)) for k, v in feeds.items()}

    def _get_parsers_for_feeds(
        self, available_parsers: Mapping[str, type[OptaParser]], feeds: dict[str, str]
    ) -> Mapping[str, type[OptaParser]]:
        """Select the appropriate parser for each feed.

        Parameters
        ----------
        available_parsers : dict(str, OptaParser)
            Dictionary with all available parsers.
        feeds : dict(str, str)
            All feeds that should be parsed.

        Returns
        -------
        dict(str, OptaParser)
            A mapping between all feeds that should be parsed and the
            corresponding parser class.

        Warns
        -----
        Raises a warning if there is no parser available for any of the
        provided feeds.
        """
        parsers = {}
        for feed in feeds:
            if feed in available_parsers:
                parsers[feed] = available_parsers[feed]
            else:
                warnings.warn(f"No parser available for {feed} feeds. This feed is ignored.")
        return parsers

    def _matching_files(
        self,
        competition_id: Union[int, str] = "*",
        season_id: Union[int, str] = "*",
        game_id: Union[int, str] = "*",
    ) -> list[tuple[type[OptaParser], str, dict[str, Union[str, int]]]]:
        """Find the feed files that match the given IDs.

        Parameters
        ----------
        competition_id, season_id, game_id : int or str
            The IDs to fill in in each feed pattern. The default ``"*"``
            acts as a glob wildcard.

        Returns
        -------
        list
            A ``(parser class, file path, IDs extracted from the path)``
            tuple for each matching file, in feed order.

        Raises
        ------
        ValueError
            If a file found by the glob does not match its feed pattern.
        """
        matches = []
        for feed, feed_pattern in self.feeds.items():
            parser_cls = self.parsers.get(feed)
            if parser_cls is None:
                # The user was already warned that this feed is ignored.
                continue
            glob_pattern = feed_pattern.format(
                competition_id=competition_id, season_id=season_id, game_id=game_id
            )
            for ffp in glob.glob(os.path.join(self.root, glob_pattern)):
                matches.append((parser_cls, ffp, _extract_ids_from_path(ffp, feed_pattern)))
        return matches

    def competitions(self) -> DataFrame[OptaCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.opta.OptaCompetitionSchema` for the schema.
        """
        data: dict[Any, dict[str, Any]] = {}
        loaded_seasons = set()
        for parser_cls, ffp, ids in self._matching_files():
            # For efficiency, we only parse one game for each season. This
            # only works if both the competition and season are part of
            # the file name.
            competition_id = ids.get("competition_id")
            season_id = ids.get("season_id")
            if competition_id is not None and season_id is not None:
                if (competition_id, season_id) in loaded_seasons:
                    continue
                loaded_seasons.add((competition_id, season_id))
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_competitions())
        return cast(DataFrame[OptaCompetitionSchema], pd.DataFrame(list(data.values())))

    def games(self, competition_id: int, season_id: int) -> DataFrame[OptaGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.opta.OptaGameSchema` for the schema.
        """
        data: dict[Any, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(
            competition_id=competition_id, season_id=season_id
        ):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_games())
        return cast(DataFrame[OptaGameSchema], pd.DataFrame(list(data.values())))

    def teams(self, game_id: int) -> DataFrame[OptaTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.opta.OptaTeamSchema` for the schema.
        """
        data: dict[Any, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(game_id=game_id):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_teams())
        return cast(DataFrame[OptaTeamSchema], pd.DataFrame(list(data.values())))

    def players(self, game_id: int) -> DataFrame[OptaPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.opta.OptaPlayerSchema` for the schema.
        """
        data: dict[Any, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(game_id=game_id):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_players())
        df_players = pd.DataFrame(list(data.values()))
        df_players["game_id"] = game_id
        return cast(DataFrame[OptaPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[OptaEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.opta.OptaEventSchema` for the schema.
        """
        data: dict[Any, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(game_id=game_id):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_events())

        sort_keys = ["game_id", "period_id", "minute", "second", "timestamp"]
        events = (
            pd.DataFrame(list(data.values()))
            .merge(_eventtypesdf, on="type_id", how="left")
            .sort_values(sort_keys, kind="mergesort")
            .reset_index(drop=True)
        )

        # sometimes pre-match events have -3, -2 and -1 seconds; clamp them
        # to 0 and re-sort (stable) on the corrected values
        events.loc[events.second < 0, "second"] = 0
        events = events.sort_values(sort_keys, kind="mergesort")

        # deleted events have a wrong datetime, which causes an
        # OutOfBoundsDatetime error
        events = events[events.type_id != 43]
        events = events[
            ~(
                (events.timestamp < datetime.datetime(1900, 1, 1))
                | (events.timestamp > datetime.datetime(2100, 1, 1))
            )
        ]

        return cast(DataFrame[OptaEventSchema], events)
data/opta/parsers/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Parsers for Opta(-derived) event streams."""
2
+
3
+ __all__ = [
4
+ "OptaParser",
5
+ "F1JSONParser",
6
+ "F9JSONParser",
7
+ "F24JSONParser",
8
+ "F7XMLParser",
9
+ "F24XMLParser",
10
+ "MA1JSONParser",
11
+ "MA3JSONParser",
12
+ "WhoScoredParser",
13
+ ]
14
+
15
+ from .base import OptaParser
16
+ from .f1_json import F1JSONParser
17
+ from .f7_xml import F7XMLParser
18
+ from .f9_json import F9JSONParser
19
+ from .f24_json import F24JSONParser
20
+ from .f24_xml import F24XMLParser
21
+ from .ma1_json import MA1JSONParser
22
+ from .ma3_json import MA3JSONParser
23
+ from .whoscored import WhoScoredParser
data/opta/parsers/base.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base class for all Opta(-derived) event stream parsers.
2
+
3
+ A parser reads a single data file and should extend the 'OptaParser' class to
4
+ extract data about competitions, games, players, teams and events that is
5
+ encoded in the file.
6
+
7
+ """
8
+
9
+ import json # type: ignore
10
+ from typing import Any, Optional
11
+
12
+ from lxml import objectify
13
+
14
+
15
class OptaParser:
    """Interface for parsers of a single Opta data file.

    Subclasses load the file in their constructor and override the
    ``extract_*`` methods for the kinds of data their feed contains. Every
    ``extract_*`` method defaults to an empty dictionary.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def __init__(self, path: str, **kwargs: Any) -> None:  # noqa: ANN401
        # The base class cannot read any file format itself.
        raise NotImplementedError

    def extract_competitions(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        return dict()

    def extract_games(self) -> dict[Any, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        return dict()

    def extract_teams(self) -> dict[Any, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        return dict()

    def extract_players(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        return dict()

    def extract_lineups(self) -> dict[Any, dict[str, Any]]:
        """Return a dictionary with the lineup of each team.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team's lineup in the data stream.
        """
        return dict()

    def extract_events(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        return dict()
92
+
93
+
94
class OptaJSONParser(OptaParser):
    """Parser base class for Opta feeds stored as JSON.

    The decoded JSON document is stored in the ``root`` attribute.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def __init__(self, path: str, **kwargs: Any) -> None:  # noqa: ANN401
        # Extra keyword arguments (the IDs from the file name) are accepted
        # but not used by this base class.
        with open(path, encoding="utf-8") as stream:
            raw = stream.read()
        self.root = json.loads(raw)
106
+
107
+
108
class OptaXMLParser(OptaParser):
    """Parser base class for Opta feeds stored as XML.

    The objectified XML document is stored in the ``root`` attribute.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def __init__(self, path: str, **kwargs: Any) -> None:  # noqa: ANN401
        # Read raw bytes so that lxml applies the encoding declared in the
        # XML prolog. Extra keyword arguments are accepted but not used.
        with open(path, "rb") as stream:
            payload = stream.read()
        self.root = objectify.fromstring(payload)
120
+
121
+
122
def assertget(dictionary: dict[str, Any], key: str) -> Any:  # noqa: ANN401
    """Return the value of the item with the specified key.

    In contrast to the default `get` method, this version will raise an
    assertion error if the given key is not present in the dict or if its
    value is None.

    Parameters
    ----------
    dictionary : dict
        A Python dictionary (or any object with a dict-like `get` method).
    key : str
        A key in the dictionary.

    Returns
    -------
    Any
        Returns the value for the specified key if the key is in the dictionary.

    Raises
    ------
    AssertionError
        If the given key could not be found in the dictionary.
    """
    value = dictionary.get(key)
    if value is None:
        # Raise explicitly instead of using an `assert` statement, which is
        # stripped when Python runs with -O and would let None through.
        raise AssertionError(f"KeyError: {key} not found in {dictionary}")
    return value
148
+
149
+
150
+ def _get_end_x(qualifiers: dict[int, Any]) -> Optional[float]:
151
+ try:
152
+ # pass
153
+ if 140 in qualifiers:
154
+ return float(qualifiers[140])
155
+ # blocked shot
156
+ if 146 in qualifiers:
157
+ return float(qualifiers[146])
158
+ # passed the goal line
159
+ if 102 in qualifiers:
160
+ return float(100)
161
+ return None
162
+ except ValueError:
163
+ return None
164
+
165
+
166
+ def _get_end_y(qualifiers: dict[int, Any]) -> Optional[float]:
167
+ try:
168
+ # pass
169
+ if 141 in qualifiers:
170
+ return float(qualifiers[141])
171
+ # blocked shot
172
+ if 147 in qualifiers:
173
+ return float(qualifiers[147])
174
+ # passed the goal line
175
+ if 102 in qualifiers:
176
+ return float(qualifiers[102])
177
+ return None
178
+ except ValueError:
179
+ return None
data/opta/parsers/f1_json.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON parser for Opta F1 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any
5
+
6
+ from ...base import MissingDataError
7
+ from .base import OptaJSONParser, assertget
8
+
9
+
10
class F1JSONParser(OptaJSONParser):
    """Parser for Opta F1 feeds in JSON format.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_feed(self) -> dict[str, Any]:
        # The first node whose data contains an "OptaFeed" entry is the feed.
        feed = next((node for node in self.root if "OptaFeed" in node["data"].keys()), None)
        if feed is None:
            raise MissingDataError
        return feed

    def _get_doc(self) -> dict[str, Any]:
        # Descend feed -> data -> OptaFeed -> OptaDocument.
        document = self._get_feed()
        for key in ("data", "OptaFeed", "OptaDocument"):
            document = assertget(document, key)
        return document

    def extract_competitions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        doc_attrs = assertget(self._get_doc(), "@attributes")
        competition_id = int(assertget(doc_attrs, "competition_id"))
        season_id = int(assertget(doc_attrs, "season_id"))
        key = (competition_id, season_id)
        return {
            key: {
                # Fields required by the base schema
                "season_id": season_id,
                "season_name": str(assertget(doc_attrs, "season_id")),
                "competition_id": competition_id,
                "competition_name": assertget(doc_attrs, "competition_name"),
            }
        }

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        document = self._get_doc()
        doc_attrs = assertget(document, "@attributes")
        games: dict[int, dict[str, Any]] = {}
        for match in assertget(document, "MatchData"):
            match_attrs = assertget(match, "@attributes")
            info = assertget(match, "MatchInfo")
            info_attrs = assertget(info, "@attributes")
            # The uID has a one-character prefix in front of the numeric ID.
            game_id = int(assertget(match_attrs, "uID")[1:])
            # Fields required by the base schema. The feed has no duration,
            # referee, venue, attendance or manager information.
            game = {
                "game_id": game_id,
                "competition_id": int(assertget(doc_attrs, "competition_id")),
                "season_id": int(assertget(doc_attrs, "season_id")),
                "game_day": int(assertget(info_attrs, "MatchDay")),
                "game_date": datetime.strptime(assertget(info, "Date"), "%Y-%m-%d %H:%M:%S"),
            }
            games[game_id] = game
            # Team IDs (required) and scores (optional) come from TeamData.
            for team in assertget(match, "TeamData"):
                team_attrs = assertget(team, "@attributes")
                side = assertget(team_attrs, "Side")
                team_ref = assertget(team_attrs, "TeamRef")
                score = assertget(team_attrs, "Score")
                prefix = "home" if side == "Home" else "away"
                game[f"{prefix}_team_id"] = int(team_ref[1:])
                game[f"{prefix}_score"] = int(score)
        return games
data/opta/parsers/f24_json.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON parser for Opta F24 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any
5
+
6
+ from ...base import MissingDataError
7
+ from .base import OptaJSONParser, _get_end_x, _get_end_y, assertget
8
+
9
+
10
class F24JSONParser(OptaJSONParser):
    """Extract data from a Opta F24 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    # Format of the "locale" timestamps in the feed.
    _TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def _get_doc(self) -> dict[str, Any]:
        # The first node whose data contains a "Games" entry is the feed.
        for node in self.root:
            if "Games" in node["data"]:
                return node
        raise MissingDataError

    def _get_game(self) -> dict[str, Any]:
        # Descend feed -> data -> Games -> Game.
        data = assertget(self._get_doc(), "data")
        games = assertget(data, "Games")
        return assertget(games, "Game")

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        attr = assertget(self._get_game(), "@attributes")

        game_id = int(assertget(attr, "id"))
        game_dict = {
            game_id: {
                # Fields required by the base schema
                "game_id": game_id,
                "season_id": int(assertget(attr, "season_id")),
                "competition_id": int(assertget(attr, "competition_id")),
                "game_day": int(assertget(attr, "matchday")),
                "game_date": datetime.strptime(
                    assertget(assertget(attr, "game_date"), "locale"), self._TIMESTAMP_FORMAT
                ).replace(tzinfo=None),
                "home_team_id": int(assertget(attr, "home_team_id")),
                "away_team_id": int(assertget(attr, "away_team_id")),
                # Not available in this feed: home_score, away_score,
                # duration, referee, venue, attendance, home_manager,
                # away_manager
            }
        }
        return game_dict

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream. The timestamp
            of an event is None if the feed does not provide one.
        """
        game = self._get_game()
        game_attr = assertget(game, "@attributes")
        game_id = int(assertget(game_attr, "id"))

        events = {}
        for element in assertget(game, "Event"):
            attr = element["@attributes"]
            raw_timestamp = attr["TimeStamp"].get("locale") if attr.get("TimeStamp") else None
            # Only parse a timestamp that is present; strptime raises
            # a TypeError on None.
            timestamp = (
                datetime.strptime(raw_timestamp, self._TIMESTAMP_FORMAT)
                if raw_timestamp is not None
                else None
            )
            qualifiers = {
                int(q["@attributes"]["qualifier_id"]): q["@attributes"]["value"]
                for q in element.get("Q", [])
            }
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            end_x = _get_end_x(qualifiers)
            end_y = _get_end_y(qualifiers)

            event_id = int(assertget(attr, "id"))
            events[(game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(attr, "period_id")),
                "team_id": int(assertget(attr, "team_id")),
                "player_id": int(assertget(attr, "player_id")),
                "type_id": int(assertget(attr, "type_id")),
                # type_name is added in the opta loader
                # Fields required by the opta schema
                "timestamp": timestamp,
                "minute": int(assertget(attr, "min")),
                "second": int(assertget(attr, "sec")),
                "outcome": bool(int(attr.get("outcome", 1))),
                "start_x": start_x,
                "start_y": start_y,
                # Without an end location, the event ends where it started.
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(attr.get("assist", 0))),
                "keypass": bool(int(attr.get("keypass", 0))),
            }
        return events
data/opta/parsers/f24_xml.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """XML parser for Opta F24 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any
5
+
6
+ from lxml import objectify
7
+
8
+ from .base import OptaXMLParser, _get_end_x, _get_end_y, assertget
9
+
10
+
11
class F24XMLParser(OptaXMLParser):
    """Parser for Opta F24 feeds in XML format.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_doc(self) -> objectify.ObjectifiedElement:
        # The document root directly contains the Game element.
        return self.root

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        game_attrs = self._get_doc().find("Game").attrib

        def as_int(name: str) -> int:
            return int(assertget(game_attrs, name))

        game_id = as_int("id")
        # The feed has no duration, referee, venue, attendance or manager
        # information.
        return {
            game_id: {
                # Fields required by the base schema
                "game_id": game_id,
                "season_id": as_int("season_id"),
                "competition_id": as_int("competition_id"),
                "game_day": as_int("matchday"),
                "game_date": datetime.strptime(
                    assertget(game_attrs, "game_date"), "%Y-%m-%dT%H:%M:%S"
                ),
                "home_team_id": as_int("home_team_id"),
                "away_team_id": as_int("away_team_id"),
                # Optional fields
                "home_score": as_int("home_score"),
                "away_score": as_int("away_score"),
            }
        }

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        game_node = self._get_doc().find("Game")
        game_id = int(assertget(game_node.attrib, "id"))
        parsed: dict[tuple[int, int], dict[str, Any]] = {}
        for event_node in game_node.iterchildren("Event"):
            fields = dict(event_node.attrib)
            event_id = int(assertget(fields, "id"))

            # A qualifier without a "value" attribute is stored as None.
            qualifiers = {}
            for q_node in event_node.iterchildren("Q"):
                qualifiers[int(q_node.attrib["qualifier_id"])] = q_node.attrib.get("value")

            x0 = float(assertget(fields, "x"))
            y0 = float(assertget(fields, "y"))
            x1 = _get_end_x(qualifiers)
            y1 = _get_end_y(qualifiers)

            parsed[(game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(fields, "period_id")),
                "team_id": int(assertget(fields, "team_id")),
                # Not every event is linked to a player.
                "player_id": None if "player_id" not in fields else int(fields["player_id"]),
                "type_id": int(assertget(fields, "type_id")),
                # type_name is added in the opta loader
                # Fields required by the opta schema
                "timestamp": datetime.strptime(
                    assertget(fields, "timestamp"), "%Y-%m-%dT%H:%M:%S.%f"
                ),
                "minute": int(assertget(fields, "min")),
                "second": int(assertget(fields, "sec")),
                "outcome": None if "outcome" not in fields else bool(int(fields["outcome"])),
                "start_x": x0,
                "start_y": y0,
                # Without an end location, the event ends where it started.
                "end_x": x0 if x1 is None else x1,
                "end_y": y0 if y1 is None else y1,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(fields.get("assist", 0))),
                "keypass": bool(int(fields.get("keypass", 0))),
            }
        return parsed
data/opta/parsers/f7_xml.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """XML parser for Opta F7 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any
5
+
6
+ from lxml import objectify
7
+
8
+ from .base import OptaXMLParser, assertget
9
+
10
+
11
class F7XMLParser(OptaXMLParser):
    """Extract data from an Opta F7 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_doc(self) -> objectify.ObjectifiedElement:
        """Return the ``SoccerDocument`` element of the feed."""
        optadocument = self.root.find("SoccerDocument")
        return optadocument

    def _get_stats(self, obj: objectify.ObjectifiedElement) -> dict[str, Any]:
        """Map the ``Type`` of every ``Stat`` child of *obj* to its text.

        An element without ``Stat`` children yields an empty dictionary.
        """
        # ``iterchildren`` copes with an absent ``Stat`` child; iterating over
        # ``obj.find("Stat")`` raises a TypeError when ``find`` returns None.
        return {stat.attrib["Type"]: stat.text for stat in obj.iterchildren("Stat")}

    def _get_name(self, obj: objectify.ObjectifiedElement) -> str:
        """Return the display name stored in a name element.

        The ``Known`` child is preferred when it holds a non-blank value;
        otherwise the ``First`` and ``Last`` children are joined by a space.
        """
        # ``"Known" in obj`` is always False for an lxml element (containment
        # only tests child *elements*), so look the child up explicitly.
        known = getattr(obj, "Known", None)
        if known is not None and known.text and known.text.strip():
            return known.text
        return str(obj.First + " " + obj.Last)

    def extract_competitions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        optadocument = self._get_doc()
        competition = optadocument.Competition
        # uIDs carry a one-letter prefix that is dropped before conversion.
        competition_id = int(competition.attrib["uID"][1:])
        stats = self._get_stats(competition)
        season_id = int(assertget(stats, "season_id"))
        competition_dict = {
            # Fields required by the base schema
            "competition_id": competition_id,
            "season_id": season_id,
            "season_name": assertget(stats, "season_name"),
            "competition_name": competition.Name.text,
        }
        return {(competition_id, season_id): competition_dict}

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        competition = optadocument.Competition
        competition_id = int(competition.attrib["uID"][1:])
        competition_stats = self._get_stats(competition)
        match_info = optadocument.MatchData.MatchInfo
        game_id = int(optadocument.attrib["uID"][1:])
        stats = self._get_stats(optadocument.MatchData)
        team_data_elms = {
            t.attrib["Side"]: t for t in optadocument.MatchData.iterchildren("TeamData")
        }

        # Find each side's manager among the team officials.
        team_officials = {}
        for t in optadocument.iterchildren("Team"):
            side = (
                "Home"
                if int(team_data_elms["Home"].attrib["TeamRef"][1:]) == int(t.attrib["uID"][1:])
                else "Away"
            )
            for m in t.iterchildren("TeamOfficial"):
                if m.attrib["Type"] == "Manager":
                    team_officials[side] = m

        # The attendance is optional; report None when the feed omits it.
        attendance = getattr(match_info, "Attendance", None)

        game_dict = {
            # Fields required by the base schema
            "game_id": game_id,
            "season_id": int(assertget(competition_stats, "season_id")),
            "competition_id": competition_id,
            "game_day": int(competition_stats["matchday"])
            if "matchday" in competition_stats
            else None,
            # The UTC offset is parsed and then discarded (naive datetime).
            "game_date": datetime.strptime(match_info.Date.text, "%Y%m%dT%H%M%S%z").replace(
                tzinfo=None
            ),
            "home_team_id": int(
                assertget(assertget(team_data_elms, "Home").attrib, "TeamRef")[1:]
            ),
            "away_team_id": int(
                assertget(assertget(team_data_elms, "Away").attrib, "TeamRef")[1:]
            ),
            # Optional fields
            "home_score": int(assertget(assertget(team_data_elms, "Home").attrib, "Score")),
            "away_score": int(assertget(assertget(team_data_elms, "Away").attrib, "Score")),
            "duration": int(stats["match_time"]),
            "referee": self._get_name(optadocument.MatchData.MatchOfficial.OfficialName),
            "venue": optadocument.Venue.Name.text,
            "attendance": int(attendance) if attendance is not None else None,
            "home_manager": self._get_name(team_officials["Home"].PersonName)
            if "Home" in team_officials
            else None,
            "away_manager": self._get_name(team_officials["Away"].PersonName)
            if "Away" in team_officials
            else None,
        }
        return {game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        optadocument = self._get_doc()
        teams = {}
        for team_elm in optadocument.iterchildren("Team"):
            team_id = int(assertget(team_elm.attrib, "uID")[1:])
            teams[team_id] = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": team_elm.Name.text,
            }
        return teams

    def extract_lineups(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with the lineup of each team.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team's lineup in the data stream.
        """
        optadocument = self._get_doc()
        stats = self._get_stats(optadocument.MatchData)

        lineups = {}
        for team_elm in optadocument.MatchData.iterchildren("TeamData"):
            # lineup attributes
            team_id = int(team_elm.attrib["TeamRef"][1:])
            lineups[team_id] = {
                "formation": team_elm.attrib["Formation"],
                "score": int(team_elm.attrib["Score"]),
                "side": team_elm.attrib["Side"],
                "players": {},
            }
            # substitutes
            subst = [subst_elm.attrib for subst_elm in team_elm.iterchildren("Substitution")]
            # red cards: player ID -> minute of the dismissal
            red_cards = {
                int(booking_elm.attrib["PlayerRef"][1:]): int(booking_elm.attrib["Min"])
                for booking_elm in team_elm.iterchildren("Booking")
                if "CardType" in booking_elm.attrib
                and booking_elm.attrib["CardType"] in ["Red", "SecondYellow"]
                and "PlayerRef" in booking_elm.attrib  # not defined if a coach receives a red card
            }
            # players
            for player_elm in team_elm.PlayerLineUp.iterchildren("MatchPlayer"):
                player_id = int(player_elm.attrib["PlayerRef"][1:])
                player_ref = f"p{player_id}"
                # Entry time: the substitution that brought the player on,
                # else kick-off for starters and full time for unused subs.
                sub_on = int(
                    next(
                        (
                            item["Time"]
                            for item in subst
                            # "Retired" entries have no incoming player
                            if "Retired" not in item and item["SubOn"] == player_ref
                        ),
                        stats["match_time"] if player_elm.attrib["Status"] == "Sub" else 0,
                    )
                )
                # Exit time: substitution off, else red card, else full time.
                sub_off = int(
                    next(
                        (item["Time"] for item in subst if item["SubOff"] == player_ref),
                        stats["match_time"]
                        if player_id not in red_cards
                        else red_cards[player_id],
                    )
                )
                # A bench player sent off without entering the pitch would
                # otherwise end up with a negative playing time.
                minutes_played = max(sub_off - sub_on, 0)
                lineups[team_id]["players"][player_id] = {
                    "starting_position_id": int(player_elm.attrib["Formation_Place"]),
                    "starting_position_name": player_elm.attrib["Position"],
                    "jersey_number": int(player_elm.attrib["ShirtNumber"]),
                    "is_starter": int(player_elm.attrib["Formation_Place"]) != 0,
                    "minutes_played": minutes_played,
                }
        return lineups

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        optadocument = self._get_doc()
        game_id = int(optadocument.attrib["uID"][1:])
        lineups = self.extract_lineups()
        players = {}
        for team_elm in optadocument.iterchildren("Team"):
            team_id = int(team_elm.attrib["uID"][1:])
            for player_elm in team_elm.iterchildren("Player"):
                player_id = int(player_elm.attrib["uID"][1:])
                lineup_entry = lineups[team_id]["players"][player_id]
                players[(game_id, player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": self._get_name(player_elm.PersonName),
                    "is_starter": lineup_entry["is_starter"],
                    "minutes_played": lineup_entry["minutes_played"],
                    "jersey_number": lineup_entry["jersey_number"],
                    # Fields required by the opta schema
                    "starting_position": lineup_entry["starting_position_name"],
                    # Optional fields (height, weight, age) are not extracted
                }

        return players
data/opta/parsers/f9_json.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON parser for Opta F9 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any, Optional
5
+
6
+ from ...base import MissingDataError
7
+ from .base import OptaJSONParser, assertget
8
+
9
+
10
class F9JSONParser(OptaJSONParser):
    """Extract data from an Opta F9 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    @staticmethod
    def _as_list(value: Any) -> list[Any]:
        """Return *value* as a list: None -> [], single object -> [object]."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _get_feed(self) -> dict[str, Any]:
        """Return the first root node whose ``data`` holds an ``OptaFeed``.

        Raises
        ------
        MissingDataError
            If no root node contains an ``OptaFeed``.
        """
        for node in self.root:
            if "OptaFeed" in node["data"]:
                return node
        raise MissingDataError

    def _get_doc(self) -> dict[str, Any]:
        """Return the first ``OptaDocument`` of the feed."""
        f9 = self._get_feed()
        data = assertget(f9, "data")
        optafeed = assertget(data, "OptaFeed")
        optadocument = assertget(optafeed, "OptaDocument")[0]
        return optadocument

    def _get_stats(self, obj: dict[str, Any]) -> dict[str, Any]:
        """Map the ``Type`` of each ``Stat`` entry of *obj* to its value.

        ``Stat`` may hold a single object or a list of objects; an object
        without ``Stat`` yields an empty dictionary.
        """
        if "Stat" not in obj:
            return {}
        return {
            stat["@attributes"]["Type"]: stat["@value"] for stat in self._as_list(obj["Stat"])
        }

    def _get_name(self, obj: dict[str, Any]) -> Optional[str]:
        """Return the display name stored in a name object.

        The ``Known`` name wins when it is non-blank; otherwise ``First`` and
        ``Last`` are joined. None is returned when no usable name is present.
        """
        known = obj.get("Known")
        if known and known.strip():
            return known
        # The parentheses matter: without them ``obj["First"]`` was evaluated
        # even when the key was absent, raising a KeyError.
        if (
            "First" in obj
            and "Last" in obj
            and (obj["Last"].strip() or obj["First"].strip())
        ):
            return (obj["First"] + " " + obj["Last"]).strip()
        return None

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        attr = assertget(optadocument, "@attributes")
        matchdata = assertget(optadocument, "MatchData")
        competition = assertget(optadocument, "Competition")
        competitionstat = self._get_stats(competition)
        venue = assertget(optadocument, "Venue")
        matchofficial = assertget(matchdata, "MatchOfficial")
        matchinfo = assertget(matchdata, "MatchInfo")
        matchstat = self._get_stats(matchdata)
        teamdata = assertget(matchdata, "TeamData")
        scores = {}
        for t in teamdata:
            scores[t["@attributes"]["Side"]] = t["@attributes"]["Score"]

        game_id = int(assertget(attr, "uID")[1:])
        game_dict = {
            # Fields required by the base schema
            "game_id": game_id,
            "competition_id": int(assertget(assertget(competition, "@attributes"), "uID")[1:]),
            "season_id": assertget(competitionstat, "season_id"),
            "game_day": competitionstat["matchday"] if "matchday" in competitionstat else None,
            # The UTC offset is parsed and then discarded (naive datetime).
            "game_date": datetime.strptime(
                assertget(matchinfo, "Date"), "%Y%m%dT%H%M%S%z"
            ).replace(tzinfo=None),
            # home_team_id / away_team_id are filled in below
            # Optional fields
            "home_score": int(scores["Home"]),
            "away_score": int(scores["Away"]),
            "duration": int(assertget(matchstat, "match_time")),
            "referee": self._get_name(matchofficial["OfficialName"])
            if "OfficialName" in matchofficial
            else None,
            "venue": venue["Name"] if "Name" in venue else None,
            "attendance": int(matchinfo["Attendance"]) if "Attendance" in matchinfo else None,
            # home_manager / away_manager are filled in below
        }
        for team in teamdata:
            teamattr = assertget(team, "@attributes")
            side = assertget(teamattr, "Side")
            teamid = assertget(teamattr, "TeamRef")
            score = assertget(teamattr, "Score")
            manager = (
                self._get_name(team["TeamOfficial"]["PersonName"])
                if "TeamOfficial" in team
                else None
            )
            # Any side other than "Home" is treated as the away team.
            prefix = "home" if side == "Home" else "away"
            game_dict[f"{prefix}_team_id"] = int(teamid[1:])
            game_dict[f"{prefix}_score"] = int(score)
            game_dict[f"{prefix}_manager"] = manager
        return {game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        optadocument = self._get_doc()
        root_teams = assertget(optadocument, "Team")

        teams = {}
        for team in root_teams:
            # Entries without an "id" are skipped.
            if "id" in team:
                nameobj = team.get("nameObj")
                team_id = int(team["id"])
                teams[team_id] = {
                    # Fields required by the base schema
                    "team_id": team_id,
                    "team_name": nameobj.get("name"),
                }
        return teams

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Players flagged as ``is_unknown`` are skipped. Lineup details are only
        added for players that appear in the match lineup.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        optadocument = self._get_doc()
        attr = assertget(optadocument, "@attributes")
        game_id = int(assertget(attr, "uID")[1:])
        root_teams = assertget(optadocument, "Team")
        lineups = self.extract_lineups()

        players = {}
        for team in root_teams:
            team_id = int(team["@attributes"]["uID"].replace("t", ""))
            for player in team["Player"]:
                player_id = int(player["@attributes"]["uID"].replace("p", ""))

                assert "nameObj" in player["PersonName"]
                nameobj = player["PersonName"]["nameObj"]
                if nameobj.get("is_unknown"):
                    continue

                # A separate name avoids shadowing the loop variable.
                record = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": self._get_name(player["PersonName"]),
                }
                lineup_entry = lineups[team_id]["players"].get(player_id)
                if lineup_entry is not None:
                    record.update(
                        jersey_number=lineup_entry["jersey_number"],
                        # Field required by the opta schema
                        starting_position=lineup_entry["starting_position_name"],
                        is_starter=lineup_entry["is_starter"],
                        minutes_played=lineup_entry["minutes_played"],
                    )
                players[(game_id, player_id)] = record
        return players

    def extract_lineups(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with the lineup of each team.

        Raises
        ------
        MissingDataError
            If teams data is not available in the stream.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team's lineup in the data stream.
        """
        optadocument = self._get_doc()

        try:
            rootf9 = optadocument["MatchData"]["TeamData"]
        except KeyError as e:
            raise MissingDataError from e
        matchstatsdict = self._get_stats(optadocument["MatchData"])

        lineups: dict[int, dict[str, Any]] = {}
        for team in rootf9:
            # lineup attributes
            team_id = int(team["@attributes"]["TeamRef"].replace("t", ""))
            lineups[team_id] = {"players": {}}
            # substitutes (a team may have made none)
            subst = [s["@attributes"] for s in self._as_list(team.get("Substitution"))]
            # red cards: player ID -> time of the dismissal
            red_cards = {
                int(e["@attributes"]["PlayerRef"].replace("p", "")): e["@attributes"]["Time"]
                for e in self._as_list(team.get("Booking"))
                if "CardType" in e["@attributes"]
                and e["@attributes"]["CardType"] in ["Red", "SecondYellow"]
                and "PlayerRef" in e["@attributes"]  # not defined if a coach receives a red card
            }
            for player in team["PlayerLineUp"]["MatchPlayer"]:
                attr = player["@attributes"]
                player_id = int(attr["PlayerRef"].replace("p", ""))
                player_ref = f"p{player_id}"
                playerstatsdict = self._get_stats(player)
                # Entry time: the substitution that brought the player on,
                # else kick-off for starters and full time for unused subs.
                # ``int`` guards against times serialized as strings.
                sub_on = int(
                    next(
                        (
                            item["Time"]
                            for item in subst
                            # "Retired" entries have no incoming player
                            if "Retired" not in item and item["SubOn"] == player_ref
                        ),
                        matchstatsdict["match_time"] if attr["Status"] == "Sub" else 0,
                    )
                )
                # Exit time: substitution off, else red card, else full time.
                sub_off = int(
                    next(
                        (item["Time"] for item in subst if item["SubOff"] == player_ref),
                        matchstatsdict["match_time"]
                        if player_id not in red_cards
                        else red_cards[player_id],
                    )
                )
                # A bench player sent off without entering the pitch would
                # otherwise end up with a negative playing time.
                minutes_played = max(sub_off - sub_on, 0)
                lineups[team_id]["players"][player_id] = dict(
                    jersey_number=attr["ShirtNumber"],
                    starting_position_name=attr["Position"],
                    starting_position_id=attr["position_id"],
                    is_starter=attr["Status"] == "Start",
                    minutes_played=minutes_played,
                    **playerstatsdict,
                )
        return lineups

    def extract_teamgamestats(self) -> list[dict[str, Any]]:
        """Return some aggregated statistics of each team.

        Raises
        ------
        MissingDataError
            If teams data is not available in the stream.

        Returns
        -------
        list(dict)
            One dictionary with aggregated statistics per team.
        """
        optadocument = self._get_doc()
        attr = assertget(optadocument, "@attributes")
        game_id = int(assertget(attr, "uID")[1:])

        try:
            rootf9 = optadocument["MatchData"]["TeamData"]
        except KeyError as e:
            raise MissingDataError from e
        teams_gamestats = []
        for team in rootf9:
            teamattr = team["@attributes"]
            statsdict = self._get_stats(team)

            team_gamestats = dict(
                game_id=game_id,
                team_id=int(teamattr["TeamRef"].replace("t", "")),
                side=teamattr["Side"],
                score=teamattr["Score"],
                # None when the feed carries no shoot-out score
                shootout_score=teamattr.get("ShootOutScore"),
                **statsdict,
            )

            teams_gamestats.append(team_gamestats)
        return teams_gamestats
data/opta/parsers/ma1_json.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON parser for Stats Perform MA1 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any, Optional
5
+
6
+ from ...base import MissingDataError
7
+ from .base import OptaJSONParser, assertget
8
+
9
+
10
class MA1JSONParser(OptaJSONParser):
    """Parser for Stats Perform MA1 data streams.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_matches(self) -> list[dict[str, Any]]:
        """Return the matches in the feed (a single-match feed is wrapped)."""
        if "matchInfo" in self.root:
            return [self.root]
        if "match" not in self.root:
            raise MissingDataError
        return self.root["match"]

    def _get_match_info(self, match: dict[str, Any]) -> dict[str, Any]:
        """Return the ``matchInfo`` section of *match*."""
        if "matchInfo" not in match:
            raise MissingDataError
        return match["matchInfo"]

    def _get_live_data(self, match: dict[str, Any]) -> dict[str, Any]:
        """Return the ``liveData`` section of *match*, or ``{}`` if absent."""
        return match["liveData"] if "liveData" in match else {}

    def _get_name(self, obj: dict[str, Any]) -> Optional[str]:
        """Return ``name``, else "firstName lastName", else None."""
        if "name" in obj:
            return assertget(obj, "name")
        if "firstName" not in obj:
            return None
        return f"{assertget(obj, 'firstName')} {assertget(obj, 'lastName')}"

    @staticmethod
    def _extract_team_id(teams: list[dict[str, str]], side: str) -> Optional[str]:
        """Return the ID of the first team whose ``position`` equals *side*."""
        for candidate in teams:
            if assertget(candidate, "position") != side:
                continue
            return assertget(candidate, "id")
        raise MissingDataError

    def extract_competitions(self) -> dict[tuple[str, str], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        found = {}
        for match in self._get_matches():
            info = self._get_match_info(match)
            season = assertget(info, "tournamentCalendar")
            season_id = assertget(season, "id")
            competition = assertget(info, "competition")
            competition_id = assertget(competition, "id")
            entry = {
                "season_id": season_id,
                "season_name": assertget(season, "name"),
                "competition_id": competition_id,
                "competition_name": assertget(competition, "name"),
            }
            found[(competition_id, season_id)] = entry
        return found

    def extract_games(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        games = {}
        for match in self._get_matches():
            info = self._get_match_info(match)
            game_id = assertget(info, "id")
            season = assertget(info, "tournamentCalendar")
            competition = assertget(info, "competition")
            contestants = assertget(info, "contestant")
            day_part = assertget(info, "date")
            time_part = assertget(info, "time")
            stamp = f"{day_part} {time_part}"
            venue = assertget(info, "venue")
            game = {
                # Fields required by the base schema
                "game_id": game_id,
                "competition_id": assertget(competition, "id"),
                "season_id": assertget(season, "id"),
                "game_day": int(info["week"]) if "week" in info else None,
                "game_date": datetime.strptime(stamp, "%Y-%m-%dZ %H:%M:%SZ"),
                "home_team_id": self._extract_team_id(contestants, "home"),
                "away_team_id": self._extract_team_id(contestants, "away"),
                # Optional fields; the remaining ones are added below when
                # the live data provides them.
                "venue": venue["shortName"] if "shortName" in venue else None,
            }
            games[game_id] = game

            live = self._get_live_data(match)
            if "matchDetails" in live:
                details = assertget(live, "matchDetails")
                if "matchLengthMin" in details:
                    game["duration"] = assertget(details, "matchLengthMin")
                if "scores" in details:
                    scores = assertget(details, "scores")
                    game["home_score"] = assertget(scores, "total")["home"]
                    game["away_score"] = assertget(scores, "total")["away"]
            if "matchDetailsExtra" in live:
                extra = assertget(live, "matchDetailsExtra")
                if "attendance" in extra:
                    game["attendance"] = int(assertget(extra, "attendance"))
                if "matchOfficial" in extra:
                    for official in assertget(extra, "matchOfficial"):
                        # Only the main official is reported as the referee.
                        if official["type"] == "Main":
                            game["referee"] = self._get_name(official)
        return games

    def extract_teams(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        teams = {}
        for match in self._get_matches():
            info = self._get_match_info(match)
            for contestant in assertget(info, "contestant"):
                team_id = assertget(contestant, "id")
                teams[team_id] = {
                    "team_id": team_id,
                    "team_name": assertget(contestant, "name"),
                }
        return teams

    def extract_players(self) -> dict[tuple[str, str], dict[str, Any]]:  # noqa: C901
        """Return a dictionary with all available players.

        ``minutes_played`` is only set when the live data contains both match
        details (with a match length) and substitutions.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        players = {}
        subs = self.extract_substitutions()
        for match in self._get_matches():
            info = self._get_match_info(match)
            game_id = assertget(info, "id")
            live = self._get_live_data(match)
            if "lineUp" not in live:
                continue

            # Player ID -> minute of the dismissal.
            red_cards = {}
            for card in live.get("card", []):
                if "type" not in card or card["type"] not in ["Y2C", "RC"]:
                    continue
                if "playerId" not in card:  # not defined if a coach receives a red card
                    continue
                red_cards[card["playerId"]] = card["timeMin"]

            can_time = "matchDetails" in live and "substitute" in live
            for lineup in assertget(live, "lineUp"):
                team_id = assertget(lineup, "contestantId")
                for individual in assertget(lineup, "player"):
                    player_id = assertget(individual, "playerId")
                    key = (game_id, player_id)
                    players[key] = {
                        # Fields required by the base schema
                        "game_id": game_id,
                        "team_id": team_id,
                        "player_id": player_id,
                        "player_name": self._get_name(individual),
                        "is_starter": assertget(individual, "position") != "Substitute",
                        # minutes_played is added below when it can be derived
                        "jersey_number": assertget(individual, "shirtNumber"),
                        # Fields required by the opta schema
                        "starting_position": assertget(individual, "position"),
                    }
                    if not can_time:
                        continue
                    details = assertget(live, "matchDetails")
                    if "matchLengthMin" not in details:
                        continue

                    started = assertget(individual, "position") != "Substitute"
                    came_on = [
                        sub
                        for sub in subs.values()
                        if sub["game_id"] == game_id and sub["player_in_id"] == player_id
                    ]
                    went_off = [
                        sub
                        for sub in subs.values()
                        if sub["game_id"] == game_id and sub["player_out_id"] == player_id
                    ]

                    # Determine when the player entered the pitch
                    if started:
                        minute_start = 0
                    elif len(came_on) == 1:
                        minute_start = came_on[0]["minute"]
                    else:
                        minute_start = None

                    # Determine when the player left the pitch
                    duration = assertget(details, "matchLengthMin")
                    if len(went_off) == 1:
                        minute_end = went_off[0]["minute"]
                    elif player_id in red_cards:
                        minute_end = red_cards[player_id]
                    else:
                        minute_end = duration

                    # Determine the time on the pitch; players that never
                    # entered the pitch get zero minutes.
                    if minute_start is None:
                        players[key]["minutes_played"] = 0
                    else:
                        players[key]["minutes_played"] = minute_end - minute_start
        return players

    def extract_substitutions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Returns
        -------
        dict
            A mapping between (game ID, ID of the player coming on) tuples
            and the information available about each substitution in the
            data stream.
        """
        substitutions = {}
        for match in self._get_matches():
            info = self._get_match_info(match)
            game_id = assertget(info, "id")
            live = self._get_live_data(match)
            if "substitute" not in live:
                continue
            for event in assertget(live, "substitute"):
                incoming_id = assertget(event, "playerOnId")
                substitutions[(game_id, incoming_id)] = {
                    "game_id": game_id,
                    "team_id": assertget(event, "contestantId"),
                    "period_id": int(assertget(event, "periodId")),
                    "minute": int(assertget(event, "timeMin")),
                    "player_in_id": assertget(event, "playerOnId"),
                    "player_out_id": assertget(event, "playerOffId"),
                }
        return substitutions
data/opta/parsers/ma3_json.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON parser for Stats Perform MA3 feeds."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any, Optional
5
+
6
+ import pandas as pd
7
+
8
+ from ...base import MissingDataError
9
+ from .base import OptaJSONParser, _get_end_x, _get_end_y, assertget
10
+
11
+
12
class MA3JSONParser(OptaJSONParser):
    """Extract data from a Stats Perform MA3 data stream.

    All extractors read the ``matchInfo`` and ``liveData`` sections of the
    parsed JSON document stored in ``self.root``.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    # Maps the numeric starting-position codes found in the lineup qualifiers
    # to a human-readable position name.
    _position_map = {
        1: "Goalkeeper",
        2: "Defender",
        3: "Midfielder",
        4: "Forward",
        5: "Substitute",
    }

    def _get_match_info(self) -> dict[str, Any]:
        """Return the ``matchInfo`` section or raise ``MissingDataError``."""
        if "matchInfo" in self.root:
            return self.root["matchInfo"]
        raise MissingDataError

    def _get_live_data(self) -> dict[str, Any]:
        """Return the ``liveData`` section or raise ``MissingDataError``."""
        if "liveData" in self.root:
            return self.root["liveData"]
        raise MissingDataError

    def extract_competitions(self) -> dict[tuple[str, str], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        match_info = self._get_match_info()
        calendar = assertget(match_info, "tournamentCalendar")
        competition = assertget(match_info, "competition")
        competition_id = assertget(competition, "id")
        season_id = assertget(calendar, "id")
        season = {
            # Fields required by the base schema
            "season_id": season_id,
            "season_name": assertget(calendar, "name"),
            "competition_id": competition_id,
            "competition_name": assertget(competition, "name"),
        }
        return {(competition_id, season_id): season}

    def extract_games(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available games.

        The duration and the scores are only included when the live data
        contains a ``matchDetails`` section that provides them.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        match_info = self._get_match_info()
        game_id = assertget(match_info, "id")
        season = assertget(match_info, "tournamentCalendar")
        competition = assertget(match_info, "competition")
        contestant = assertget(match_info, "contestant")
        # Only the first 10 / 8 characters of the date / time fields are used,
        # which drops any trailing suffix before parsing.
        game_date = assertget(match_info, "date")[0:10]
        game_time = assertget(match_info, "time")[0:8]
        game_datetime = f"{game_date}T{game_time}"
        venue = assertget(match_info, "venue")
        game_obj = {
            "game_id": game_id,
            "competition_id": assertget(competition, "id"),
            "season_id": assertget(season, "id"),
            "game_day": int(match_info["week"]) if "week" in match_info else None,
            "game_date": datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"),
            "home_team_id": self._extract_team_id(contestant, "home"),
            "away_team_id": self._extract_team_id(contestant, "away"),
            "venue": assertget(venue, "shortName"),
        }
        live_data = self._get_live_data()
        if "matchDetails" in live_data:
            match_details = assertget(live_data, "matchDetails")
            if "matchLengthMin" in match_details:
                game_obj["duration"] = assertget(match_details, "matchLengthMin")
            if "scores" in match_details:
                scores = assertget(match_details, "scores")
                game_obj["home_score"] = assertget(scores, "total")["home"]
                game_obj["away_score"] = assertget(scores, "total")["away"]

        return {game_id: game_obj}

    def extract_teams(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        match_info = self._get_match_info()
        contestants = assertget(match_info, "contestant")
        teams = {}
        for contestant in contestants:
            team_id = assertget(contestant, "id")
            team = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": assertget(contestant, "name"),
            }
            teams[team_id] = team
        return teams

    def extract_players(self) -> dict[tuple[str, str], dict[str, Any]]:  # noqa: C901
        """Return a dictionary with all available players.

        Lineups are read from the events with ``typeId`` 34, red cards from
        the events with ``typeId`` 17 and the on-pitch interval of each
        player from the substitution events. Only players with more than
        zero minutes played are returned.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        match_info = self._get_match_info()
        game_id = assertget(match_info, "id")
        live_data = self._get_live_data()
        events = assertget(live_data, "event")

        game_duration = self._extract_duration()
        playerid_to_name = {}

        players_data: dict[str, list[Any]] = {
            "starting_position_id": [],
            "player_id": [],
            "team_id": [],
            "position_in_formation": [],
            "jersey_number": [],
        }
        red_cards = {}

        for event in events:
            event_type = assertget(event, "typeId")
            if event_type == 34:
                # One lineup event per team; each qualifier holds a
                # comma-separated list with one entry per listed player.
                team_id = assertget(event, "contestantId")
                qualifiers = assertget(event, "qualifier")
                for q in qualifiers:
                    qualifier_id = assertget(q, "qualifierId")
                    value = assertget(q, "value")
                    value = value.split(", ")
                    if qualifier_id == 30:
                        players_data["player_id"] += value
                        team = [team_id for _ in range(len(value))]
                        players_data["team_id"] += team
                    elif qualifier_id == 44:
                        value = [int(v) for v in value]
                        players_data["starting_position_id"] += value
                    elif qualifier_id == 131:
                        value = [int(v) for v in value]
                        players_data["position_in_formation"] += value
                    elif qualifier_id == 59:
                        value = [int(v) for v in value]
                        players_data["jersey_number"] += value
            elif event_type == 17 and "playerId" in event:
                # Card event: qualifiers 32 and 33 are treated as a sending-off.
                qualifiers = assertget(event, "qualifier")
                for q in qualifiers:
                    qualifier_id = assertget(q, "qualifierId")
                    if qualifier_id in [32, 33]:
                        red_cards[event["playerId"]] = event["timeMin"]

            # Remember the first name seen for every player ID.
            player_id = event.get("playerId")
            if player_id is None:
                continue
            player_name = assertget(event, "playerName")
            if player_id not in playerid_to_name:
                playerid_to_name[player_id] = player_name

        df_players_data = pd.DataFrame.from_dict(players_data)  # type: ignore

        substitutions = list(self.extract_substitutions().values())
        substitutions_columns = ["player_id", "team_id", "minute_start", "minute_end"]
        df_substitutions = pd.DataFrame(substitutions, columns=substitutions_columns)
        df_substitutions = df_substitutions.groupby(["player_id", "team_id"]).max().reset_index()
        df_substitutions["minute_start"] = df_substitutions["minute_start"].fillna(0)
        df_substitutions["minute_end"] = df_substitutions["minute_end"].fillna(game_duration)

        if df_substitutions.empty:
            # Without substitutions nobody has a known interval yet. Leave it
            # undefined so that only the starters get the full game below and
            # unused bench players end up with zero minutes.
            df_players_data["minute_start"] = float("nan")
            df_players_data["minute_end"] = float("nan")
        else:
            df_players_data = df_players_data.merge(
                df_substitutions, on=["team_id", "player_id"], how="left"
            )
        # A sending-off ends the player's time on the pitch.
        df_players_data["minute_end"] = df_players_data.apply(
            lambda row: red_cards[row["player_id"]]
            if row["player_id"] in red_cards
            else row["minute_end"],
            axis=1,
        )

        # Starters without substitution info play from minute 0 until the end.
        df_players_data["is_starter"] = df_players_data["position_in_formation"] > 0
        df_players_data.loc[
            df_players_data["is_starter"] & df_players_data["minute_start"].isnull(),
            "minute_start",
        ] = 0
        df_players_data.loc[
            df_players_data["is_starter"] & df_players_data["minute_end"].isnull(), "minute_end"
        ] = game_duration

        # Players without a complete interval (unused substitutes) get 0.
        df_players_data["minutes_played"] = (
            (df_players_data["minute_end"] - df_players_data["minute_start"]).fillna(0).astype(int)
        )

        players = {}
        for _, player in df_players_data.iterrows():
            if player.minutes_played > 0:
                players[(game_id, player.player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": player.team_id,
                    "player_id": player.player_id,
                    "player_name": playerid_to_name[player.player_id],
                    "is_starter": player.is_starter,
                    "minutes_played": player.minutes_played,
                    "jersey_number": player.jersey_number,
                    # Fields required by the opta schema
                    "starting_position": self._position_map.get(
                        player.starting_position_id, "Unknown"
                    ),
                }
        return players

    def extract_events(self) -> dict[tuple[str, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        When an event has no end coordinates in its qualifiers, the start
        coordinates are used as end coordinates.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        match_info = self._get_match_info()
        live_data = self._get_live_data()
        game_id = assertget(match_info, "id")

        events = {}
        for element in assertget(live_data, "event"):
            timestamp_string = assertget(element, "timeStamp")
            timestamp = self._convert_timestamp(timestamp_string)

            qualifiers = {
                int(q["qualifierId"]): q.get("value") for q in element.get("qualifier", [])
            }
            start_x = float(assertget(element, "x"))
            start_y = float(assertget(element, "y"))
            end_x = _get_end_x(qualifiers)
            end_y = _get_end_y(qualifiers)

            event_id = int(assertget(element, "id"))
            event = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(element, "periodId")),
                "team_id": assertget(element, "contestantId"),
                "player_id": element.get("playerId"),
                "type_id": int(assertget(element, "typeId")),
                # Fields required by the opta schema
                "timestamp": timestamp,
                "minute": int(assertget(element, "timeMin")),
                "second": int(assertget(element, "timeSec")),
                "outcome": bool(int(element.get("outcome", 1))),
                "start_x": start_x,
                "start_y": start_y,
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(element.get("assist", 0))),
                "keypass": bool(int(element.get("keyPass", 0))),
            }
            events[(game_id, event_id)] = event
        return events

    def extract_substitutions(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Events with ``typeId`` 18 provide the minute a player left the pitch
        (``minute_end``); events with ``typeId`` 19 provide the minute a
        player entered it (``minute_start``). Both are merged into a single
        record per player, so a player who comes on and is later taken off
        again keeps both minutes.

        Returns
        -------
        dict
            A mapping between player IDs and the information available about
            each substitution in the data stream.
        """
        live_data = self._get_live_data()
        events = assertget(live_data, "event")

        subs: dict[Any, dict[str, Any]] = {}
        for e in events:
            event_type = assertget(e, "typeId")
            if event_type not in (18, 19):
                continue
            sub_id = assertget(e, "playerId")
            # Reuse the record of an earlier substitution event of the same
            # player instead of overwriting it.
            substitution_data = subs.setdefault(
                sub_id,
                {
                    "player_id": sub_id,
                    "team_id": assertget(e, "contestantId"),
                },
            )
            if event_type == 18:
                substitution_data["minute_end"] = assertget(e, "timeMin")
            else:
                substitution_data["minute_start"] = assertget(e, "timeMin")
        return subs

    def _extract_duration(self) -> int:
        """Return the game duration in minutes (at least 90).

        The duration is the largest ``timeMin`` of the events with ``typeId``
        30 that carry qualifier 209, unless that is below the default of 90.
        """
        live_data = self._get_live_data()
        events = assertget(live_data, "event")

        game_duration = 90

        for event in events:
            event_type = assertget(event, "typeId")
            if event_type == 30:
                # todo: add 1st half time
                qualifiers = assertget(event, "qualifier")
                for q in qualifiers:
                    qualifier = assertget(q, "qualifierId")
                    if qualifier == 209:
                        new_duration = assertget(event, "timeMin")
                        if new_duration > game_duration:
                            game_duration = new_duration

        return game_duration

    @staticmethod
    def _extract_team_id(teams: list[dict[str, str]], side: str) -> Optional[str]:
        """Return the ID of the team whose ``position`` equals ``side``.

        Raises
        ------
        MissingDataError
            If none of the teams plays on the requested side.
        """
        for team in teams:
            team_side = assertget(team, "position")
            if team_side == side:
                team_id = assertget(team, "id")
                return team_id
        raise MissingDataError

    @staticmethod
    def _convert_timestamp(timestamp_string: str) -> datetime:
        """Parse a ``...Z`` timestamp with or without fractional seconds."""
        try:
            return datetime.strptime(timestamp_string, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            return datetime.strptime(timestamp_string, "%Y-%m-%dT%H:%M:%SZ")
data/opta/parsers/whoscored.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON parser for WhoScored feeds."""
2
+
3
+ import json # type: ignore
4
+ import re
5
+ from datetime import datetime, timedelta
6
+ from typing import Any, Optional
7
+
8
+ from ...base import MissingDataError
9
+ from .base import OptaParser, _get_end_x, _get_end_y, assertget
10
+
11
+
12
+ def _position_mapping(formation: str, x: float, y: float) -> str:
13
+ if x == 0 and y == 5:
14
+ return "GK"
15
+ return "Unknown"
16
+
17
+
18
class WhoScoredParser(OptaParser):
    """Extract data from a JSON data stream scraped from WhoScored.

    Parameters
    ----------
    path : str
        Path of the data file.
    competition_id : int
        ID of the competition to which the provided data file belongs. If
        None, this information is extracted from a field 'competition_id' in
        the JSON.
    season_id : int
        ID of the season to which the provided data file belongs. If None,
        this information is extracted from a field 'season_id' in the JSON.
    game_id : int
        ID of the game to which the provided data file belongs. If None, this
        information is extracted from a field 'game_id' in the JSON.

    Raises
    ------
    MissingDataError
        If an ID is neither passed as an argument nor present in the JSON.
    """

    def __init__(  # noqa: C901
        self,
        path: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> None:
        with open(path, encoding="utf-8") as fh:
            self.root = json.load(fh)

        # Each ID falls back to the field of the same name in the JSON.
        if competition_id is None:
            try:
                competition_id = int(assertget(self.root, "competition_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the competition id. Add it to the
                    file path or include a field 'competition_id' in the
                    JSON."""
                ) from e
        self.competition_id = competition_id

        if season_id is None:
            try:
                season_id = int(assertget(self.root, "season_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the season id. Add it to the file
                    path or include a field 'season_id' in the JSON."""
                ) from e
        self.season_id = season_id

        if game_id is None:
            try:
                game_id = int(assertget(self.root, "game_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the game id. Add it to the file
                    path or include a field 'game_id' in the JSON."""
                ) from e
        self.game_id = game_id

    def _get_period_id(self, event: dict[str, Any]) -> int:
        """Return the integer period ID stored under ``event["period"]["value"]``."""
        period = assertget(event, "period")
        period_id = int(assertget(period, "value"))
        return period_id

    def _get_period_milliseconds(self, event: dict[str, Any]) -> int:
        """Return the time of an event relative to the start of its period.

        Pre-match (period 16) and post-game (period 14) events map to 0. For
        later periods, the minute limit of the preceding period is subtracted
        from the event's minute.
        """
        period_minute_limits = assertget(self.root, "periodMinuteLimits")
        period_id = self._get_period_id(event)
        if period_id == 16:  # Pre-match
            return 0
        if period_id == 14:  # Post-game
            return 0
        minute = int(assertget(event, "minute"))
        period_minute = minute
        if period_id > 1:
            period_minute = minute - period_minute_limits[str(period_id - 1)]
        period_second = period_minute * 60 + int(event.get("second", 0))
        return period_second * 1000

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        team_home = assertget(self.root, "home")
        team_away = assertget(self.root, "away")
        game_dict = {
            # Fields required by the base schema
            "game_id": self.game_id,
            "season_id": self.season_id,
            "competition_id": self.competition_id,
            "game_day": None,  # Cannot be determined from the data stream
            "game_date": datetime.strptime(
                assertget(self.root, "startTime"), "%Y-%m-%dT%H:%M:%S"
            ),  # Dates are UTC
            "home_team_id": int(assertget(team_home, "teamId")),
            "away_team_id": int(assertget(team_away, "teamId")),
            # Optional fields
            "home_score": int(assertget(assertget(self.root["home"], "scores"), "running")),
            "away_score": int(assertget(assertget(self.root["away"], "scores"), "running")),
            "duration": int(self.root.get("expandedMaxMinute"))
            if "expandedMaxMinute" in self.root
            else None,
            "referee": self.root.get("referee", {}).get("name"),
            "venue": self.root.get("venueName"),
            "attendance": int(self.root.get("attendance")) if "attendance" in self.root else None,
            "home_manager": team_home.get("managerName"),
            "away_manager": team_away.get("managerName"),
        }
        return {self.game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        teams = {}
        for side in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(side, "teamId"))
            teams[team_id] = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": assertget(side, "name"),
            }
        return teams

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Minutes played, jersey number and starting position are taken from
        the per-player game statistics.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        game_id = self.game_id
        player_gamestats = self.extract_playergamestats()
        players = {}
        for team in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(team, "teamId"))
            for p in team["players"]:
                player_id = int(assertget(p, "playerId"))
                players[(game_id, player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": assertget(p, "name"),
                    "is_starter": bool(p.get("isFirstEleven", False)),
                    "minutes_played": player_gamestats[(game_id, player_id)]["minutes_played"],
                    "jersey_number": player_gamestats[(game_id, player_id)]["jersey_number"],
                    # Fields required by the opta schema
                    "starting_position": player_gamestats[(game_id, player_id)]["position_code"],
                    # Optional fields
                    # WhoScored retrieves player details for the current date,
                    # not for the game date. Hence, we do not include this
                    # info.
                    # age=int(p["age"]),
                    # height=float(p.get("height", float("NaN"))),
                    # weight=float(p.get("weight", float("NaN"))),
                }
        return players

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        events = {}

        time_start_str = assertget(self.root, "startTime")
        time_start = datetime.strptime(time_start_str, "%Y-%m-%dT%H:%M:%S")
        for attr in self.root["events"]:
            event_id = int(assertget(attr, "id" if "id" in attr else "eventId"))
            eventtype = attr.get("type", {})
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            minute = int(assertget(attr, "expandedMinute"))
            second = int(attr.get("second", 0))
            # Qualifiers without an explicit value are stored as True.
            qualifiers = {
                int(q["type"]["value"]): q.get("value", True) for q in attr.get("qualifiers", [])
            }
            end_x = attr.get("endX", _get_end_x(qualifiers))
            end_y = attr.get("endY", _get_end_y(qualifiers))
            events[(self.game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": self.game_id,
                "event_id": event_id,
                "period_id": self._get_period_id(attr),
                "team_id": int(assertget(attr, "teamId")),
                "player_id": int(attr.get("playerId")) if "playerId" in attr else None,
                "type_id": int(assertget(eventtype, "value")),
                # type_name=assertget(eventtype, "displayName"),  # added in the opta loader
                # Fields required by the opta schema
                # Timestamp is not available in the data stream. The returned
                # timestamp is not accurate, but sufficient for compatibility
                # with the other Opta data streams.
                "timestamp": (time_start + timedelta(seconds=(minute * 60 + second))),
                "minute": minute,
                "second": second,
                "outcome": bool(attr["outcomeType"].get("value"))
                if "outcomeType" in attr
                else None,
                "start_x": start_x,
                "start_y": start_y,
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "related_player_id": int(attr.get("relatedPlayerId"))
                if "relatedPlayerId" in attr
                else None,
                "touch": bool(attr.get("isTouch", False)),
                "goal": bool(attr.get("isGoal", False)),
                "shot": bool(attr.get("isShot", False)),
                # assist=bool(attr.get('assist')) if "assist" in attr else None,
                # keypass=bool(attr.get('keypass')) if "keypass" in attr else None,
            }

        return events

    def extract_substitutions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Only events of type 19 are used; the event's player is the one coming
        on and its related player the one going off.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each substitution in the data stream.
        """
        subs = {}
        subonevents = [e for e in self.root["events"] if e["type"].get("value") == 19]
        for e in subonevents:
            sub_id = int(assertget(e, "playerId"))
            sub = {
                "game_id": self.game_id,
                "team_id": int(assertget(e, "teamId")),
                "period_id": self._get_period_id(e),
                "period_milliseconds": self._get_period_milliseconds(e),
                "player_in_id": int(assertget(e, "playerId")),
                "player_out_id": int(assertget(e, "relatedPlayerId")),
            }
            subs[(self.game_id, sub_id)] = sub
        return subs

    def extract_positions(self) -> dict[tuple[int, int, int], dict[str, Any]]:  # noqa: C901
        """Return a dictionary with each player's position during a game.

        One entry is created per player for every formation a team used,
        keyed by the expanded minute at which that formation started.

        Returns
        -------
        dict
            A mapping between (game ID, player ID, epoch ID) tuples and the
            information available about each player's position in the data stream.
        """
        positions = {}
        for t in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(t, "teamId"))
            for f in assertget(t, "formations"):
                fpositions = assertget(f, "formationPositions")
                playersIds = assertget(f, "playerIds")
                formation = assertget(f, "formationName")

                period_end_minutes = assertget(self.root, "periodEndMinutes")
                period_minute_limits = assertget(self.root, "periodMinuteLimits")
                start_minute = int(assertget(f, "startMinuteExpanded"))
                end_minute = int(assertget(f, "endMinuteExpanded"))
                # Find the first period that ends after the formation started;
                # if there is none, the last period is used.
                for period_id in sorted(period_end_minutes.keys()):
                    if period_end_minutes[period_id] > start_minute:
                        break
                period_id = int(period_id)
                period_minute = start_minute
                if period_id > 1:
                    period_minute = start_minute - period_minute_limits[str(period_id - 1)]

                for i, p in enumerate(fpositions):
                    player_id = int(playersIds[i])
                    x = float(assertget(p, "vertical"))
                    y = float(assertget(p, "horizontal"))
                    position_code = _position_mapping(formation, x, y)
                    positions[(self.game_id, player_id, start_minute)] = {
                        "game_id": self.game_id,
                        "team_id": team_id,
                        "player_id": player_id,
                        "period_id": period_id,
                        "period_milliseconds": (period_minute * 60 * 1000),
                        "start_milliseconds": (start_minute * 60 * 1000),
                        "end_milliseconds": (end_minute * 60 * 1000),
                        "formation_scheme": formation,
                        "player_position": position_code,
                        "player_position_x": x,
                        "player_position_y": y,
                    }
        return positions

    def extract_teamgamestats(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return some aggregated statistics of each team in a game.

        Every dictionary-valued statistic is summed over its values. As for
        the player statistics, statistics whose snake-cased name ends in
        "success" are left out.

        Returns
        -------
        dict
            A mapping between (game ID, team ID) tuples and the aggregated
            statistics of each team.
        """
        teams_gamestats = {}
        teams = [self.root["home"], self.root["away"]]
        for team in teams:
            team_id = int(assertget(team, "teamId"))
            statsdict = {}
            for name in team["stats"]:
                if isinstance(team["stats"][name], dict):
                    statsdict[_camel_to_snake(name)] = sum(team["stats"][name].values())

            scores = assertget(team, "scores")
            teams_gamestats[(self.game_id, team_id)] = dict(
                game_id=self.game_id,
                team_id=team_id,
                side=assertget(team, "field"),
                score=assertget(scores, "fulltime"),
                shootout_score=scores.get("penalty"),
                # Keys are already lower-cased by _camel_to_snake, so the
                # suffix has to be matched in lower case.
                **{k: statsdict[k] for k in statsdict if not k.endswith("success")},
            )

        return teams_gamestats

    def extract_playergamestats(self) -> dict[tuple[int, int], dict[str, Any]]:  # noqa: C901
        """Return some aggregated statistics of each player in a game.

        Besides the summed statistics, the minutes a player was on the pitch
        are derived from the substitution minutes and red cards.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the aggregated
            statistics of each player.
        """
        players_gamestats = {}
        for team in [self.root["home"], self.root["away"]]:
            team_id = int(assertget(team, "teamId"))
            # Player IDs are coerced to int so that they match the coerced
            # player IDs used for the lookup below.
            red_cards = {
                int(e["playerId"]): e["expandedMinute"]
                for e in team.get("incidentEvents", [])
                if "cardType" in e
                and e["cardType"]["displayName"] in ["Red", "SecondYellow"]
                and "playerId" in e  # not defined if a coach receives a red card
            }
            for player in team["players"]:
                statsdict = {
                    _camel_to_snake(name): sum(stat.values())
                    for name, stat in player["stats"].items()
                }
                stats = [k for k in statsdict if not k.endswith("success")]

                player_id = int(assertget(player, "playerId"))
                p = dict(
                    game_id=self.game_id,
                    team_id=team_id,
                    player_id=player_id,
                    is_starter=bool(player.get("isFirstEleven", False)),
                    position_code=player.get("position", None),
                    jersey_number=int(player.get("shirtNo", 0)),
                    mvp=bool(player.get("isManOfTheMatch", False)),
                    **{k: statsdict[k] for k in stats},
                )
                if "subbedInExpandedMinute" in player:
                    p["minute_start"] = player["subbedInExpandedMinute"]
                if "subbedOutExpandedMinute" in player:
                    p["minute_end"] = player["subbedOutExpandedMinute"]
                # A red card takes precedence over a substitution minute.
                if player_id in red_cards:
                    p["minute_end"] = red_cards[player_id]

                # Did not play
                p["minutes_played"] = 0
                # Played the full game
                if p["is_starter"] and "minute_end" not in p:
                    p["minute_start"] = 0
                    p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = self.root["expandedMaxMinute"]
                # Started but substituted out
                elif p["is_starter"] and "minute_end" in p:
                    p["minute_start"] = 0
                    p["minutes_played"] = p["minute_end"]
                # Substituted in and played the remainder of the game
                elif "minute_start" in p and "minute_end" not in p:
                    p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = self.root["expandedMaxMinute"] - p["minute_start"]
                # Substituted in and out
                elif "minute_start" in p and "minute_end" in p:
                    p["minutes_played"] = p["minute_end"] - p["minute_start"]

                players_gamestats[(self.game_id, player_id)] = p
        return players_gamestats
417
+
418
+
419
+ def _camel_to_snake(name: str) -> str:
420
+ s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
421
+ return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
data/opta/schema.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SPADL schema for Opta data."""
2
+
3
+ from typing import Optional
4
+
5
+ import pandas as pd
6
+ import pandera as pa
7
+ from pandera.typing import DateTime, Object, Series
8
+
9
+ from socceraction.data.schema import (
10
+ CompetitionSchema,
11
+ EventSchema,
12
+ GameSchema,
13
+ PlayerSchema,
14
+ TeamSchema,
15
+ )
16
+
17
+
18
class OptaCompetitionSchema(CompetitionSchema):
    """Definition of a dataframe containing a list of competitions and seasons.

    Declares no fields of its own; all columns come from ``CompetitionSchema``.
    """
20
+
21
+
22
class OptaGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games.

    Extends ``GameSchema`` with columns that are all optional and nullable,
    since not every parser provides them.
    """

    home_score: Optional[Series[int]] = pa.Field(nullable=True)
    """The final score of the home team."""
    away_score: Optional[Series[int]] = pa.Field(nullable=True)
    """The final score of the away team."""
    duration: Optional[Series[int]] = pa.Field(nullable=True)
    """The total duration of the game in minutes."""
    referee: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the referee."""
    venue: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the stadium where the game was played."""
    attendance: Optional[Series[int]] = pa.Field(nullable=True)
    """The number of people who attended the game."""
    home_manager: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the manager of the home team."""
    away_manager: Optional[Series[str]] = pa.Field(nullable=True)
    """The name of the manager of the away team."""
41
+
42
+
43
class OptaPlayerSchema(PlayerSchema):
    """Definition of a dataframe containing the list of players of a game.

    Extends ``PlayerSchema`` with the required ``starting_position`` column.
    """

    starting_position: Series[str]
    """The starting position of the player."""
48
+
49
+
50
class OptaTeamSchema(TeamSchema):
    """Definition of a dataframe containing the list of teams of a game.

    Declares no fields of its own; all columns come from ``TeamSchema``.
    """
52
+
53
+
54
class OptaEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game.

    Extends ``EventSchema`` with the timing, outcome, location and qualifier
    columns required for Opta events, plus optional flag columns.
    """

    timestamp: Series[DateTime]
    """Time in the match the event takes place, recorded to the millisecond."""
    minute: Series[int]
    """The minutes on the clock at the time of this event."""
    second: Series[int] = pa.Field(ge=0, le=59)
    """The second part of the timestamp."""
    outcome: Series[bool]
    """Whether the event had a successful outcome or not."""
    start_x: Series[float] = pa.Field(nullable=True)
    """The x coordinate of the location where the event started."""
    start_y: Series[float] = pa.Field(nullable=True)
    """The y coordinate of the location where the event started."""
    end_x: Series[float] = pa.Field(nullable=True)
    """The x coordinate of the location where the event ended."""
    end_y: Series[float] = pa.Field(nullable=True)
    """The y coordinate of the location where the event ended."""
    qualifiers: Series[Object]
    """A JSON object containing the Opta qualifiers of the event."""
    # The remaining columns are optional: in this file the MA3 parser emits
    # assist/keypass, while the WhoScored parser emits
    # goal/shot/touch/related_player_id.
    assist: Optional[Series[bool]]
    """Whether the event was an assist or not."""
    keypass: Optional[Series[bool]]
    """Whether the event was a keypass or not."""
    goal: Optional[Series[bool]]
    """Whether the event was a goal or not."""
    shot: Optional[Series[bool]]
    """Whether the event was a shot or not."""
    touch: Optional[Series[bool]]
    """Whether the event was a on-the-ball action or not."""
    related_player_id: Optional[Series[pd.Int64Dtype]] = pa.Field(nullable=True)
    """The ID of a second player that was involved in this event."""
data/schema.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base schemas used by all event stream serializers.
2
+
3
+ Each serializer should create dataframes that contain at least the fields
4
+ included in these base schemas. Each serializer can add different additional
5
+ fields on top.
6
+
7
+ """
8
+
9
+ import pandas as pd
10
+ import pandera as pa
11
+ from pandera.typing import DateTime, Object, Series
12
+
13
+
14
class CompetitionSchema(pa.SchemaModel):
    """Definition of a dataframe containing a list of competitions and seasons.

    This is the minimal set of columns; provider-specific schemas subclass
    this model to declare additional columns.
    """

    season_id: Series[Object] = pa.Field()
    """The unique identifier for the season."""
    season_name: Series[str] = pa.Field()
    """The name of the season."""
    competition_id: Series[Object] = pa.Field()
    """The unique identifier for the competition."""
    competition_name: Series[str] = pa.Field()
    """The name of the competition."""

    class Config:  # noqa: D106
        # Reject dataframes that contain columns not declared in the schema.
        strict = True
        # Cast every column to its declared dtype during validation.
        coerce = True
29
+
30
+
31
class GameSchema(pa.SchemaModel):
    """Definition of a dataframe containing a list of games.

    This is the minimal set of columns; provider-specific schemas subclass
    this model to declare additional columns.
    """

    game_id: Series[Object] = pa.Field()
    """The unique identifier for the game."""
    season_id: Series[Object] = pa.Field()
    """The unique identifier for the season."""
    competition_id: Series[Object] = pa.Field()
    """The unique identifier for the competition."""
    # Nullable integer dtype: the game day may be missing for a game.
    game_day: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    """Number corresponding to the weeks or rounds into the competition this game is."""
    game_date: Series[DateTime] = pa.Field()
    """The date when the game was played."""
    home_team_id: Series[Object] = pa.Field()
    """The unique identifier for the home team in this game."""
    away_team_id: Series[Object] = pa.Field()
    """The unique identifier for the away team in this game."""

    class Config:  # noqa: D106
        # Reject dataframes that contain columns not declared in the schema.
        strict = True
        # Cast every column to its declared dtype during validation.
        coerce = True
52
+
53
+
54
class TeamSchema(pa.SchemaModel):
    """Definition of a dataframe containing the list of teams of a game.

    This is the minimal set of columns; provider-specific schemas subclass
    this model to declare additional columns.
    """

    team_id: Series[Object] = pa.Field()
    """The unique identifier for the team."""
    team_name: Series[str] = pa.Field()
    """The name of the team."""

    class Config:  # noqa: D106
        # Reject dataframes that contain columns not declared in the schema.
        strict = True
        # Cast every column to its declared dtype during validation.
        coerce = True
65
+
66
+
67
class PlayerSchema(pa.SchemaModel):
    """Definition of a dataframe containing the list of players on the teamsheet of a game.

    This is the minimal set of columns; provider-specific schemas subclass
    this model to declare additional columns. None of the base columns is
    declared nullable.
    """

    game_id: Series[Object] = pa.Field()
    """The unique identifier for the game."""
    team_id: Series[Object] = pa.Field()
    """The unique identifier for the player's team."""
    player_id: Series[Object] = pa.Field()
    """The unique identifier for the player."""
    player_name: Series[str] = pa.Field()
    """The name of the player."""
    is_starter: Series[bool] = pa.Field()
    """Whether the player is in the starting lineup."""
    minutes_played: Series[int] = pa.Field()
    """The number of minutes the player played in the game."""
    jersey_number: Series[int] = pa.Field()
    """The player's jersey number."""

    class Config:  # noqa: D106
        # Reject dataframes that contain columns not declared in the schema.
        strict = True
        # Cast every column to its declared dtype during validation.
        coerce = True
88
+
89
+
90
class EventSchema(pa.SchemaModel):
    """Definition of a dataframe containing event stream data of a game.

    This is the minimal set of columns; provider-specific schemas subclass
    this model to declare additional columns.
    """

    game_id: Series[Object] = pa.Field()
    """The unique identifier for the game."""
    event_id: Series[Object] = pa.Field()
    """The unique identifier for the event."""
    period_id: Series[int] = pa.Field()
    """The unique identifier for the part of the game in which the event took place."""
    # Team and player are nullable: not every event is tied to one.
    team_id: Series[Object] = pa.Field(nullable=True)
    """The unique identifier for the team this event relates to."""
    player_id: Series[Object] = pa.Field(nullable=True)
    """The unique identifier for the player this event relates to."""
    type_id: Series[int] = pa.Field()
    """The unique identifier for the type of this event."""
    type_name: Series[str] = pa.Field()
    """The name of the type of this event."""

    class Config:  # noqa: D106
        # Reject dataframes that contain columns not declared in the schema.
        strict = True
        # Cast every column to its declared dtype during validation.
        coerce = True
data/statsbomb/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for loading StatsBomb event data."""
2
+
3
+ __all__ = [
4
+ "StatsBombLoader",
5
+ "extract_player_games",
6
+ "StatsBombCompetitionSchema",
7
+ "StatsBombGameSchema",
8
+ "StatsBombPlayerSchema",
9
+ "StatsBombTeamSchema",
10
+ "StatsBombEventSchema",
11
+ ]
12
+
13
+ from .loader import StatsBombLoader, extract_player_games
14
+ from .schema import (
15
+ StatsBombCompetitionSchema,
16
+ StatsBombEventSchema,
17
+ StatsBombGameSchema,
18
+ StatsBombPlayerSchema,
19
+ StatsBombTeamSchema,
20
+ )
data/statsbomb/loader.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements serializers for StatsBomb data."""
2
+
3
+ import os
4
+ from typing import Any, Optional, cast
5
+
6
+ import pandas as pd # type: ignore
7
+ from pandera.typing import DataFrame
8
+
9
+ try:
10
+ from statsbombpy import sb
11
+ except ImportError:
12
+ sb = None
13
+
14
+ from socceraction.data.base import (
15
+ EventDataLoader,
16
+ ParseError,
17
+ _expand_minute,
18
+ _localloadjson,
19
+ )
20
+
21
+ from .schema import (
22
+ StatsBombCompetitionSchema,
23
+ StatsBombEventSchema,
24
+ StatsBombGameSchema,
25
+ StatsBombPlayerSchema,
26
+ StatsBombTeamSchema,
27
+ )
28
+
29
+
30
class StatsBombLoader(EventDataLoader):
    """Load StatsBomb data either from a remote location or from a local folder.

    To load remote data, this loader uses the `statsbombpy
    <https://github.com/statsbomb/statsbombpy>`__ package. Data can be retrieved
    from the StatsBomb API and from the `Open Data GitHub repo
    <https://github.com/statsbomb/open-data/>`__.
    API access is for paying customers only. Authentication can be done by
    setting environment variables named ``SB_USERNAME`` and ``SB_PASSWORD`` to
    your login credentials. Alternatively, pass your login credentials using
    the ``creds`` parameter.
    StatsBomb's open data can be accessed without the need of authentication
    but its use is subject to a `user agreement
    <https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf>`__.

    To load local data, point ``root`` to the root folder of the data. This folder
    should use the same directory structure as used in the Open Data GitHub repo.

    Parameters
    ----------
    getter : str
        "remote" or "local"
    root : str, optional
        Root-path of the data. Only used when getter is "local".
    creds: dict, optional
        Login credentials in the format {"user": "", "passwd": ""}. Only used
        when getter is "remote".

    Raises
    ------
    ImportError
        If getter is "remote" and the 'statsbombpy' package is not installed.
    ValueError
        If getter is "local" and no ``root`` is given, or if getter is neither
        "remote" nor "local".
    """

    def __init__(
        self,
        getter: str = "remote",
        root: Optional[str] = None,
        creds: Optional[dict[str, str]] = None,
    ) -> None:
        if getter == "remote":
            # `sb` is None when the optional statsbombpy import failed at module load.
            if sb is None:
                raise ImportError(
                    """The 'statsbombpy' package is required. Install with 'pip install statsbombpy'."""
                )
            # Fall back to statsbombpy's default credentials when none (or an
            # empty dict) are passed.
            self._creds = creds or sb.DEFAULT_CREDS
            self._local = False
        elif getter == "local":
            if root is None:
                raise ValueError("""The 'root' parameter is required when loading local data.""")
            self._local = True
            self._root = root
        else:
            raise ValueError("Invalid getter specified")

    def competitions(self) -> DataFrame[StatsBombCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.data.statsbomb.StatsBombCompetitionSchema` for the schema.
        """
        cols = [
            "season_id",
            "competition_id",
            "competition_name",
            "country_name",
            "competition_gender",
            "season_name",
        ]
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "competitions.json")))
        else:
            obj = list(sb.competitions(fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of competitions")
        if len(obj) == 0:
            # No data: return an empty frame that still has the expected columns.
            return cast(DataFrame[StatsBombCompetitionSchema], pd.DataFrame(columns=cols))
        # `cast` only informs the type checker; the frame is not validated here.
        return cast(DataFrame[StatsBombCompetitionSchema], pd.DataFrame(obj)[cols])

    def games(self, competition_id: int, season_id: int) -> DataFrame[StatsBombGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.data.statsbomb.StatsBombGameSchema` for the schema.
        """
        cols = [
            "game_id",
            "season_id",
            "competition_id",
            "competition_stage",
            "game_day",
            "game_date",
            "home_team_id",
            "away_team_id",
            "home_score",
            "away_score",
            "venue",
            "referee",
        ]
        if self._local:
            obj = _localloadjson(
                str(os.path.join(self._root, "matches", f"{competition_id}", f"{season_id}.json"))
            )
        else:
            obj = list(
                sb.matches(competition_id, season_id, fmt="dict", creds=self._creds).values()
            )
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of games")
        if len(obj) == 0:
            # No data: return an empty frame that still has the expected columns.
            return cast(DataFrame[StatsBombGameSchema], pd.DataFrame(columns=cols))
        # Flatten the nested match records into one flat dict per game.
        gamesdf = pd.DataFrame(_flatten(m) for m in obj)
        # Games without a known kick-off time are placed at noon.
        gamesdf["kick_off"] = gamesdf["kick_off"].fillna("12:00:00.000")
        # Combine the date and kick-off strings into a single datetime.
        gamesdf["match_date"] = pd.to_datetime(
            gamesdf[["match_date", "kick_off"]].agg(" ".join, axis=1)
        )
        gamesdf.rename(
            columns={
                "match_id": "game_id",
                "match_date": "game_date",
                "match_week": "game_day",
                "stadium_name": "venue",
                "referee_name": "referee",
                "competition_stage_name": "competition_stage",
            },
            inplace=True,
        )
        # Venue and referee are absent when no record provided them; add empty
        # columns so that the column selection below does not fail.
        if "venue" not in gamesdf:
            gamesdf["venue"] = None
        if "referee" not in gamesdf:
            gamesdf["referee"] = None
        return cast(DataFrame[StatsBombGameSchema], gamesdf[cols])

    def _lineups(self, game_id: int) -> list[dict[str, Any]]:
        """Load the raw lineup records of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the retrieved data is not a list with exactly two teams.

        Returns
        -------
        list of dict
            The two raw team lineup records.
        """
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "lineups", f"{game_id}.json")))
        else:
            obj = list(sb.lineups(game_id, fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of teams")
        if len(obj) != 2:
            raise ParseError("The retrieved data should contain two teams")
        return obj

    def teams(self, game_id: int) -> DataFrame[StatsBombTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError  # noqa: DAR402
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.data.statsbomb.StatsBombTeamSchema` for the schema.
        """
        cols = ["team_id", "team_name"]
        obj = self._lineups(game_id)
        return cast(DataFrame[StatsBombTeamSchema], pd.DataFrame(obj)[cols])

    def players(self, game_id: int) -> DataFrame[StatsBombPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        The lineup data is combined with the playing time and starting
        position that are derived from the game's event stream.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError  # noqa: DAR402
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.data.statsbomb.StatsBombPlayerSchema` for the schema.
        """
        cols = [
            "game_id",
            "team_id",
            "player_id",
            "player_name",
            "nickname",
            "jersey_number",
            "is_starter",
            "starting_position_id",
            "starting_position_name",
            "minutes_played",
        ]

        obj = self._lineups(game_id)
        # One row per player listed in either team's lineup.
        playersdf = pd.DataFrame(_flatten_id(p) for lineup in obj for p in lineup["lineup"])
        # Minutes played and starting positions are derived from the events.
        playergamesdf = extract_player_games(self.events(game_id))
        # Default (inner) merge: only players present in both frames are kept.
        playersdf = pd.merge(
            playersdf,
            playergamesdf[
                ["player_id", "team_id", "position_id", "position_name", "minutes_played"]
            ],
            on="player_id",
        )
        playersdf["game_id"] = game_id
        # extract_player_games fills missing values with 0, so players without
        # a starting position (substitutes) carry 0 in the position columns.
        playersdf["position_name"] = playersdf["position_name"].replace(0, "Substitute")
        playersdf["position_id"] = playersdf["position_id"].fillna(0).astype(int)
        playersdf["is_starter"] = playersdf["position_id"] != 0
        playersdf.rename(
            columns={
                "player_nickname": "nickname",
                "country_name": "country",
                "position_id": "starting_position_id",
                "position_name": "starting_position_name",
            },
            inplace=True,
        )
        return cast(DataFrame[StatsBombPlayerSchema], playersdf[cols])

    def events(self, game_id: int, load_360: bool = False) -> DataFrame[StatsBombEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.
        load_360 : bool
            Whether to load the 360 data.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.data.statsbomb.StatsBombEventSchema` for the schema.
            When ``load_360`` is true and the game has events, the columns
            ``visible_area_360`` and ``freeze_frame_360`` are added.
        """
        cols = [
            "game_id",
            "event_id",
            "period_id",
            "team_id",
            "player_id",
            "type_id",
            "type_name",
            "index",
            "timestamp",
            "minute",
            "second",
            "possession",
            "possession_team_id",
            "possession_team_name",
            "play_pattern_id",
            "play_pattern_name",
            "team_name",
            "duration",
            "extra",
            "related_events",
            "player_name",
            "position_id",
            "position_name",
            "location",
            "under_pressure",
            "counterpress",
        ]
        # Load the events
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "events", f"{game_id}.json")))
        else:
            obj = list(sb.events(game_id, fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of events")
        if len(obj) == 0:
            # No events: the empty frame has no 360 columns, even if load_360 is set.
            return cast(DataFrame[StatsBombEventSchema], pd.DataFrame(columns=cols))

        eventsdf = pd.DataFrame(_flatten_id(e) for e in obj)
        eventsdf["match_id"] = game_id
        eventsdf["timestamp"] = pd.to_timedelta(eventsdf["timestamp"])
        # Events without related events get an empty list instead of NaN.
        eventsdf["related_events"] = eventsdf["related_events"].apply(
            lambda d: d if isinstance(d, list) else []
        )
        # Missing flags are treated as False.
        eventsdf["under_pressure"] = eventsdf["under_pressure"].fillna(False).astype(bool)
        eventsdf["counterpress"] = eventsdf["counterpress"].fillna(False).astype(bool)
        eventsdf.rename(
            columns={"id": "event_id", "period": "period_id", "match_id": "game_id"},
            inplace=True,
        )
        if not load_360:
            return cast(DataFrame[StatsBombEventSchema], eventsdf[cols])

        # Load the 360 data
        cols_360 = ["visible_area_360", "freeze_frame_360"]
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "three-sixty", f"{game_id}.json")))
        else:
            obj = sb.frames(game_id, fmt="dict", creds=self._creds)
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of frames")
        if len(obj) == 0:
            # No frames available: keep the 360 columns but leave them empty.
            eventsdf["visible_area_360"] = None
            eventsdf["freeze_frame_360"] = None
            return cast(DataFrame[StatsBombEventSchema], eventsdf[cols + cols_360])
        framesdf = pd.DataFrame(obj).rename(
            columns={
                "event_uuid": "event_id",
                "visible_area": "visible_area_360",
                "freeze_frame": "freeze_frame_360",
            },
        )[["event_id", "visible_area_360", "freeze_frame_360"]]
        # Left join: every event is kept, events without a frame get missing values.
        return cast(
            DataFrame[StatsBombEventSchema],
            pd.merge(eventsdf, framesdf, on="event_id", how="left")[cols + cols_360],
        )
369
+
370
+
371
def extract_player_games(events: pd.DataFrame) -> pd.DataFrame:
    """Extract player games [player_id, game_id, minutes_played] from statsbomb match events.

    Playing time is derived from the "Half End", "Starting XI" and
    "Substitution" events, and is cut short for players who received a red
    card or a second yellow card. A player who comes on as a substitute and
    is later substituted off is credited only with the time between both
    substitutions.

    Parameters
    ----------
    events : pd.DataFrame
        DataFrame containing StatsBomb events of a single game.

    Returns
    -------
    player_games : pd.DataFrame
        A DataFrame with the number of minutes played by each player during the game.
        Missing values are filled with 0 and all columns with "_id" in their
        name are cast to int.
    """
    # Regular length of each period. Shoot-outs (period 5) should not
    # contribute to minutes played, so they have no entry here.
    periods = pd.DataFrame(
        [
            {"period_id": 1, "minute": 45},
            {"period_id": 2, "minute": 45},
            {"period_id": 3, "minute": 15},
            {"period_id": 4, "minute": 15},
        ]
    ).set_index("period_id")
    # Actual length of each period: the minute of its "Half End" event minus
    # the regular length of all preceding periods. Periods without an entry
    # in `periods` yield NaN after the index-aligned subtraction and are dropped.
    periods_minutes = (
        events.loc[events.type_name == "Half End", ["period_id", "minute"]]
        .drop_duplicates()
        .set_index("period_id")
        .sort_index()
        .subtract(periods.cumsum().shift(1).fillna(0))
        .minute.dropna()
        .astype(int)
        .tolist()
    )
    # get duration of entire match
    game_minutes = sum(periods_minutes)

    game_id = events.game_id.mode().values[0]
    players = {}
    # Expanded minute at which each player entered the pitch (0 for starters).
    # Kept outside `players` so that no extra column ends up in the result.
    entered_at = {}
    # Red cards
    red_cards = events[
        events.apply(
            lambda x: any(
                e in x.extra
                and "card" in x.extra[e]
                and x.extra[e]["card"]["name"] in ["Second Yellow", "Red Card"]
                for e in ["foul_committed", "bad_behaviour"]
            ),
            axis=1,
        )
    ]
    # stats for starting XI
    for startxi in events[events.type_name == "Starting XI"].itertuples():
        team_id, team_name = startxi.team_id, startxi.team_name
        for player in startxi.extra["tactics"]["lineup"]:
            player = _flatten_id(player)
            player = {
                **player,
                **{
                    "game_id": game_id,
                    "team_id": team_id,
                    "team_name": team_name,
                    "minutes_played": game_minutes,
                },
            }
            player_red_card = red_cards[red_cards.player_id == player["player_id"]]
            if len(player_red_card) > 0:
                # A sent-off starter played until the minute of the card.
                red_card_minute = player_red_card.iloc[0].minute
                player["minutes_played"] = _expand_minute(red_card_minute, periods_minutes)
            players[player["player_id"]] = player
            entered_at[player["player_id"]] = 0
    # stats for substitutions
    for substitution in events[events.type_name == "Substitution"].itertuples():
        exp_sub_minute = _expand_minute(substitution.minute, periods_minutes)
        replacement = {
            "player_id": substitution.extra["substitution"]["replacement"]["id"],
            "player_name": substitution.extra["substitution"]["replacement"]["name"],
            "minutes_played": game_minutes - exp_sub_minute,
            "team_id": substitution.team_id,
            "game_id": game_id,
            "team_name": substitution.team_name,
        }
        player_red_card = red_cards[red_cards.player_id == replacement["player_id"]]
        if len(player_red_card) > 0:
            # A sent-off substitute played from his entry until the card.
            red_card_minute = player_red_card.iloc[0].minute
            replacement["minutes_played"] = (
                _expand_minute(red_card_minute, periods_minutes) - exp_sub_minute
            )
        players[replacement["player_id"]] = replacement
        entered_at[replacement["player_id"]] = exp_sub_minute
        # The replaced player was on the pitch from his own entry (0 for a
        # starter) until this substitution. Subtracting the entry minute
        # prevents over-counting for a substitute who is substituted off again.
        players[substitution.player_id]["minutes_played"] = exp_sub_minute - entered_at.get(
            substitution.player_id, 0
        )
    pg = pd.DataFrame(players.values()).fillna(0)
    for col in pg.columns:
        if "_id" in col:
            pg[col] = pg[col].astype(int)  # pylint: disable=E1136,E1137
    return pg
465
+
466
+
467
+ def _flatten_id(d: dict[str, dict[str, Any]]) -> dict[str, Any]:
468
+ newd = {}
469
+ extra = {}
470
+ for k, v in d.items():
471
+ if isinstance(v, dict):
472
+ if "id" in v and "name" in v:
473
+ newd[k + "_id"] = v["id"]
474
+ newd[k + "_name"] = v["name"]
475
+ else:
476
+ extra[k] = v
477
+ else:
478
+ newd[k] = v
479
+ newd["extra"] = extra
480
+ return newd
481
+
482
+
483
+ def _flatten(d: dict[str, dict[str, Any]]) -> dict[str, Any]:
484
+ newd = {}
485
+ for k, v in d.items():
486
+ if isinstance(v, dict):
487
+ if "id" in v and "name" in v:
488
+ newd[k + "_id"] = v["id"]
489
+ newd[k + "_name"] = v["name"]
490
+ newd[k + "_extra"] = {l: w for (l, w) in v.items() if l in ("id", "name")}
491
+ else:
492
+ newd = {**newd, **_flatten(v)}
493
+ else:
494
+ newd[k] = v
495
+ return newd
data/statsbomb/schema.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SPADL schema for StatsBomb data."""
2
+
3
+ from typing import Optional
4
+
5
+ import pandera as pa
6
+ from pandera.typing import Object, Series, Timedelta
7
+
8
+ from socceraction.data.schema import (
9
+ CompetitionSchema,
10
+ EventSchema,
11
+ GameSchema,
12
+ PlayerSchema,
13
+ TeamSchema,
14
+ )
15
+
16
+
17
class StatsBombCompetitionSchema(CompetitionSchema):
    """Definition of a dataframe containing a list of competitions and seasons.

    Extends the ``CompetitionSchema`` base class with StatsBomb-specific columns.
    """

    country_name: Series[str]
    """The name of the country the competition relates to."""
    competition_gender: Series[str]
    """The gender of the players competing in the competition."""
24
+
25
+
26
class StatsBombGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games.

    Extends the ``GameSchema`` base class with StatsBomb-specific columns.
    """

    competition_stage: Series[str]
    """The name of the phase of the competition this game is in."""
    home_score: Series[int]
    """The final score of the home team."""
    away_score: Series[int]
    """The final score of the away team."""
    # Venue and referee are nullable: they may be unknown for a game.
    venue: Series[str] = pa.Field(nullable=True)
    """The name of the stadium where the game was played."""
    referee: Series[str] = pa.Field(nullable=True)
    """The name of the referee."""
39
+
40
+
41
class StatsBombPlayerSchema(PlayerSchema):
    """Definition of a dataframe containing the list of players of a game.

    Extends the ``PlayerSchema`` base class with StatsBomb-specific columns.
    """

    # Nullable: a player may have no nickname.
    nickname: Series[str] = pa.Field(nullable=True)
    """The nickname of the player on the team."""
    starting_position_id: Series[int]
    """The unique identifier for the starting position of the player on the team."""
    starting_position_name: Series[str]
    """The name of the starting position of the player on the team."""
50
+
51
+
52
class StatsBombTeamSchema(TeamSchema):
    """Definition of a dataframe containing the list of teams of a game.

    Declares no columns of its own; the accepted columns are exactly those
    of the ``TeamSchema`` base class.
    """
54
+
55
+
56
class StatsBombEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game.

    Extends the ``EventSchema`` base class with StatsBomb-specific columns.
    The two ``*_360`` columns are optional and do not have to be present in
    the dataframe.
    """

    index: Series[int]
    """Sequence notation for the ordering of events within each match."""
    # Stored as a timedelta, not as a wall-clock datetime.
    timestamp: Series[Timedelta]
    """Time in the match the event takes place, recorded to the millisecond."""
    minute: Series[int]
    """The minutes on the clock at the time of this event."""
    # Restricted to a valid clock value (0-59).
    second: Series[int] = pa.Field(ge=0, le=59)
    """The second part of the timestamp."""
    possession: Series[int]
    """Indicates the current unique possession in the game."""
    possession_team_id: Series[int]
    """The ID of the team that started this possession in control of the ball."""
    possession_team_name: Series[str]
    """The name of the team that started this possession in control of the ball."""
    play_pattern_id: Series[int]
    """The ID of the play pattern relevant to this event."""
    play_pattern_name: Series[str]
    """The name of the play pattern relevant to this event."""
    team_name: Series[str]
    """The name of the team this event relates to."""
    duration: Series[float] = pa.Field(nullable=True)
    """If relevant, the length in seconds the event lasted."""
    # NOTE: the StatsBomb loader stores a Python dict here (the nested
    # attributes that could not be flattened), not a serialized JSON string.
    extra: Series[Object]
    """A JSON string containing type-specific information."""
    # NOTE: the StatsBomb loader stores a Python list here (empty when the
    # event has no related events), not a comma separated string.
    related_events: Series[Object]
    """A comma separated list of the IDs of related events."""
    player_name: Series[str] = pa.Field(nullable=True)
    """The name of the player this event relates to."""
    # Float rather than int so that missing positions can be represented.
    position_id: Series[float] = pa.Field(nullable=True)
    """The ID of the position the player was in at the time of this event."""
    position_name: Series[str] = pa.Field(nullable=True)
    """The name of the position the player was in at the time of this event."""
    location: Series[Object] = pa.Field(nullable=True)
    """Array containing the x and y coordinates of the event."""
    under_pressure: Series[bool] = pa.Field(nullable=True)
    """Whether the action was performed while being pressured by an opponent."""
    counterpress: Series[bool] = pa.Field(nullable=True)
    """Pressing actions within 5 seconds of an open play turnover."""
    # Only present when the events were loaded together with the 360 data.
    visible_area_360: Optional[Series[Object]] = pa.Field(nullable=True)
    """An array of coordinates describing the polygon visible to the camera / in the 360 frame."""
    freeze_frame_360: Optional[Series[Object]] = pa.Field(nullable=True)
    """An array of freeze frame objects."""
data/wyscout/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for loading Wyscout event data."""
2
+
3
+ __all__ = [
4
+ "PublicWyscoutLoader",
5
+ "WyscoutLoader",
6
+ "WyscoutCompetitionSchema",
7
+ "WyscoutGameSchema",
8
+ "WyscoutPlayerSchema",
9
+ "WyscoutTeamSchema",
10
+ "WyscoutEventSchema",
11
+ ]
12
+
13
+ from .loader import PublicWyscoutLoader, WyscoutLoader
14
+ from .schema import (
15
+ WyscoutCompetitionSchema,
16
+ WyscoutEventSchema,
17
+ WyscoutGameSchema,
18
+ WyscoutPlayerSchema,
19
+ WyscoutTeamSchema,
20
+ )
data/wyscout/loader.py ADDED
@@ -0,0 +1,849 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements serializers for Wyscout data."""
2
+
3
+ import glob
4
+ import os
5
+ import re
6
+ import warnings
7
+ from pathlib import Path
8
+ from typing import Any, Callable, Optional, Union, cast
9
+ from urllib.error import HTTPError
10
+ from urllib.parse import urlparse
11
+ from urllib.request import urlopen, urlretrieve
12
+ from zipfile import ZipFile, is_zipfile
13
+
14
+ import pandas as pd # type: ignore
15
+ from pandera.typing import DataFrame
16
+
17
+ from ..base import (
18
+ EventDataLoader,
19
+ JSONType,
20
+ MissingDataError,
21
+ ParseError,
22
+ _auth_remoteloadjson,
23
+ _expand_minute,
24
+ _has_auth,
25
+ _localloadjson,
26
+ _remoteloadjson,
27
+ )
28
+ from .schema import (
29
+ WyscoutCompetitionSchema,
30
+ WyscoutEventSchema,
31
+ WyscoutGameSchema,
32
+ WyscoutPlayerSchema,
33
+ WyscoutTeamSchema,
34
+ )
35
+
36
+
37
class PublicWyscoutLoader(EventDataLoader):
    """
    Load the public Wyscout dataset.

    This dataset is a public release of event stream data, collected by Wyscout
    (https://wyscout.com/) containing all matches of the 2017/18 season of the
    top-5 European leagues (La Liga, Serie A, Bundesliga, Premier League, Ligue
    1), the FIFA World Cup 2018, and UEFA Euro Cup 2016. For a detailed
    description, see Pappalardo et al. [1]_.

    Parameters
    ----------
    root : str
        Path where a local copy of the dataset is stored or where the
        downloaded dataset should be stored. Defaults to a ``wyscout_data``
        folder in the current working directory. The directory is created
        when it does not exist yet.
    download : bool
        Whether to force a redownload of the data. The data is always
        downloaded when ``root`` is empty.

    References
    ----------
    .. [1] Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of
        spatio-temporal match events in soccer competitions. Sci Data 6, 236
        (2019). https://doi.org/10.1038/s41597-019-0247-7
    """

    def __init__(self, root: Optional[str] = None, download: bool = False) -> None:
        if root is None:
            root = os.path.join(os.getcwd(), "wyscout_data")
        self.root = root
        # Create the folder for user-supplied roots as well: otherwise the
        # emptiness check below raises FileNotFoundError instead of
        # triggering the download.
        os.makedirs(self.root, exist_ok=True)

        self.get = _localloadjson

        if download or not os.listdir(self.root):
            self._download_repo()

        # Maps each (competition, season) to the JSON files that hold its
        # matches and events.
        self._index = pd.DataFrame(
            [
                {
                    "competition_id": 524,
                    "season_id": 181248,
                    "season_name": "2017/2018",
                    "db_matches": "matches_Italy.json",
                    "db_events": "events_Italy.json",
                },
                {
                    "competition_id": 364,
                    "season_id": 181150,
                    "season_name": "2017/2018",
                    "db_matches": "matches_England.json",
                    "db_events": "events_England.json",
                },
                {
                    "competition_id": 795,
                    "season_id": 181144,
                    "season_name": "2017/2018",
                    "db_matches": "matches_Spain.json",
                    "db_events": "events_Spain.json",
                },
                {
                    "competition_id": 412,
                    "season_id": 181189,
                    "season_name": "2017/2018",
                    "db_matches": "matches_France.json",
                    "db_events": "events_France.json",
                },
                {
                    "competition_id": 426,
                    "season_id": 181137,
                    "season_name": "2017/2018",
                    "db_matches": "matches_Germany.json",
                    "db_events": "events_Germany.json",
                },
                {
                    "competition_id": 102,
                    "season_id": 9291,
                    "season_name": "2016",
                    "db_matches": "matches_European_Championship.json",
                    "db_events": "events_European_Championship.json",
                },
                {
                    "competition_id": 28,
                    "season_id": 10078,
                    "season_name": "2018",
                    "db_matches": "matches_World_Cup.json",
                    "db_events": "events_World_Cup.json",
                },
            ]
        ).set_index(["competition_id", "season_id"])
        self._match_index = self._create_match_index().set_index("match_id")
        # Holds the most recently parsed events file ({"path", "events"}).
        self._cache: Optional[dict[str, Any]] = None

    def _download_repo(self) -> None:
        """Download all dataset files into ``self.root`` and unzip archives."""
        dataset_urls = {
            "competitions": "https://ndownloader.figshare.com/files/15073685",
            "teams": "https://ndownloader.figshare.com/files/15073697",
            "players": "https://ndownloader.figshare.com/files/15073721",
            "matches": "https://ndownloader.figshare.com/files/14464622",
            "events": "https://ndownloader.figshare.com/files/14464685",
        }
        # download and unzip Wyscout open data
        for url in dataset_urls.values():
            # The final (redirected) URL carries the real file name. Close the
            # probing response immediately instead of leaking the connection.
            with urlopen(url) as response:
                url_obj = response.geturl()
            path = Path(urlparse(url_obj).path)
            file_name = os.path.join(self.root, path.name)
            file_local, _ = urlretrieve(url_obj, file_name)
            if is_zipfile(file_local):
                with ZipFile(file_local) as zip_file:
                    zip_file.extractall(self.root)

    def _create_match_index(self) -> pd.DataFrame:
        """Build a table that links every match ID to its entry in ``self._index``."""
        df_matches = pd.concat(
            [pd.DataFrame(self.get(path)) for path in glob.iglob(f"{self.root}/matches_*.json")]
        ).rename(
            columns={
                "wyId": "match_id",
                "competitionId": "competition_id",
                "seasonId": "season_id",
            }
        )
        return pd.merge(
            df_matches[["match_id", "competition_id", "season_id"]],
            self._index,
            on=["competition_id", "season_id"],
            how="left",
        )

    def _db_path(self, game_id: int, db: str) -> str:
        """Return the path of the ``db`` file ("db_matches" or "db_events") of a game."""
        competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]]
        return os.path.join(self.root, self._index.at[(competition_id, season_id), db])

    def _load_events(self, game_id: int) -> pd.DataFrame:
        """Return the raw events of a game, with ``matchId`` as a regular column."""
        path = self._db_path(game_id, "db_events")
        if self._cache is None or self._cache["path"] != path:
            df_events = pd.DataFrame(self.get(path)).set_index("matchId")
            # avoid that this large json file has to be parsed again for
            # each game when loading a batch of games from the same season
            self._cache = {"path": path, "events": df_events}
        # Selecting with a list always yields a DataFrame, also when the game
        # has only a single event.
        return self._cache["events"].loc[[game_id]].reset_index()

    def competitions(self) -> DataFrame[WyscoutCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema.
        """
        path = os.path.join(self.root, "competitions.json")
        df_competitions = pd.DataFrame(self.get(path)).rename(
            columns={"wyId": "competition_id", "name": "competition_name"}
        )
        # Competitions without an area name are labelled "International".
        df_competitions["country_name"] = [
            area["name"] if area["name"] != "" else "International"
            for area in df_competitions["area"]
        ]
        df_competitions["competition_gender"] = "male"
        df_competitions = pd.merge(
            df_competitions,
            self._index.reset_index()[["competition_id", "season_id", "season_name"]],
            on="competition_id",
            how="left",
        )
        return cast(
            DataFrame[WyscoutCompetitionSchema],
            df_competitions[
                [
                    "competition_id",
                    "season_id",
                    "country_name",
                    "competition_name",
                    "competition_gender",
                    "season_name",
                ]
            ],
        )

    def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema.
        """
        path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_matches"])
        df_matches = pd.DataFrame(self.get(path))
        return cast(DataFrame[WyscoutGameSchema], _convert_games(df_matches))

    def _lineups(self, game_id: int) -> list[dict[str, Any]]:
        """Return the ``teamsData`` entries (one per team) of a game."""
        path = self._db_path(game_id, "db_matches")
        df_matches = pd.DataFrame(self.get(path)).set_index("wyId")
        return list(df_matches.at[game_id, "teamsData"].values())

    def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema.
        """
        path = os.path.join(self.root, "teams.json")
        df_teams = pd.DataFrame(self.get(path)).set_index("wyId")
        df_teams_match_id = pd.DataFrame(self._lineups(game_id))["teamId"]
        df_teams_match = df_teams.loc[df_teams_match_id].reset_index()
        return cast(DataFrame[WyscoutTeamSchema], _convert_teams(df_teams_match))

    def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema.
        """
        path = os.path.join(self.root, "players.json")
        df_players = pd.DataFrame(self.get(path)).set_index("wyId")
        lineups = self._lineups(game_id)
        players_match = []
        for team in lineups:
            formation = team["formation"]
            # Copy the lineup: the raw data is passed on to
            # `_get_minutes_played` below and must not gain extra entries.
            playerlist = list(formation["lineup"])
            if formation["substitutions"] != "null":
                for p in formation["substitutions"]:
                    substitute = next(
                        (
                            item
                            for item in formation["bench"]
                            if item["playerId"] == p["playerIn"]
                        ),
                        None,
                    )
                    if substitute is None:
                        warnings.warn(
                            f'A player with ID={p["playerIn"]} was substituted '
                            f'in the {p["minute"]}th minute of game {game_id}, but '
                            "could not be found on the bench."
                        )
                    else:
                        playerlist.append(substitute)
            df = pd.DataFrame(playerlist)
            df["side"] = team["side"]
            df["team_id"] = team["teamId"]
            players_match.append(df)
        df_players_match = (
            pd.concat(players_match)
            .rename(columns={"playerId": "wyId"})
            .set_index("wyId")
            .join(df_players, how="left")
            .reset_index()
        )
        # Names are stored with literal "\uXXXX" escape sequences; decode them.
        for c in ["shortName", "lastName", "firstName"]:
            df_players_match[c] = df_players_match[c].apply(
                lambda x: x.encode().decode("unicode-escape")
            )
        df_players_match = _convert_players(df_players_match)

        # get minutes played
        match_events = self._load_events(game_id).to_dict("records")
        mp = _get_minutes_played(lineups, match_events)
        df_players_match = pd.merge(df_players_match, mp, on="player_id", how="right")
        df_players_match["minutes_played"] = df_players_match.minutes_played.fillna(0)
        df_players_match["game_id"] = game_id
        return cast(DataFrame[WyscoutPlayerSchema], df_players_match)

    def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema.
        """
        return cast(DataFrame[WyscoutEventSchema], _convert_events(self._load_events(game_id)))
347
+
348
+
349
class WyscoutLoader(EventDataLoader):
    """Load event data either from a remote location or from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    getter : str or callable, default: "remote"
        "remote", "local" or a function that loads JSON data from a path.
    feeds : dict(str, str)
        Glob pattern for each feed that should be parsed. The default feeds for
        a "remote" getter are::

            {
                'seasons': 'competitions/{competition_id}/seasons?fetch=competition',
                'games': 'seasons/{season_id}/matches',
                'events': 'matches/{game_id}/events?fetch=teams,players,match,coaches,referees,formations,substitutions'
            }

        The default feeds for a "local" getter are::

            {
                'competitions': 'competitions.json',
                'seasons': 'seasons_{competition_id}.json',
                'games': 'matches_{season_id}.json',
                'events': 'matches/events_{game_id}.json',
            }

    creds: dict, optional
        Login credentials in the format {"user": "", "passwd": ""}. Only used
        when getter is "remote". When omitted, the credentials are read from
        the ``WY_USERNAME`` and ``WY_PASSWORD`` environment variables.

    Raises
    ------
    ValueError
        If a custom getter is given without specifying the feeds.
    """

    _wyscout_api: str = "https://apirest.wyscout.com/v2/"

    def __init__(
        self,
        root: str = _wyscout_api,
        getter: Union[str, Callable[[str], JSONType]] = "remote",
        feeds: Optional[dict[str, str]] = None,
        creds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root

        # Init credentials
        if creds is None:
            creds = {
                "user": os.environ.get("WY_USERNAME", ""),
                "passwd": os.environ.get("WY_PASSWORD", ""),
            }

        # Init getter
        if getter == "remote":
            self.get = _remoteloadjson
            if _has_auth(creds):
                _auth_remoteloadjson(creds["user"], creds["passwd"])
        elif getter == "local":
            self.get = _localloadjson
        else:
            self.get = getter  # type: ignore

        # Set up feeds
        if feeds is not None:
            self.feeds = feeds
        elif getter == "remote":
            self.feeds = {
                "seasons": "competitions/{competition_id}/seasons?fetch=competition",
                "games": "seasons/{season_id}/matches",
                "events": "matches/{game_id}/events?fetch=teams,players,match,coaches,referees,formations,substitutions",  # noqa: B950
            }
        elif getter == "local":
            self.feeds = {
                "competitions": "competitions.json",
                "seasons": "seasons_{competition_id}.json",
                "games": "matches_{season_id}.json",
                "events": "matches/events_{game_id}.json",
            }
        else:
            raise ValueError("No feeds specified.")

    def _get_file_or_url(
        self,
        feed: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> list[str]:
        """Resolve a feed pattern to one or more locations relative to ``self.root``.

        IDs that are not given are replaced by a ``*`` wildcard. If the
        resulting pattern contains a wildcard, it is expanded with ``glob``
        below the root; otherwise the formatted pattern itself is returned.

        Raises
        ------
        MissingDataError
            If a wildcard pattern does not match any file.
        """
        competition_id_glob = "*" if competition_id is None else competition_id
        season_id_glob = "*" if season_id is None else season_id
        game_id_glob = "*" if game_id is None else game_id
        glob_pattern = self.feeds[feed].format(
            competition_id=competition_id_glob, season_id=season_id_glob, game_id=game_id_glob
        )
        if "*" in glob_pattern:
            files = glob.glob(os.path.join(self.root, glob_pattern))
            if len(files) == 0:
                raise MissingDataError
            # Callers join every result with `self.root` again. Returning the
            # matches relative to the root keeps that correct for relative
            # roots too (absolute roots were unaffected). Sorting makes the
            # "first match" deterministic.
            return sorted(os.path.relpath(f, self.root) for f in files)
        return [glob_pattern]

    def competitions(
        self, competition_id: Optional[int] = None
    ) -> DataFrame[WyscoutCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Parameters
        ----------
        competition_id : int, optional
            The ID of the competition. When omitted, all available
            competitions are returned.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.
        MissingDataError
            When no data is available for the requested competition.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema.
        """
        # Get all competitions
        if "competitions" in self.feeds:
            competitions_url = self._get_file_or_url("competitions")[0]
            path = os.path.join(self.root, competitions_url)
            obj = self.get(path)
            if not isinstance(obj, dict) or "competitions" not in obj:
                raise ParseError(f"{path} should contain a list of competitions")
            # Honour the optional filter, like the wildcard branch below does.
            seasons_urls = [
                self._get_file_or_url("seasons", competition_id=c["wyId"])[0]
                for c in obj["competitions"]
                if competition_id is None or c["wyId"] == competition_id
            ]
            if competition_id is not None and not seasons_urls:
                raise MissingDataError
        else:
            seasons_urls = self._get_file_or_url("seasons", competition_id=competition_id)
        # Get seasons in each competition
        competitions = []
        seasons = []
        for seasons_url in seasons_urls:
            try:
                path = os.path.join(self.root, seasons_url)
                obj = self.get(path)
                if not isinstance(obj, dict) or "competition" not in obj or "seasons" not in obj:
                    raise ParseError(
                        f"{path} should contain a list of competition and list of seasons"
                    )
                competitions.append(obj["competition"])
                seasons.extend([s["season"] for s in obj["seasons"]])
            except FileNotFoundError:
                warnings.warn(f"File not found: {seasons_url}")
            except HTTPError:
                # Same best-effort handling of remote resources as in `games`.
                warnings.warn(f"Resource not found: {seasons_url}")
        df_competitions = _convert_competitions(pd.DataFrame(competitions))
        df_seasons = _convert_seasons(pd.DataFrame(seasons))
        # Merge into a single dataframe
        return cast(
            DataFrame[WyscoutCompetitionSchema],
            pd.merge(df_competitions, df_seasons, on="competition_id"),
        )

    def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema.
        """
        # Get all games
        if "games" in self.feeds:
            games_url = self._get_file_or_url(
                "games", competition_id=competition_id, season_id=season_id
            )[0]
            path = os.path.join(self.root, games_url)
            obj = self.get(path)
            if not isinstance(obj, dict) or "matches" not in obj:
                raise ParseError(f"{path} should contain a list of matches")
            gamedetails_urls = [
                self._get_file_or_url(
                    "events",
                    competition_id=competition_id,
                    season_id=season_id,
                    game_id=g["matchId"],
                )[0]
                for g in obj["matches"]
            ]
        else:
            gamedetails_urls = self._get_file_or_url(
                "events", competition_id=competition_id, season_id=season_id
            )
        # The match details are read from each game's events feed. Games whose
        # feed cannot be retrieved are skipped with a warning.
        games = []
        for gamedetails_url in gamedetails_urls:
            try:
                path = os.path.join(self.root, gamedetails_url)
                obj = self.get(path)
                if not isinstance(obj, dict) or "match" not in obj:
                    raise ParseError(f"{path} should contain a match")
                games.append(obj["match"])
            except FileNotFoundError:
                warnings.warn(f"File not found: {gamedetails_url}")
            except HTTPError:
                warnings.warn(f"Resource not found: {gamedetails_url}")
        df_games = _convert_games(pd.DataFrame(games))
        return cast(DataFrame[WyscoutGameSchema], df_games)

    def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        if not isinstance(obj, dict) or "teams" not in obj:
            raise ParseError(f"{path} should contain a list of teams")
        teams = [t["team"] for t in obj["teams"].values() if t.get("team")]
        df_teams = _convert_teams(pd.DataFrame(teams))
        return cast(DataFrame[WyscoutTeamSchema], df_teams)

    def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        if not isinstance(obj, dict) or "players" not in obj:
            raise ParseError(f"{path} should contain a list of players")
        # The minutes played are derived from the lineups and the events, so
        # both must be part of the feed as well.
        if (
            not isinstance(obj.get("match"), dict)
            or "teamsData" not in obj["match"]
            or "events" not in obj
        ):
            raise ParseError(f"{path} should contain the match details and a list of events")
        players = [
            player["player"]
            for team in obj["players"].values()
            for player in team
            if player.get("player")
        ]
        df_players = _convert_players(pd.DataFrame(players).drop_duplicates("wyId"))
        df_players = pd.merge(
            df_players,
            _get_minutes_played(obj["match"]["teamsData"], obj["events"]),
            on="player_id",
            how="right",
        )
        df_players["minutes_played"] = df_players.minutes_played.fillna(0)
        df_players["game_id"] = game_id
        return cast(DataFrame[WyscoutPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        if not isinstance(obj, dict) or "events" not in obj:
            raise ParseError(f"{path} should contain a list of events")
        df_events = _convert_events(pd.DataFrame(obj["events"]))
        return cast(DataFrame[WyscoutEventSchema], df_events)
660
+
661
+
662
+ def _convert_competitions(competitions: pd.DataFrame) -> pd.DataFrame:
663
+ competitionsmapping = {
664
+ "wyId": "competition_id",
665
+ "name": "competition_name",
666
+ "gender": "competition_gender",
667
+ }
668
+ cols = ["competition_id", "competition_name", "country_name", "competition_gender"]
669
+ competitions["country_name"] = competitions.apply(
670
+ lambda x: x.area["name"] if x.area["name"] != "" else "International", axis=1
671
+ )
672
+ competitions = competitions.rename(columns=competitionsmapping)[cols]
673
+ return competitions
674
+
675
+
676
+ def _convert_seasons(seasons: pd.DataFrame) -> pd.DataFrame:
677
+ seasonsmapping = {
678
+ "wyId": "season_id",
679
+ "name": "season_name",
680
+ "competitionId": "competition_id",
681
+ }
682
+ cols = ["season_id", "season_name", "competition_id"]
683
+ seasons = seasons.rename(columns=seasonsmapping)[cols]
684
+ return seasons
685
+
686
+
687
def _convert_games(matches: pd.DataFrame) -> pd.DataFrame:
    """Convert raw Wyscout matches to the game columns used by the loaders.

    The result has the columns ``game_id``, ``competition_id``,
    ``season_id``, ``game_date`` (parsed to datetimes), ``game_day``,
    ``home_team_id`` and ``away_team_id``. The team IDs are looked up in
    each match's ``teamsData`` mapping.
    """
    renamed = matches.rename(
        columns={
            "wyId": "game_id",
            "dateutc": "game_date",
            "competitionId": "competition_id",
            "seasonId": "season_id",
            "gameweek": "game_day",
        }
    )
    games = renamed[["game_id", "competition_id", "season_id", "game_date", "game_day"]]
    games["game_date"] = pd.to_datetime(games["game_date"])
    for side in ("home", "away"):
        # Bind `side` as a default so every lambda keeps its own value.
        games[f"{side}_team_id"] = matches.teamsData.apply(
            lambda teams, side=side: _get_team_id(teams, side)
        )
    return games
701
+
702
+
703
+ def _get_team_id(teamsData: dict[int, Any], side: str) -> int:
704
+ for team_id, data in teamsData.items():
705
+ if data["side"] == side:
706
+ return int(team_id)
707
+ raise ValueError()
708
+
709
+
710
+ def _convert_players(players: pd.DataFrame) -> pd.DataFrame:
711
+ playermapping = {
712
+ "wyId": "player_id",
713
+ "shortName": "nickname",
714
+ "firstName": "firstname",
715
+ "lastName": "lastname",
716
+ "birthDate": "birth_date",
717
+ }
718
+ cols = ["player_id", "nickname", "firstname", "lastname", "birth_date"]
719
+ df_players = players.rename(columns=playermapping)[cols]
720
+ df_players["player_name"] = df_players[["firstname", "lastname"]].agg(" ".join, axis=1)
721
+ df_players["birth_date"] = pd.to_datetime(df_players["birth_date"])
722
+ return df_players
723
+
724
+
725
+ def _convert_teams(teams: pd.DataFrame) -> pd.DataFrame:
726
+ teammapping = {
727
+ "wyId": "team_id",
728
+ "name": "team_name_short",
729
+ "officialName": "team_name",
730
+ }
731
+ cols = ["team_id", "team_name_short", "team_name"]
732
+ return teams.rename(columns=teammapping)[cols]
733
+
734
+
735
def _convert_events(raw_events: pd.DataFrame) -> pd.DataFrame:
    """Convert raw Wyscout events to the event columns used by the loaders.

    Parameters
    ----------
    raw_events : pd.DataFrame
        Raw events with camelCase column names. The input is not modified.
        The columns ``eventId`` and ``subEventId`` are optional; a missing
        column yields an ID of 0 for all events.

    Returns
    -------
    pd.DataFrame
        A dataframe with the columns ``event_id``, ``game_id``,
        ``period_id``, ``milliseconds``, ``team_id``, ``player_id``,
        ``type_id``, ``type_name``, ``subtype_id``, ``subtype_name``,
        ``positions`` and ``tags``.

    Raises
    ------
    KeyError
        If a match period is not listed in ``wyscout_periods`` or if a
        required column is missing.
    """
    eventmapping = {
        "id": "event_id",
        "match_id": "game_id",
        "event_name": "type_name",
        "sub_event_name": "subtype_name",
    }
    cols = [
        "event_id",
        "game_id",
        "period_id",
        "milliseconds",
        "team_id",
        "player_id",
        "type_id",
        "type_name",
        "subtype_id",
        "subtype_name",
        "positions",
        "tags",
    ]
    events = raw_events.copy()
    # Camel case to snake case column names
    pattern = re.compile(r"(?<!^)(?=[A-Z])")
    events.columns = [pattern.sub("_", c).lower() for c in events.columns]
    # Wyscout's (sub) event IDs identify the event *type*. They are moved to
    # `type_id` / `subtype_id`, which frees the name `event_id` for the unique
    # event identifier (`id`) renamed below. Non-numeric IDs become 0.
    for source, target in (("event_id", "type_id"), ("sub_event_id", "subtype_id")):
        if source in events.columns:
            events[target] = (
                pd.to_numeric(events.pop(source), errors="coerce").fillna(0).astype(int)
            )
        else:
            # The previous `else None` fallback could never succeed: it
            # called `.fillna` on a non-Series and then deleted a column that
            # does not exist.
            events[target] = 0
    events["period_id"] = events.match_period.apply(lambda x: wyscout_periods[x])
    events["milliseconds"] = events.event_sec * 1000
    return events.rename(columns=eventmapping)[cols]
780
+
781
+
782
def _get_minutes_played(
    teamsData: list[dict[str, Any]], events: list[dict[str, Any]]
) -> pd.DataFrame:
    """Compute the minutes played by each player that appeared in a game.

    Parameters
    ----------
    teamsData : list of dict
        The data of each team, with a ``teamId`` and a ``formation`` holding
        the ``lineup``, ``bench`` and ``substitutions``. A mapping from team
        ID to team data is accepted as well.
    events : list of dict
        The raw events of the game. Only ``matchPeriod`` and ``eventSec`` are
        read, to determine the duration of each period.

    Returns
    -------
    pd.DataFrame
        One row per starter or used substitute, with the columns
        ``team_id``, ``player_id``, ``jersey_number``, ``minutes_played``
        and ``is_starter``.
    """
    # get duration of each period
    periods_ts = {i: [0] for i in range(6)}
    for e in events:
        period_id = wyscout_periods[e["matchPeriod"]]
        periods_ts[period_id].append(e["eventSec"])
    # Only periods 0-4 are considered, so period 5 ("P" in `wyscout_periods`)
    # does not add to the match duration. Periods without events are dropped.
    periods_duration = [
        round(max(periods_ts[i]) / 60) for i in range(5) if max(periods_ts[i]) != 0
    ]
    # get duration of entire match
    duration = sum(periods_duration)

    # get stats for each player
    playergames: dict[int, dict[str, Any]] = {}
    if isinstance(teamsData, dict):
        teamsData = list(teamsData.values())
    for teamData in teamsData:
        formation = teamData.get("formation", {})
        substitutions = formation.get("substitutions", [])
        # Expanded minute at which each sent-off player left the pitch.
        red_cards = {
            player["playerId"]: _expand_minute(int(player["redCards"]), periods_duration)
            for key in ["bench", "lineup"]
            for player in formation.get(key, [])
            if player["redCards"] != "0"
        }
        pg = {
            player["playerId"]: {
                "team_id": teamData["teamId"],
                "player_id": player["playerId"],
                "jersey_number": player.get("shirtNumber", 0),
                "minutes_played": red_cards.get(player["playerId"], duration),
                "is_starter": True,
            }
            for player in formation.get("lineup", [])
        }

        # correct minutes played for substituted players
        # Expanded minute at which each substitute entered the pitch.
        entered_at: dict[int, int] = {}
        # Missing substitutions are encoded as the string "null"; also
        # tolerate None and an empty list.
        if substitutions and substitutions != "null":
            for substitution in substitutions:
                expanded_minute_sub = _expand_minute(substitution["minute"], periods_duration)
                player_in = substitution["playerIn"]
                player_out = substitution["playerOut"]
                # A substitute plays until the end of the game, or until he
                # is sent off.
                left_at = red_cards.get(player_in, duration)
                pg[player_in] = {
                    "team_id": teamData["teamId"],
                    "player_id": player_in,
                    "jersey_number": next(
                        (
                            p.get("shirtNumber", 0)
                            for p in formation.get("bench", [])
                            if p["playerId"] == player_in
                        ),
                        0,
                    ),
                    "minutes_played": left_at - expanded_minute_sub,
                    "is_starter": False,
                }
                entered_at[player_in] = expanded_minute_sub
                if player_out in pg:
                    # A substitute who is replaced later on only played since
                    # he entered; starters entered at minute 0.
                    pg[player_out]["minutes_played"] = expanded_minute_sub - entered_at.get(
                        player_out, 0
                    )
                else:
                    warnings.warn(
                        f"A player with ID={player_out} was substituted out, "
                        "but is neither in the lineup nor a previous substitute."
                    )

        playergames.update(pg)
    return pd.DataFrame(playergames.values())
847
+
848
+
849
+ wyscout_periods = {"1H": 1, "2H": 2, "E1": 3, "E2": 4, "P": 5}
data/wyscout/schema.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SPADL schema for Wyscout data."""
2
+
3
+ import pandera as pa
4
+ from pandera.typing import DateTime, Object, Series
5
+
6
+ from socceraction.data.schema import (
7
+ CompetitionSchema,
8
+ EventSchema,
9
+ GameSchema,
10
+ PlayerSchema,
11
+ TeamSchema,
12
+ )
13
+
14
+
15
class WyscoutCompetitionSchema(CompetitionSchema):
    """Schema of a dataframe listing competitions and their seasons.

    Adds the Wyscout-specific columns below to the base competition schema.
    """

    # Name of the competition's area
    country_name: Series[str]
    # Gender of the competition
    competition_gender: Series[str]
20
+
21
+
22
class WyscoutGameSchema(GameSchema):
    """Schema of a dataframe listing games; adds no columns to the base game schema."""
24
+
25
+
26
class WyscoutPlayerSchema(PlayerSchema):
    """Definition of a dataframe containing the list of players of a game."""

    # First name of the player
    firstname: Series[str]
    # Last name of the player
    lastname: Series[str]
    # Short name of the player; may be missing
    nickname: Series[str] = pa.Field(nullable=True)
    # Date of birth of the player; may be missing
    birth_date: Series[DateTime] = pa.Field(nullable=True)
33
+
34
+
35
class WyscoutTeamSchema(TeamSchema):
    """Definition of a dataframe containing the list of teams of a game."""

    # Short name of the team
    team_name_short: Series[str]
39
+
40
+
41
class WyscoutEventSchema(EventSchema):
    """Schema of a dataframe holding the event stream of a game.

    Adds the Wyscout-specific columns below to the base event schema.
    """

    # Time of the event within its period, in milliseconds
    milliseconds: Series[float]
    # Numeric ID of the event's subtype
    subtype_id: Series[int]
    # Name of the event's subtype
    subtype_name: Series[str]
    # Raw Wyscout positions of the event
    positions: Series[Object]
    # Raw Wyscout tags of the event
    tags: Series[Object]
docs/_static/custom.css ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a {
2
+ color: #26b079;
3
+ }
4
+
5
+ .toctree-l1 a:active,
6
+ .toctree-l1 a:hover {
7
+ background-color: #676767;
8
+ }
9
+
10
+ .sidebar-logo {
11
+ max-width: 100%;
12
+ }
13
+
14
+ .sidebar-drawer {
15
+ width: calc(50% - 25em);
16
+ min-width: 22em;
17
+ }
18
+
19
+ .sidebar-drawer .sidebar-container {
20
+ width: 23em;
21
+ }
22
+
23
+ li.toctree-l2 {
24
+ font-size: 80%;
25
+ }
26
+
27
+ @media (max-width: 67em) {
28
+ .sidebar-drawer {
29
+ width: 22em;
30
+ left: -22em;
31
+ }
32
+ .sidebar-drawer .sidebar-container {
33
+ width: 22em;
34
+ }
35
+ li.toctree-l2 {
36
+ font-size: 75%;
37
+ }
38
+ }
39
+
40
+ /* autosummary table text */
41
+ article .align-center,
42
+ article .align-default {
43
+ text-align: left;
44
+ }
45
+
46
+ table.dataframe {
47
+ font-size: 80%;
48
+ margin-left: 0;
49
+ margin-right: 0;
50
+ }
docs/_static/decroos19.bibtex ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @inproceedings{Decroos2019actions,
2
+ author = {Decroos, Tom and Bransen, Lotte and Van Haaren, Jan and Davis, Jesse},
3
+ title = {Actions Speak Louder Than Goals: Valuing Player Actions in Soccer},
4
+ booktitle = {Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
5
+ series = {KDD '19},
6
+ year = {2019},
7
+ isbn = {978-1-4503-6201-6},
8
+ location = {Anchorage, AK, USA},
9
+ pages = {1851--1861},
10
+ numpages = {11},
11
+ url = {http://doi.acm.org/10.1145/3292500.3330758},
12
+ doi = {10.1145/3292500.3330758},
13
+ acmid = {3330758},
14
+ publisher = {ACM},
15
+ address = {New York, NY, USA},
16
+ keywords = {event stream data, probabilistic classification, soccer match data, sports analytics, valuing actions},
17
+ }
docs/_static/favicon.png ADDED
docs/_static/logo.ai ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f4add7db11daf2a6f1c77c8d26f84c0a227bea40e9b607f2930d15b75ae99e
3
+ size 153178
docs/_static/logo.png ADDED
docs/_static/logo_white.png ADDED
docs/_static/vanroy20.bibtex ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @conference{vanroy2020,
2
+ author = {Van Roy, Maaike and Robberechts, Pieter and Decroos, Tom and Davis, Jesse},
3
+ title = {Valuing On-the-Ball Actions in Soccer: A Critical Comparison of xT and VAEP},
4
+ series = {AITS},
5
+ booktitle = {Proceedings of the {AAAI}-20 Workshop on Artificial Intelligence in Team Sports},
6
+ publisher = {AI in Team Sports Organising Committee},
7
+ month = {dec},
8
+ year = {2020},
9
+ abstract = {Objectively quantifying a soccer player's contributions within a match is a challenging and crucial task in soccer analytics. Many of the currently available metrics focus on measuring the quality of shots and assists only, although these represent less than 1% of all on-the-ball actions. Most recently, several approaches were proposed to bridge this gap. By valuing how actions increase or decrease the likelihood of yielding a goal, these models are effective tools for quantifying the performances of players for all sorts of actions. However, we lack an understanding of their differences, both conceptually and in practice. Therefore, this paper critically compares two such models: expected threat (xT) and valuing actions by estimating probabilities (VAEP). Both approaches exhibit variety in their design choices, that leads to different top player rankings and major differences in how they value specific actions.},
10
+ keywords = {soccer},
11
+ }
docs/_templates/class.rst ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ fullname | escape | underline}}
2
+
3
+ .. currentmodule:: {{ module }}
4
+
5
+ .. autoclass:: {{ objname }}
6
+
7
+ {% block attributes %}
8
+ {% if attributes %}
9
+ .. rubric:: Attributes
10
+
11
+ .. autosummary::
12
+ :nosignatures:
13
+
14
+ {% for item in attributes %}
15
+ ~{{ name }}.{{ item }}
16
+ {%- endfor %}
17
+
18
+ {% endif %}
19
+ {% endblock %}
20
+
21
+ {% block methods %}
22
+ {% if methods %}
23
+ .. rubric:: Methods
24
+
25
+ .. autosummary::
26
+ :nosignatures:
27
+ :toctree: methods
28
+
29
+ {% for item in methods %}
30
+ {%- if item not in inherited_members %}
31
+ ~{{ name }}.{{ item }}
32
+ {%- endif %}
33
+ {%- endfor %}
34
+ {% endif %}
35
+
36
+ {%- if members and '__call__' in members %}
37
+ ~{{ name }}.__call__
38
+ {%- endif %}
39
+
40
+ {% endblock %}
docs/_templates/module.rst ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. empty
2
+
3
+ {{ fullname | escape | underline }}
4
+
5
+ .. currentmodule:: {{ fullname }}
6
+
7
+ .. automodule:: {{ fullname }}
8
+
9
+ {% block classes %}
10
+
11
+ {% for item in classes %}
12
+ .. autoclass:: {{ item }}
13
+ :members:
14
+ :member-order: bysource
15
+ :show-inheritance:
16
+ :exclude-members:
17
+ {%- endfor %}
18
+
19
+ {% endblock %}
20
+
21
+ {% block functions %}
22
+
23
+ {% for item in functions %}
24
+ .. autofunction:: {{ item }}
25
+ {%- endfor %}
26
+
27
+ {% endblock %}
docs/_templates/schema.rst ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ fullname | escape | underline}}
2
+
3
+ .. currentmodule:: {{ module }}
4
+
5
+ .. autoclass:: {{ objname }}
6
+
7
+ {% block attributes %}
8
+ {% if attributes %}
9
+ .. rubric:: Attributes
10
+
11
+ .. autosummary::
12
+ :nosignatures:
13
+
14
+ {% for item in attributes %}
15
+ ~{{ name }}.{{ item }}
16
+ {%- endfor %}
17
+
18
+ {% endif %}
19
+ {% endblock %}
docs/actions_bra-bel.png ADDED