# npb_data_viz_demo
#
# Running
#
# File size: 8,032 Bytes


# import pandas as pd
import polars as pl
import numpy as np
from gradio_client import Client
from tqdm.auto import tqdm

import os
import re

from translate import translate_pa_outcome, translate_pitch_outcome, jp_pitch_to_en_pitch, jp_pitch_to_pitch_code, translate_pitch_outcome, max_pitch_types

# Load the game table, dropping rows that are exact duplicates.
game_df = pl.read_csv('game.csv').unique()

# Each game_pk must appear exactly once: it is used below to locate the
# per-game pa/ and pitch/ CSV files. Raise explicitly rather than `assert`,
# because assertions are skipped when Python runs with -O.
if len(game_df['game_pk'].unique()) != len(game_df):
    raise ValueError('game.csv contains duplicate game_pk values')

# Load plate-appearance data: one CSV per game under pa/, read with pa_pk as
# a string column and stacked into a single frame.
pa_df = pl.concat([
    pl.read_csv(
        os.path.join('pa', f'{game_pk}.csv'),
        schema_overrides={'pa_pk': str},
    )
    for game_pk in tqdm(game_df['game_pk'])
])

# Load pitch-by-pitch data: one CSV per game under pitch/, stacked into a
# single frame. pa_pk is read as a string and the three base-runner columns
# as Int64 (presumably so the per-game frames concatenate with matching
# dtypes -- confirm).
pitch_df = pl.concat([
    pl.read_csv(
        os.path.join('pitch', f'{game_pk}.csv'),
        schema_overrides={
            'pa_pk': str,
            **dict.fromkeys(('on_1b', 'on_2b', 'on_3b'), pl.Int64),
        },
    )
    for game_pk in tqdm(game_df['game_pk'])
])

# Load the player table.
player_df = pl.read_csv('player.csv')

# Normalise and translate plate-appearance outcomes. The steps run in order,
# each one reading the `des` column produced by the previous step:
#   1. trim whitespace from des / des_more, keeping the trimmed original
#      text in _des;
#   2. where des is null, fall back to des_more;
#   3. where des has more than one space-separated part and contains a
#      '＋<digits>点' marker, keep only the first part;
#   4. where des is one of 'ボール' / '見逃し' / '空振り' or ends with
#      '塁けん制', use des_more instead;
#   5. translate the result with translate_pa_outcome.
_des = pl.col('des')
_des_more = pl.col('des_more')
_des_parts = _des.str.split(' ')
_multi_part_with_runs = (_des_parts.list.len() > 1) & (_des.str.contains(r'＋\d+点'))
_non_home_plate_outcome = (
    _des.is_in(['ボール', '見逃し', '空振り']) | _des.str.ends_with('塁けん制')
)

pa_df = (
    pa_df
    .with_columns(
        _des.str.strip_chars().alias('_des'),
        _des.str.strip_chars(),
        _des_more.str.strip_chars(),
    )
    .with_columns(_des.fill_null(_des_more))
    .with_columns(
        pl.when(_multi_part_with_runs)
        .then(_des_parts.list.first())
        .otherwise(_des)
        .alias('des')
    )
    .with_columns(
        pl.when(_non_home_plate_outcome)
        .then(_des_more)
        .otherwise(_des)
        .alias('des')
    )
    .with_columns(_des.map_elements(translate_pa_outcome, return_dtype=str))
)

# Clean and translate the pitch-level data:
#   * rows without a pitch_name are dropped;
#   * the original pitch name is preserved as jp_pitch_name, then looked up
#     in jp_pitch_to_en_pitch (-> pitch_name) and jp_pitch_to_pitch_code
#     (-> pitch_type);
#   * description keeps only its first space-separated part before being
#     passed through translate_pitch_outcome;
#   * release_speed loses its 'km/h' suffix, with '-' becoming null;
#   * plate_x / plate_z are shifted by fixed offsets.
_jp_pitch_name = pl.col('jp_pitch_name')
_release_speed = pl.col('release_speed')

pitch_df = (
    pitch_df
    .filter(pl.col('pitch_name').is_not_null())
    .with_columns(jp_pitch_name=pl.col('pitch_name'))
    .with_columns(
        pitch_name=_jp_pitch_name.map_elements(
            lambda jp_name: jp_pitch_to_en_pitch[jp_name], return_dtype=str
        ),
        pitch_type=_jp_pitch_name.map_elements(
            lambda jp_name: jp_pitch_to_pitch_code[jp_name], return_dtype=str
        ),
        description=(
            pl.col('description')
            .str.split(' ')
            .list.first()
            .map_elements(translate_pitch_outcome, return_dtype=str)
        ),
        release_speed=(
            pl.when(_release_speed != '-')
            .then(_release_speed.str.strip_suffix('km/h'))
            .otherwise(None)
        ),
        plate_x=(pl.col('plate_x') + 13) - 80,
        plate_z=200 - (pl.col('plate_z') + 13) - 100,
    )
    # The integer cast is a separate step, applied once '-' has become null.
    # NOTE(review): presumably casting inside the then() branch fails because
    # that branch is evaluated for every row, including '-' ones -- confirm.
    .with_columns(_release_speed.cast(int))
)

# Translate player names through the Ramos-Ramos/npb_name_translator Gradio
# app. All names are sent in a single newline-separated request and the
# reply is read back as one translated name per line, in the same order.
client = Client("Ramos-Ramos/npb_name_translator")
en_names = client.predict(
    jp_names='\n'.join(player_df['name'].to_list()),
    api_name="/predict"
)

# The translated names are attached positionally, so the reply must contain
# exactly one line per player. Fail early with a clear message instead of
# relying on polars to reject a mismatched Series (a single-line reply may
# even be broadcast to every row rather than rejected).
en_name_lines = en_names.splitlines()
if len(en_name_lines) != len(player_df):
    raise ValueError(
        f'name translator returned {len(en_name_lines)} names '
        f'for {len(player_df)} players'
    )

player_df = (
    player_df
    .with_columns(
        # keep the original name before overwriting it with the translation
        pl.col('name').alias('jp_name'),
        pl.Series('name', en_name_lines)
    )
    .with_columns(
        # the literal string 'nan' in the reply is stored as null
        pl.when(pl.col('name') == 'nan')
        .then(None)
        .otherwise(pl.col('name'))
        .alias('name')
    )
)

# Combine pitches with their plate appearance (on game_pk + pa_pk) and with
# the pitcher's player row (player_id renamed to pitcher). Both joins are
# inner joins, so rows without a match are dropped. Four boolean flags are
# then derived from the translated pitch description:
#   whiff        - description is 'SS' or 'K'
#   swing        - description is not in the no-swing list below
#   csw          - description is 'SS', 'K', 'LS' or 'inv_K'
#   normal_pitch - description is not one of the irregular events (a guess,
#                  per the original author)
_description = pl.col('description')
_irregular_events = ['obstruction', 'illegal_pitch', 'defensive_interference']
_no_swing_outcomes = ['B', 'BB', 'LS', 'inv_K', 'bunt_K', 'HBP', 'SH', 'SH E', 'SH FC', 'obstruction', 'illegal_pitch', 'defensive_interference']

df = (
    pitch_df
    .join(pa_df, on=['game_pk', 'pa_pk'], how='inner')
    .join(player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
    .with_columns(
        whiff=_description.is_in(['SS', 'K']),
        swing=~_description.is_in(_no_swing_outcomes),
        csw=_description.is_in(['SS', 'K', 'LS', 'inv_K']),
        normal_pitch=~_description.is_in(_irregular_events),
    )
)

# Per-pitcher, per-pitch-type summary:
#   Whiff%   - whiffs / swings * 100, rounded to 1 decimal
#   CSW%     - csw / normal pitches * 100, rounded to 1 decimal
#   Velocity - mean release_speed, rounded to 1 decimal
#   Count    - number of rows in the group
# Rows are ordered by name ascending, then Count descending.
_mean_velocity = pl.col('release_speed').mean().round(1).alias('Velocity')
_whiff_pct = (pl.col('whiff').sum() / pl.col('swing').sum() * 100).round(1)
_csw_pct = (pl.col('csw').sum() / pl.col('normal_pitch').sum() * 100).round(1)

pitch_stats = (
    df
    .group_by('name', 'pitch_name')
    .agg(
        _whiff_pct.alias('Whiff%'),
        _csw_pct.alias('CSW%'),
        _mean_velocity,
        pl.len().alias('Count'),
    )
    .sort(by=['name', 'Count'], descending=[False, True])
)

# League-wide mean velocity for each pitch type.
league_pitch_stats = df.group_by('pitch_name').agg(_mean_velocity)