# Builds per-pitcher, per-pitch-type Whiff% and CSW% tables from scraped
# game / plate-appearance / pitch CSV files, translating the Japanese
# outcome and pitch labels to English along the way.
import os
import re

import numpy as np
import pandas as pd
from gradio_client import Client
from tqdm.auto import tqdm

from translate import (
    translate_pa_outcome,
    translate_pitch_outcome,
    jp_pitch_to_en_pitch,
    jp_pitch_to_pitch_code,
    max_pitch_types,
)

# Matches a "+<digits>点" marker inside a plate-appearance description.
# The plus sign must be escaped/bracketed: a bare leading "+" is an invalid
# regex ("nothing to repeat"). The full-width "＋" is accepted as well.
_RUNS_MARKER_RE = re.compile(r'[+＋]\d+点')

# Pitch descriptions that are excluded from every rate denominator.
_ABNORMAL_PITCH_OUTCOMES = ['obstruction', 'illegal_pitch', 'defensive_interference']

# Pitch descriptions that do NOT count as a swing.
_NON_SWING_OUTCOMES = [
    'B', 'BB', 'LS', 'inv_K', 'bunt_K', 'HBP', 'SH', 'SH E', 'SH FC',
    'obstruction', 'illegal_pitch', 'defensive_interference',
]


def _load_per_game_csvs(directory, game_pks):
    """Read ``<directory>/<game_pk>.csv`` for every game and stack the rows.

    ``pa_pk`` is forced to ``str`` so the key keeps its exact text form.
    The result gets a fresh 0..n-1 index; without it every per-game file
    would contribute the same labels and index-aligned assignments on the
    combined frame would be ambiguous.
    """
    frames = [
        pd.read_csv(os.path.join(directory, f'{game_pk}.csv'), dtype={'pa_pk': str})
        for game_pk in tqdm(game_pks)
    ]
    return pd.concat(frames, ignore_index=True)


def _strip_runs_marker(text):
    """Return only the first token of *text* when it carries a runs marker.

    Descriptions made of several whitespace-separated tokens that contain a
    "+N点" marker are reduced to their first token; anything else is
    returned unchanged.
    """
    tokens = text.split()
    if len(tokens) > 1 and _RUNS_MARKER_RE.search(text):
        return tokens[0]
    return text


def _first_token_if_multiple(text):
    """Return the first whitespace-separated token of a multi-token string."""
    tokens = text.split()
    return tokens[0] if len(tokens) > 1 else text


# load game data
game_df = pd.read_csv('game.csv').drop_duplicates()
# Explicit check instead of ``assert`` so it still runs under ``python -O``.
if len(game_df) != len(game_df['game_pk'].unique()):
    raise ValueError('game.csv has several distinct rows sharing one game_pk')

# load pa data
pa_df = _load_per_game_csvs('pa', game_df['game_pk'])

# load pitch data
pitch_df = _load_per_game_csvs('pitch', game_df['game_pk'])

# load player data
player_df = pd.read_csv('player.csv')

# translate pa data
pa_df['_des'] = pa_df['des'].str.strip()  # keep the untranslated description
pa_df['des'] = pa_df['des'].str.strip()
pa_df['des_more'] = pa_df['des_more'].str.strip()
# Fall back to the detailed description when the short one is missing.
# NOTE(review): rows where both are missing will fail in the next step
# (NaN has no .split) — same as before; confirm that cannot happen.
pa_df['des'] = pa_df['des'].fillna(pa_df['des_more'])
pa_df['des'] = pa_df['des'].apply(_strip_runs_marker)
# Descriptions that are a plain ball / called / swinging label or a pickoff
# attempt (suffix '塁けん制') are replaced by the detailed description.
non_home_plate_outcome = (
    pa_df['des'].isin(['ボール', '見逃し', '空振り'])
    | pa_df['des'].str.endswith('塁けん制')
)
pa_df.loc[non_home_plate_outcome, 'des'] = pa_df.loc[non_home_plate_outcome, 'des_more']
pa_df['des'] = pa_df['des'].apply(translate_pa_outcome)

# translate pitch data
# ``.copy()`` makes the filtered frame independent so the column assignments
# below do not write into a slice of the unfiltered data.
pitch_df = pitch_df[pitch_df['pitch_name'].notna()].copy()
pitch_df['jp_pitch_name'] = pitch_df['pitch_name']
# Direct indexing (not ``.map``) so an unknown pitch name raises KeyError
# instead of silently becoming NaN.
pitch_df['pitch_name'] = pitch_df['jp_pitch_name'].apply(
    lambda pitch_name: jp_pitch_to_en_pitch[pitch_name]
)
pitch_df['pitch_type'] = pitch_df['jp_pitch_name'].apply(
    lambda pitch_name: jp_pitch_to_pitch_code[pitch_name]
)
pitch_df['description'] = pitch_df['description'].apply(_first_token_if_multiple)
pitch_df['description'] = pitch_df['description'].apply(translate_pitch_outcome)
# '-' marks a missing speed; the remaining values look like '<int>km/h'.
pitch_df['release_speed'] = pitch_df['release_speed'].replace('-', np.nan)
has_speed = pitch_df['release_speed'].notna()
pitch_df.loc[has_speed, 'release_speed'] = (
    pitch_df.loc[has_speed, 'release_speed'].str.removesuffix('km/h').astype(int)
)
# NOTE(review): the offsets below presumably re-centre the scraped plot
# coordinates and flip the vertical axis — confirm the meaning of 13/80/200/100.
pitch_df['plate_x'] = (pitch_df['plate_x'] + 13) - 80
pitch_df['plate_z'] = 200 - (pitch_df['plate_z'] + 13) - 100

# translate player data
# All names are sent to the remote translator as one newline-separated
# string; the reply is split back into one name per line.
client = Client("Ramos-Ramos/npb_name_translator")
en_names = client.predict(
    jp_names='\n'.join(player_df.name.tolist()),
    api_name="/predict"
)
player_df['jp_name'] = player_df['name']
player_df['name'] = [name if name != 'nan' else np.nan for name in en_names.splitlines()]

# merge pitch and pa data
df = pd.merge(pitch_df, pa_df, how='inner', on=['game_pk', 'pa_pk'])
# Attach pitcher details by matching player_id against the pitcher column.
df = pd.merge(df, player_df.rename(columns={'player_id': 'pitcher'}), how='inner', on='pitcher')

# per-pitch boolean flags used by the rate stats below
df['whiff'] = df['description'].isin(['SS', 'K'])
df['swing'] = ~df['description'].isin(_NON_SWING_OUTCOMES)
df['csw'] = df['description'].isin(['SS', 'K', 'LS', 'inv_K'])
df['normal_pitch'] = ~df['description'].isin(_ABNORMAL_PITCH_OUTCOMES)

# Whiff% = whiffs / swings, CSW% = csw pitches / normal pitches, both as
# percentages rounded to one decimal, per (pitcher name, pitch name).
by_pitcher_and_pitch = df.groupby(['name', 'pitch_name'])
whiff_rate = (
    by_pitcher_and_pitch['whiff'].sum() / by_pitcher_and_pitch['swing'].sum() * 100
).round(1).rename('Whiff%').reset_index()
csw_rate = (
    by_pitcher_and_pitch['csw'].sum() / by_pitcher_and_pitch['normal_pitch'].sum() * 100
).round(1).rename('CSW%').reset_index()
pitch_stats = pd.merge(
    whiff_rate,
    csw_rate,
    on=['name', 'pitch_name']
).set_index(['name', 'pitch_name'])