import os
from datetime import datetime
from urllib.parse import quote, unquote

import numpy as np
import pandas as pd

class ChampionConverter:
    """Bidirectional mapping between champion names and 1-based numeric IDs."""

    def __init__(self):
        self.champions = [
            "Aatrox", "Ahri", "Akali", "Akshan", "Alistar", "Ambessa", "Amumu", "Anivia", "Annie", "Aphelios", "Ashe", "Aurelion Sol",
            "Aurora", "Azir", "Bard", "Bel'Veth", "Blitzcrank", "Brand", "Braum", "Briar", "Caitlyn", "Camille", "Cassiopeia", "Cho'Gath",
            "Corki", "Darius", "Diana", "Dr. Mundo", "Draven", "Ekko", "Elise", "Evelynn", "Ezreal", "Fiddlesticks", "Fiora", "Fizz", "Galio",
            "Gangplank", "Garen", "Gnar", "Gragas", "Graves", "Gwen", "Hecarim", "Heimerdinger", "Hwei", "Illaoi", "Irelia", "Ivern", "Janna",
            "Jarvan IV", "Jax", "Jayce", "Jhin", "Jinx", "K'Sante", "Kai'Sa", "Kalista", "Karma", "Karthus", "Kassadin", "Katarina", "Kayle",
            "Kayn", "Kennen", "Kha'Zix", "Kindred", "Kled", "Kog'Maw", "LeBlanc", "Lee Sin", "Leona", "Lillia", "Lissandra", "Lucian", "Lulu",
            "Lux", "Malphite", "Malzahar", "Maokai", "Master Yi", "Milio", "Miss Fortune", "Mordekaiser", "Morgana", "Naafiri", "Nami", "Nasus",
            "Nautilus", "Neeko", "Nidalee", "Nilah", "Nocturne", "Nunu & Willump", "Olaf", "Orianna", "Ornn", "Pantheon", "Poppy", "Pyke",
            "Qiyana", "Quinn", "Rakan", "Rammus", "Rek'Sai", "Rell", "Renata Glasc", "Renekton", "Rengar", "Riven", "Rumble", "Ryze", "Samira",
            "Sejuani", "Senna", "Seraphine", "Sett", "Shaco", "Shen", "Shyvana", "Singed", "Sion", "Sivir", "Skarner", "Smolder", "Sona",
            "Soraka", "Swain", "Sylas", "Syndra", "Tahm Kench", "Taliyah", "Talon", "Taric", "Teemo", "Thresh", "Tristana", "Trundle",
            "Tryndamere", "Twisted Fate", "Twitch", "Udyr", "Urgot", "Varus", "Vayne", "Veigar", "Vel'Koz", "Vex", "Vi", "Viego", "Viktor",
            "Vladimir", "Volibear", "Warwick", "Wukong", "Xayah", "Xerath", "Xin Zhao", "Yasuo", "Yone", "Yorick", "Yuumi", "Zac", "Zed",
            "Zeri", "Ziggs", "Zilean", "Zoe", "Zyra"
        ]

        # Lookup tables in both directions, numbered from 1.
        self.champion_to_number = {champion: i for i, champion in enumerate(self.champions, start=1)}
        self.number_to_champion = {i: champion for i, champion in enumerate(self.champions, start=1)}

    def champion_to_num(self, champion_name):
        """Return the numeric ID for a champion name, or None if unknown."""
        return self.champion_to_number.get(champion_name, None)

    def num_to_champion(self, number):
        """Return the champion name for a numeric ID, or None if unknown."""
        return self.number_to_champion.get(number, None)
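
# Minimal usage sketch (illustrative values; IDs follow the list order above):
#
#     converter = ChampionConverter()
#     converter.champion_to_num("Aatrox")   # -> 1
#     converter.num_to_champion(2)          # -> "Ahri"
#     converter.champion_to_num("Unknown")  # -> None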

def convert_date(date_str):
    """Convert a datetime string to a Unix timestamp (seconds); return None on failure."""
    try:
        if pd.isna(date_str):
            return None
        return pd.to_datetime(date_str).timestamp()
    except Exception:
        return None
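
# Illustrative calls (the exact value depends on how pandas treats the naive string):
#
#     convert_date("2024-01-01 00:00:00")  # -> float Unix timestamp, e.g. 1704067200.0
#     convert_date(None)                   # -> None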

def convert_to_minutes(time_str):
    """Convert a time string (e.g. '15m 10s') to minutes as a float; return 0.0 on failure."""
    try:
        minutes = seconds = 0
        parts = time_str.lower().split()
        for part in parts:
            if 'm' in part:
                minutes = float(part.replace('m', ''))
            elif 's' in part:
                seconds = float(part.replace('s', ''))
        return round(minutes + seconds/60, 2)
    except Exception:
        return 0.0
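
# Illustrative calls:
#
#     convert_to_minutes("15m 10s")  # -> 15.17
#     convert_to_minutes("45s")      # -> 0.75
#     convert_to_minutes(None)       # -> 0.0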

def convert_percentage_to_decimal(percentage_str):
    """Convert a percentage string (e.g. 'P/Kill 43%') to a decimal (0.43); return 0.0 on failure."""
    try:
        # Keep only the digits, so whole-number percentages are expected (e.g. '43%' -> 0.43).
        num = float(''.join(filter(str.isdigit, percentage_str))) / 100
        return round(num, 2)
    except Exception:
        return 0.0
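
# Illustrative calls (the digit-only parse assumes integer percentages):
#
#     convert_percentage_to_decimal("P/Kill 43%")  # -> 0.43
#     convert_percentage_to_decimal("60%")         # -> 0.6
#     convert_percentage_to_decimal("")            # -> 0.0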

def convert_tier_to_number(tier_str):
    """
    Convert a tier string to a number:
        Challenger  -> 1
        Grandmaster -> 2
        Master      -> 3
        Others      -> 4
    """
    tier_map = {
        'challenger': 1,
        'grandmaster': 2,
        'master': 3
    }
    return tier_map.get(tier_str.lower().strip(), 4)

def convert_result_to_binary(result_str):
    """
    Convert a match result to binary:
        Victory -> 1
        Defeat  -> 0 (anything other than 'Victory' maps to 0)
    """
    return 1 if result_str.lower().strip() == 'victory' else 0
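
# Illustrative calls for the two categorical converters above:
#
#     convert_tier_to_number("Challenger ")  # -> 1
#     convert_tier_to_number("Diamond")      # -> 4
#     convert_result_to_binary("Victory")    # -> 1
#     convert_result_to_binary("Defeat")     # -> 0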

def merge_stats(recent_stats, player_stats, current_time=None):
    """
    Merge recent match stats with player profile stats and save to CSV.
    Only keeps rows where matches exist in both DataFrames.

    Args:
        recent_stats (DataFrame/dict): Recent match statistics
        player_stats (DataFrame/tuple): Player profile statistics
        current_time (str, optional): Date string used in the output filename;
            defaults to today's UTC date ("YYYY-MM-DD")

    Returns:
        DataFrame: Combined statistics, or None if the merge fails
    """
    try:
        if current_time is None:
            current_time = datetime.utcnow().strftime("%Y-%m-%d")

        # Normalise recent_stats into a DataFrame.
        if not isinstance(recent_stats, pd.DataFrame):
            recent_df = pd.DataFrame(recent_stats)
        else:
            recent_df = recent_stats

        # player_stats may arrive as a (DataFrame, ...) tuple or a plain DataFrame.
        if isinstance(player_stats, tuple):
            player_df = player_stats[0]
        elif isinstance(player_stats, pd.DataFrame):
            player_df = player_stats
        else:
            raise ValueError("Invalid player_stats format")

        # Backfill player_id on the match rows if the scraper did not attach it.
        if 'player_id' not in recent_df.columns:
            recent_df['player_id'] = player_df['player_id'].iloc[0]

        print("\nBefore merge:")
        print(f"Recent stats rows: {len(recent_df)}")
        print(f"Player stats rows: {len(player_df)}")
        print(f"Unique players in recent stats: {recent_df['player_id'].nunique()}")
        print(f"Unique players in player stats: {player_df['player_id'].nunique()}")

        # Inner join keeps only players present in both frames.
        merged_df = pd.merge(
            recent_df,
            player_df,
            on='player_id',
            how='inner',
            suffixes=('', '_profile')
        )

        print("\nAfter merge:")
        print(f"Merged stats rows: {len(merged_df)}")
        print(f"Unique players in merged stats: {merged_df['player_id'].nunique()}")

        # Put player_id first and region second for readability.
        cols = merged_df.columns.tolist()
        cols = ['player_id'] + [col for col in cols if col != 'player_id']
        if 'region' in cols:
            cols.remove('region')
            cols.insert(1, 'region')
        merged_df = merged_df[cols]

        save_dir = "util/data"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"player_stats_merged_{current_time}.csv"
        filepath = os.path.join(save_dir, filename)
        merged_df.to_csv(filepath, index=False)
        print(f"\nSuccessfully saved merged stats to {filepath}")

        return merged_df

    except Exception as e:
        print(f"Error in merge_stats: {e}")
        return None
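
# Minimal usage sketch (assumed toy frames; real inputs come from the scrapers):
#
#     recent = pd.DataFrame({"player_id": ["p1", "p1"], "champion": ["Ahri", "Jinx"]})
#     profile = pd.DataFrame({"player_id": ["p1"], "region": ["kr"], "win_rate": [0.55]})
#     merged = merge_stats(recent, profile, current_time="2024-01-01")
#     # -> 2 rows, columns reordered to player_id, region, ...,
#     #    saved to util/data/player_stats_merged_2024-01-01.csv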

def filter_leaderboard(df, tiers=None):
    """
    Filter a leaderboard DataFrame to keep only specific tiers.

    Args:
        df (pandas.DataFrame): Input leaderboard DataFrame
        tiers (list): List of tiers to keep. Defaults to ["CHALLENGER", "GRANDMASTER"]

    Returns:
        pandas.DataFrame: Filtered leaderboard data, or None on error
    """
    try:
        if tiers is None:
            tiers = ["CHALLENGER", "GRANDMASTER"]

        # Normalise requested tiers to upper case for comparison.
        tiers = [tier.upper() for tier in tiers]

        required_cols = ["tier", "summoner", "region"]
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"DataFrame must contain columns: {required_cols}")

        filtered_df = df.copy()

        # Normalise the tier column, then keep only the requested tiers.
        filtered_df['tier'] = filtered_df['tier'].str.upper()
        filtered_df = filtered_df[filtered_df['tier'].isin(tiers)]

        # Sorting also expects a 'rank' column in the input.
        filtered_df = filtered_df.sort_values(['region', 'tier', 'rank'])
        filtered_df = filtered_df.reset_index(drop=True)

        output_file = os.path.join("util", "data", "lb_filtered.csv")
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        filtered_df.to_csv(output_file, index=False)

        print(f"\nFiltered leaderboard to {len(tiers)} tiers: {', '.join(tiers)}")
        print(f"Remaining entries: {len(filtered_df)}")
        print(f"Saved filtered leaderboard to {output_file}")

        print("\nSummary by region and tier:")
        summary = filtered_df.groupby(['region', 'tier']).size().unstack(fill_value=0)
        print(summary)

        return filtered_df

    except Exception as e:
        print(f"Error filtering leaderboard: {e}")
        return None
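
# Minimal usage sketch (assumed toy leaderboard; the real one comes from the scraper):
#
#     lb = pd.DataFrame({
#         "summoner": ["a", "b", "c"],
#         "region": ["kr", "kr", "na"],
#         "tier": ["Challenger", "Master", "Grandmaster"],
#         "rank": [1, 2, 1],
#     })
#     filter_leaderboard(lb)                    # keeps the Challenger/Grandmaster rows
#     filter_leaderboard(lb, tiers=["master"])  # tier matching is case-insensitive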

def format_summoner_name(summoner):
    """
    Format a summoner name for URL usage.

    Parameters:
        summoner: str - Original summoner name

    Returns:
        str - Formatted summoner name
    """
    if not summoner:
        raise ValueError("Summoner name cannot be empty")

    summoner = summoner.strip()

    # Replace spaces and the riot-tag separator '#' with dashes, then percent-encode.
    formatted_summoner = summoner.replace(" ", "-").replace("#", "-")
    formatted_summoner = quote(formatted_summoner)

    return formatted_summoner
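
# Illustrative calls:
#
#     format_summoner_name("toplane kid#EUW77")  # -> "toplane-kid-EUW77"
#     format_summoner_name("Walid Georgey#EUW")  # -> "Walid-Georgey-EUW"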

def convert_to_displayname(name):
    """
    Convert a summoner name to display format.

    Examples:
        marthinsurya-NA      -> marthinsurya #NA
        toplane%20kid-EUW77  -> toplane kid #EUW77
        Walid-Georgey-EUW    -> Walid Georgey #EUW
        Current%20User-KR    -> Current User #KR
    """
    try:
        if not name:
            return ""

        # Undo URL encoding and drop any trailing dash.
        decoded = unquote(name)
        decoded = decoded.rstrip('-')

        # The last dash-separated token is treated as the region tag.
        if '-' in decoded:
            parts = decoded.rsplit('-', 1)
            base_name = parts[0]
            region = parts[1]

            # Remaining dashes inside the base name were originally spaces.
            base_name = base_name.replace('-', ' ')
            base_name = ' '.join(filter(None, base_name.split()))

            return f"{base_name} #{region}"

        return decoded.replace('-', ' ')
    except Exception as e:
        print(f"Error converting name '{name}': {e}")
        return name
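
# Round-trip sketch with the URL formatter above (examples taken from the docstring):
#
#     encoded = format_summoner_name("toplane kid#EUW77")  # "toplane-kid-EUW77"
#     convert_to_displayname(encoded)                      # -> "toplane kid #EUW77"
#     convert_to_displayname("marthinsurya-NA")            # -> "marthinsurya #NA"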

def get_player_list(leaderboard=None):
    """
    Convert leaderboard data into the player list format used for API calls.

    Args:
        leaderboard (DataFrame): Leaderboard DataFrame containing summoner and region;
            if None, the filtered leaderboard CSV is loaded from util/data.

    Returns:
        DataFrame: Player list with region and username columns, or None on error
    """
    try:
        if leaderboard is None:
            leaderboard_file = os.path.join("util", "data", "lb_filtered.csv")
            leaderboard = pd.read_csv(leaderboard_file)

        # Downstream calls expect a 'username' column rather than 'summoner'.
        leaderboard = leaderboard.rename(columns={'summoner': 'username'})
        player_list = leaderboard[['region', 'username']]

        print(f"Successfully processed {len(player_list)} players")
        return player_list

    except Exception as e:
        print(f"Error processing leaderboard: {e}")
        return None

def process_kda_perfect(df):
    """
    Process KDA values in the DataFrame, replacing 'Perfect' (no deaths) with numeric values.
    """
    try:
        df = df.copy()

        def safe_convert(x):
            # 'Perfect' KDA (zero deaths) is capped at 6; anything unparsable becomes None.
            if isinstance(x, (int, float)):
                return x
            if isinstance(x, str) and x.lower() == 'perfect':
                return 6
            try:
                return float(x)
            except (TypeError, ValueError):
                return None

        # Recent-champion KDA columns: simple scalar replacement.
        for col in ['KDA_1', 'KDA_2', 'KDA_3']:
            if col in df.columns:
                df[col] = df[col].apply(safe_convert)

        # Seasonal KDA columns: prefer kills + assists when those columns exist, else fall back to 6.
        for i in range(1, 8):
            col = f'kda_ssn_{i}'
            if col in df.columns:
                perfect_mask = df[col].astype(str).str.contains('perfect', case=False)
                if perfect_mask.any():
                    kills_col, assists_col = f'k_ssn_{i}', f'a_ssn_{i}'
                    if kills_col in df.columns and assists_col in df.columns:
                        df.loc[perfect_mask, col] = df.loc[perfect_mask].apply(
                            lambda row: pd.to_numeric(row[kills_col], errors='coerce') +
                                        pd.to_numeric(row[assists_col], errors='coerce'),
                            axis=1
                        )
                    else:
                        df.loc[perfect_mask, col] = 6
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Profile-level KDA ratio: use average kills + assists as the replacement.
        if 'kda_ratio_profile' in df.columns:
            perfect_mask = df['kda_ratio_profile'].astype(str).str.contains('perfect', case=False)
            if perfect_mask.any():
                df.loc[perfect_mask, 'kda_ratio_profile'] = df.loc[perfect_mask].apply(
                    lambda row: pd.to_numeric(row['avg_kills'], errors='coerce') +
                                pd.to_numeric(row['avg_assists'], errors='coerce'),
                    axis=1
                )
            df['kda_ratio_profile'] = pd.to_numeric(df['kda_ratio_profile'], errors='coerce')

        # Any other kda_ratio-style columns: derive the matching kills/assists column names from the prefix.
        other_cols = [col for col in df.columns if 'kda_ratio' in col.lower()
                      and col != 'kda_ratio_profile'
                      and col not in [f'kda_ssn_{i}' for i in range(1, 8)]]

        for col in other_cols:
            perfect_mask = df[col].astype(str).str.contains('perfect', case=False)
            if perfect_mask.any():
                prefix = col.split('kda_ratio')[0]
                kills_col, assists_col = f"{prefix}kills", f"{prefix}assists"
                if kills_col in df.columns and assists_col in df.columns:
                    df.loc[perfect_mask, col] = df.loc[perfect_mask].apply(
                        lambda row: pd.to_numeric(row[kills_col], errors='coerce') +
                                    pd.to_numeric(row[assists_col], errors='coerce'),
                        axis=1
                    )
                else:
                    df.loc[perfect_mask, col] = 6
            df[col] = pd.to_numeric(df[col], errors='coerce')

        return df

    except Exception as e:
        print(f"Error in process_kda_perfect: {str(e)}")
        return df

def check_mixed_types(df):
    """
    Check and print DataFrame column types, inconsistencies, and basic statistics.
    """
    dtype_info = pd.DataFrame({
        'dtype': df.dtypes,
        'non_null': df.count(),
        'null_count': df.isnull().sum(),
        'unique_values': [df[col].nunique() for col in df.columns]
    })

    # Up to three non-null sample values per column.
    dtype_info['sample_values'] = [df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist()
                                   if len(df[col].dropna()) > 0 else []
                                   for col in df.columns]

    # Object columns holding more than one Python type are the usual source of dtype surprises.
    mixed_type_cols = []
    for col in df.select_dtypes(include=['object']):
        types = df[col].apply(type).unique()
        if len(types) > 1:
            mixed_type_cols.append({
                'column': col,
                'types': [t.__name__ for t in types],
                'samples': df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist()
            })

    print("=== DataFrame Overview ===")
    print(f"Shape: {df.shape}")
    print("\n=== Data Types Summary ===")
    print(df.dtypes.value_counts())

    if mixed_type_cols:
        print("\n=== Mixed Type Columns ===")
        for col_info in mixed_type_cols:
            print(f"\nColumn: {col_info['column']}")
            print(f"Types found: {col_info['types']}")
            print(f"Sample values: {col_info['samples']}")

    return dtype_info

def check_nan_float(df, column_name):
    """Report whether the float values in a column are exactly its NaN values."""
    float_mask = df[column_name].apply(lambda x: isinstance(x, float))
    is_nan_mask = df[column_name].isna()

    all_floats_are_nan = (float_mask == is_nan_mask).all()
    print(f"Are all float values NaN? {all_floats_are_nan}")

    print(f"Number of float values: {float_mask.sum()}")
    print(f"Number of NaN values: {is_nan_mask.sum()}")

def convert_team_colors(df):
    """
    Convert 'team' column values from 'blue'/'red' to 1/2.

    Parameters:
        df (pandas.DataFrame): Input DataFrame with 'team' column

    Returns:
        pandas.DataFrame: DataFrame with converted team values
    """
    df = df.copy()

    if 'team' not in df.columns:
        raise ValueError("Column 'team' not found in DataFrame")

    team_mapping = {
        'blue': 1,
        'red': 2
    }

    df['team'] = df['team'].map(team_mapping, na_action='ignore')

    return df
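
# Illustrative call (values outside the mapping become NaN because of .map):
#
#     convert_team_colors(pd.DataFrame({"team": ["blue", "red"]}))["team"].tolist()
#     # -> [1, 2]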

def convert_region(df):
    """
    Convert 'region' column values to numeric:
        kr  -> 1
        euw -> 2
        vn  -> 3
        na  -> 4

    Parameters:
        df (pandas.DataFrame): Input DataFrame with 'region' column

    Returns:
        pandas.DataFrame: DataFrame with converted region values
    """
    df = df.copy()

    if 'region' not in df.columns:
        raise ValueError("Column 'region' not found in DataFrame")

    region_mapping = {
        'kr': 1,
        'euw': 2,
        'vn': 3,
        'na': 4
    }

    df['region'] = df['region'].map(region_mapping, na_action='ignore')

    return df
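
# Illustrative call (regions outside the mapping, or differently cased values, become NaN):
#
#     convert_region(pd.DataFrame({"region": ["kr", "na", "euw"]}))["region"].tolist()
#     # -> [1, 4, 2]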

def convert_champion_columns(df):
    """
    Convert all champion-related columns to numbers using ChampionConverter.

    Parameters:
        df (pandas.DataFrame): Input DataFrame

    Returns:
        pandas.DataFrame: DataFrame with converted champion values
    """
    df = df.copy()

    converter = ChampionConverter()

    # Any column whose name contains 'champ' is treated as a champion-name column.
    champion_columns = [col for col in df.columns if 'champ' in col.lower()]

    for col in champion_columns:
        df[col] = df[col].map(converter.champion_to_num, na_action='ignore')

    return df

def convert_date_column(df):
    """
    Convert the 'date' column from string format to Unix timestamps.
    Handles missing values (NaT, None, NaN).

    Parameters:
        df (pandas.DataFrame): Input DataFrame with 'date' column

    Returns:
        pandas.DataFrame: DataFrame with converted date values
    """
    df = df.copy()

    if 'date' not in df.columns:
        raise ValueError("Column 'date' not found in DataFrame")

    df['date'] = df['date'].apply(convert_date)

    return df

def convert_role_columns(df):
    """
    Convert role columns to numbers:
        TOP -> 1, MID -> 2, ADC -> 3, JUNGLE -> 4, SUPPORT -> 5

    Parameters:
        df (pandas.DataFrame): Input DataFrame

    Returns:
        pandas.DataFrame: DataFrame with converted role values
    """
    df = df.copy()

    role_mapping = {
        'TOP': 1,
        'MID': 2,
        'ADC': 3,
        'JUNGLE': 4,
        'SUPPORT': 5
    }

    # Only the two "most played role" columns are converted here.
    role_columns = ['most_role_1', 'most_role_2']

    for col in role_columns:
        if col in df.columns:
            df[col] = df[col].map(role_mapping, na_action='ignore')
        else:
            print(f"Warning: Column {col} not found in DataFrame")

    return df

def convert_id_columns(df):
    """
    Drop ID-related columns (player_id, region_profile, teammates1-4, oppmates1-5).

    Parameters:
        df (pandas.DataFrame): Input DataFrame

    Returns:
        pandas.DataFrame: DataFrame with ID columns dropped
    """
    df = df.copy()

    id_columns = (
        ['player_id', 'region_profile'] +
        [f'teammates{i}' for i in range(1, 5)] +
        [f'oppmates{i}' for i in range(1, 6)]
    )

    # Only drop the ID columns that actually exist, and report any that are missing.
    existing_columns = [col for col in id_columns if col in df.columns]
    if len(existing_columns) != len(id_columns):
        missing = set(id_columns) - set(existing_columns)
        print(f"Note: Some columns were not found in DataFrame: {missing}")

    df = df.drop(columns=existing_columns)

    return df

def remove_match_stats(df):
    """
    Remove match-specific statistics to prevent future data leakage.

    Parameters:
        df (pandas.DataFrame): Input DataFrame

    Returns:
        pandas.DataFrame: DataFrame with match-specific columns removed
    """
    # Per-match outcome and performance columns that would leak the label.
    match_stat_columns = [
        'level',
        'result',
        'match_length_mins',
        'kill',
        'death',
        'assist',
        'kda_ratio',
        'kill_participation',
        'laning',
        'cs',
        'cs_per_min'
    ]

    df_clean = df.copy()

    # Drop only the columns that are present.
    columns_to_drop = [col for col in match_stat_columns if col in df_clean.columns]
    df_clean = df_clean.drop(columns=columns_to_drop)

    return df_clean

def convert_df(df):
    """
    Master function to handle all conversions for the training DataFrame.

    Includes:
        - Team color conversion (blue/red to 1/2)
        - Region conversion (kr/euw/vn/na to 1/2/3/4)
        - Champion conversion (champion names to numbers)
        - Date conversion (string to Unix timestamp)
        - Role conversion (TOP/MID/ADC/JUNGLE/SUPPORT to 1/2/3/4/5)
        - Dropping ID columns (player_id, teammates1-4, oppmates1-5, region_profile)
        - Removing match-specific stats

    Parameters:
        df (pandas.DataFrame): Input training DataFrame

    Returns:
        pandas.DataFrame: Processed DataFrame with all conversions applied
    """
    df = df.copy()

    # Rows without a champion label are unusable for training.
    initial_rows = len(df)
    df = df.dropna(subset=['champion'])
    rows_dropped = initial_rows - len(df)
    print(f"Dropped {rows_dropped} rows with NA champion values")

    # Conversions are applied in order; each takes and returns a DataFrame.
    conversions = [
        convert_team_colors,
        convert_region,
        convert_champion_columns,
        convert_date_column,
        convert_role_columns,
        convert_id_columns,
        remove_match_stats
    ]

    for convert_func in conversions:
        try:
            print(f"Applying {convert_func.__name__}...")
            df = convert_func(df)
        except Exception as e:
            print(f"Error in {convert_func.__name__}: {str(e)}")
            raise

    return df
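
# Typical usage sketch (assumed file name; the merged CSV comes from merge_stats above):
#
#     raw = pd.read_csv("util/data/player_stats_merged_2024-01-01.csv")
#     raw = process_kda_perfect(raw)
#     train_df = convert_df(raw)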

def get_top_champion_scores(df, n=5):
    """
    Get the top n champion scores per row from the wide per-champion score columns.

    Parameters:
        df: pandas DataFrame containing champion score columns ('Aatrox' ... 'Zyra')
        n: number of top champions to return (default 5)

    Returns:
        pandas DataFrame with the original data plus the top n champion scores and their
        names (names encoded as champion numbers, -1 where missing)
    """
    try:
        converter = ChampionConverter()
        df = df.copy()

        # The per-champion score columns are assumed to be contiguous, from 'Aatrox' to 'Zyra'.
        champion_start = df.columns.get_loc('Aatrox')
        champion_end = df.columns.get_loc('Zyra') + 1
        champion_cols = df.columns[champion_start:champion_end]

        champion_scores = df[champion_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

        # For every row, the n highest-scoring champions and their scores.
        top_n_indices = champion_scores.apply(lambda x: pd.Series(x.nlargest(n).index), axis=1)
        top_n_values = champion_scores.apply(lambda x: pd.Series(x.nlargest(n).values), axis=1)

        for i in range(n):
            df[f'{i+1}_champ_score'] = top_n_values.iloc[:, i].astype(float)

            # Store the champion as its numeric ID; -1 marks a missing value.
            champ_names = top_n_indices.iloc[:, i]
            df[f'{i+1}_champ_name'] = champ_names.map(
                lambda x: int(converter.champion_to_num(x)) if pd.notnull(x) else -1
            )

        return df

    except Exception as e:
        print(f"Error in get_top_champion_scores: {str(e)}")
        # Fall back to neutral values so downstream steps still find the expected columns.
        for i in range(1, n + 1):
            df[f'{i}_champ_score'] = 0.0
            df[f'{i}_champ_name'] = -1
        return df
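
# Minimal sketch (toy frame with only three champion columns; the real frame spans Aatrox..Zyra):
#
#     toy = pd.DataFrame({"Aatrox": [3.0], "Ahri": [1.0], "Zyra": [2.0]})
#     out = get_top_champion_scores(toy, n=2)
#     # out["1_champ_score"] -> 3.0, out["1_champ_name"] -> 1 (Aatrox)
#     # out["2_champ_score"] -> 2.0, out["2_champ_name"] -> ChampionConverter().champion_to_num("Zyra")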

def check_datatypes(df):
    """Print and return a summary of column dtypes and unique-value counts."""
    datatype = pd.DataFrame({
        'dtype': df.dtypes,
        'unique_values': df.nunique()
    })

    print(datatype)
    return datatype

def calculate_champ_variety_score(df):
    """Count the distinct champions across the recent and 7-day most-played columns."""
    df = df.copy()

    champ_columns = [
        'most_champ_1', 'most_champ_2', 'most_champ_3',
        '7d_champ_1', '7d_champ_2', '7d_champ_3'
    ]

    # Use whichever of these columns are present.
    existing_columns = [col for col in champ_columns if col in df.columns]

    def count_unique_champions(row):
        valid_champions = row[existing_columns].dropna()
        return len(set(valid_champions))

    df['champ_variety_score'] = df.apply(count_unique_champions, axis=1)

    return df

def calculate_playstyle(df):
    """Assign a coarse playstyle code (0-4, default 5) from the profile averages."""
    df = df.copy()

    conditions = [
        # 0: more kills than assists, high KDA, high kill participation
        (df['avg_kills'] > df['avg_assists']) &
        (df['kda_ratio_profile'] > 3) &
        (df['kill_participation_profile'] > 0.6),

        # 1: more assists than kills, solid KDA, high kill participation
        (df['avg_assists'] > df['avg_kills']) &
        (df['kda_ratio_profile'] > 2.5) &
        (df['kill_participation_profile'] > 0.55),

        # 2: assist-leaning with relatively many deaths and decent participation
        (df['avg_deaths'] > 3) &
        (df['avg_assists'] > df['avg_kills']) &
        (df['kill_participation_profile'] > 0.5),

        # 3: low kill participation but efficient (high KDA)
        (df['kill_participation_profile'] < 0.5) &
        (df['kda_ratio_profile'] > 2),

        # 4: high kills and deaths with high kill participation
        (df['avg_kills'] > 3) &
        (df['avg_deaths'] > 4) &
        (df['kill_participation_profile'] > 0.55)
    ]

    # First matching condition wins; rows matching none get 5.
    values = [0, 1, 2, 3, 4]
    df['playstyle'] = np.select(conditions, values, default=5)

    return df

def get_most_role_3(df):
    """Derive the third-most-played role and its play-rate from the per-role columns."""
    df = df.copy()

    role_mapping = {
        'TOP': 1,
        'MID': 2,
        'ADC': 3,
        'JUNGLE': 4,
        'SUPPORT': 5
    }

    def get_third_role_info(row):
        # All five per-role play-rates for this player.
        role_values = {
            'TOP': row['TOP'],
            'JUNGLE': row['JUNGLE'],
            'MID': row['MID'],
            'ADC': row['ADC'],
            'SUPPORT': row['SUPPORT']
        }

        # Remove the two roles already captured as most_role_1/most_role_2.
        role_values.pop(row['most_role_1'], None)
        role_values.pop(row['most_role_2'], None)

        # The best remaining role is the third role.
        if role_values:
            third_role, third_value = max(role_values.items(), key=lambda x: x[1])
            return role_mapping[third_role], third_value
        return 0, 0.0

    df[['most_role_3', 'most_role_3_value']] = df.apply(get_third_role_info, axis=1, result_type='expand')

    return df

def calculate_role_specialization(df):
    """Assign a role-specialization code (0-4, default 5) from the role play-rate columns."""
    df = df.copy()

    conditions = [
        # 0: one dominant role (> 60% of games)
        (df['most_role_1_value'] > 0.6),

        # 1: two main roles
        (df['most_role_1_value'] <= 0.6) &
        (df['most_role_2_value'] >= 0.3),

        # 2: a lead role plus a meaningful third role
        (df['most_role_1_value'] <= 0.6) &
        (df['most_role_2_value'] < 0.3) &
        (df['most_role_1_value'] > 0.3) &
        (df['most_role_3_value'] > 0.1),

        # 3: a lead role with little play elsewhere
        (df['most_role_1_value'] <= 0.6) &
        (df['most_role_2_value'] < 0.3) &
        (df['most_role_1_value'] > 0.3) &
        (df['most_role_3_value'] <= 0.1),

        # 4: play spread thinly across at least three roles
        (df['most_role_1_value'] <= 0.3) &
        (df['most_role_1_value'] > 0) &
        (df['most_role_3_value'] >= 0.15)
    ]

    # First matching condition wins; rows matching none get 5.
    values = [0, 1, 2, 3, 4]
    df['role_specialization'] = np.select(conditions, values, default=5)

    return df

def calculate_champion_loyalty(df):
    """Score how much a player's recent champion pool overlaps with their season pool."""
    df = df.copy()

    def get_loyalty_scores(row):
        try:
            # Recent most-played champions (up to two).
            recent_champs = [
                row['most_champ_1'] if pd.notna(row['most_champ_1']) else None,
                row['most_champ_2'] if pd.notna(row['most_champ_2']) else None
            ]

            # Season champions and the number of games on each.
            season_champs = []
            season_games = []
            for i in range(1, 8):
                champ = row[f'season_champ_{i}'] if pd.notna(row[f'season_champ_{i}']) else None
                games = row[f'games_ssn_{i}'] if pd.notna(row[f'games_ssn_{i}']) else 0
                if champ is not None:
                    season_champs.append(champ)
                    season_games.append(games)

            # Simple flags: is each recent champion also a season champion?
            champ_loyalty_flags = {
                'recent_champ_1_loyal': 1 if (pd.notna(row['most_champ_1']) and
                                              row['most_champ_1'] in season_champs) else 0,
                'recent_champ_2_loyal': 1 if (pd.notna(row['most_champ_2']) and
                                              row['most_champ_2'] in season_champs) else 0
            }

            recent_champs = [c for c in recent_champs if c is not None]

            # Without both a recent and a season pool there is nothing to score.
            if not recent_champs or not season_champs:
                return {
                    'loyalty_score': 0,
                    'confidence_score': 0,
                    **champ_loyalty_flags
                }

            # Games played on the two recent champions (wins + losses).
            recent_games = [
                (row['W_1'] + row['L_1']) if pd.notna(row['most_champ_1']) else 0,
                (row['W_2'] + row['L_2']) if pd.notna(row['most_champ_2']) else 0
            ]

            total_recent_games = sum(recent_games)
            total_season_games = sum(season_games)

            if total_recent_games == 0:
                return {
                    'loyalty_score': 0,
                    'confidence_score': 0,
                    **champ_loyalty_flags
                }

            # Loyalty: overlap between recent and season pools, weighted by share of games
            # and by position (most-played champions count more).
            loyalty_score = 0
            for idx, champ in enumerate(recent_champs):
                if champ in season_champs:
                    season_idx = season_champs.index(champ)

                    recent_weight = recent_games[idx] / total_recent_games
                    season_weight = season_games[season_idx] / total_season_games
                    position_weight = 1.7 if idx == 0 else 1.3
                    seasonal_position_weight = 1.3 if season_idx < 3 else 1.0

                    combined_weight = (
                        recent_weight * 0.6 +
                        season_weight * 0.4
                    ) * position_weight * seasonal_position_weight

                    loyalty_score += combined_weight

            # Confidence: how much data backs the score (which champion slots are filled,
            # plus a small bonus for the number of recent games).
            confidence_score = 0
            confidence_score += 0.5 if pd.notna(row['most_champ_1']) else 0
            confidence_score += 0.2 if pd.notna(row['most_champ_2']) else 0
            confidence_score += sum(0.1 for i in range(1, 4) if pd.notna(row[f'season_champ_{i}']))
            confidence_score += sum(0.05 for i in range(4, 8) if pd.notna(row[f'season_champ_{i}']))

            total_recent = sum((row[f'W_{i}'] + row[f'L_{i}']) if pd.notna(row[f'most_champ_{i}']) else 0
                               for i in range(1, 3))
            confidence_score += min(0.1, total_recent / 100)

            return {
                'loyalty_score': round(min(loyalty_score, 1.0), 3),
                'confidence_score': round(min(confidence_score, 1.0), 3),
                **champ_loyalty_flags
            }

        except Exception as e:
            print(f"Error calculating loyalty scores: {e}")
            return {
                'loyalty_score': 0,
                'confidence_score': 0,
                'recent_champ_1_loyal': 0,
                'recent_champ_2_loyal': 0
            }

    # One dict of scores per row, expanded into four columns.
    results = df.apply(get_loyalty_scores, axis=1)

    df['champion_loyalty_score'] = results.apply(lambda x: x['loyalty_score'])
    df['loyalty_confidence_score'] = results.apply(lambda x: x['confidence_score'])
    df['recent_champ_1_loyal'] = results.apply(lambda x: x['recent_champ_1_loyal'])
    df['recent_champ_2_loyal'] = results.apply(lambda x: x['recent_champ_2_loyal'])

    return df

def optimize_feature_dtypes(df):
    """
    Optimize data types for feature columns, using categorical, unsigned-integer,
    and single-precision float dtypes where the values allow it.
    """
    df = df.copy()

    # Low-cardinality code columns (value = approximate number of categories; informational only).
    category_cols = {
        'region': 4,
        'team': 2,
        'champ_variety_score': 6,
        'playstyle': 6,
        'most_role_1': 5,
        'most_role_2': 5,
        'most_role_3': 5,
        'role_specialization': 5,
        'recent_champ_1_loyal': 2,
        'recent_champ_2_loyal': 2
    }

    for col, n_unique in category_cols.items():
        if col in df.columns:
            if df[col].isna().any():
                # Keep missing values representable by adding an explicit 'Unknown' category.
                df[col] = df[col].astype('category')
                df[col] = df[col].cat.add_categories(['Unknown']).fillna('Unknown')
            else:
                df[col] = df[col].astype('category')

    # Champion-ID columns fit into an unsigned 8-bit integer.
    champion_cols = [
        'champion',
        'team_champ1',
        'team_champ2',
        'team_champ3',
        'team_champ4',
        'opp_champ1',
        'opp_champ2',
        'opp_champ3',
        'opp_champ4',
        'opp_champ5',
        'most_champ_1',
        'most_champ_2',
        # season_champ_* follows the underscore naming used elsewhere in this module
        'season_champ_1',
        'season_champ_2',
        'season_champ_3',
        '1_champ_name',
        '2_champ_name',
        '3_champ_name',
        '4_champ_name',
        '5_champ_name'
    ]

    for col in champion_cols:
        if col in df.columns:
            df[col] = df[col].astype('UInt8')

    # Continuous features are fine at single precision.
    float32_cols = [
        'most_role_1_value',
        'most_role_2_value',
        'most_role_3_value',
        'avg_kills',
        'avg_deaths',
        'avg_assists',
        'kda_ratio_profile',
        'kill_participation_profile',
        'WR_1',
        'WR_2',
        'WR_3',
        'champion_loyalty_score',
        'loyalty_confidence_score'
    ]

    for col in float32_cols:
        if col in df.columns:
            df[col] = df[col].astype('float32')

    return df

def remove_unwanted_columns(df):
    """
    Remove the specified non-feature columns from the DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: DataFrame with the specified columns removed
    """
    df = df.copy()

    columns_to_remove = (
        # Date, overall record, and recent-champion win/loss detail columns.
        ['date'] +
        ['total_games', 'wins', 'losses', 'win_rate'] +
        ['WR_1', 'WR_2', 'WR_3'] +
        ['most_champ_3'] +
        ['W_1', 'L_1', 'KDA_1', 'W_2', 'L_2', 'KDA_2', 'W_3', 'L_3', 'KDA_3'] +

        # Per-role play-rate columns (already summarized by most_role_*).
        ['TOP', 'JUNGLE', 'MID', 'ADC', 'SUPPORT'] +

        # Season per-champion detail columns (and season champions 4-7 themselves).
        ['cs_ssn_1', 'cpm_ssn_1', 'kda_ssn_1', 'k_ssn_1', 'd_ssn_1', 'a_ssn_1', 'wr_ssn_1', 'games_ssn_1',
         'cs_ssn_2', 'cpm_ssn_2', 'kda_ssn_2', 'k_ssn_2', 'd_ssn_2', 'a_ssn_2', 'wr_ssn_2', 'games_ssn_2',
         'cs_ssn_3', 'cpm_ssn_3', 'kda_ssn_3', 'k_ssn_3', 'd_ssn_3', 'a_ssn_3', 'wr_ssn_3', 'games_ssn_3',
         'season_champ_4', 'cs_ssn_4', 'cpm_ssn_4', 'kda_ssn_4', 'k_ssn_4', 'd_ssn_4', 'a_ssn_4', 'wr_ssn_4', 'games_ssn_4',
         'season_champ_5', 'cs_ssn_5', 'cpm_ssn_5', 'kda_ssn_5', 'k_ssn_5', 'd_ssn_5', 'a_ssn_5', 'wr_ssn_5', 'games_ssn_5',
         'season_champ_6', 'cs_ssn_6', 'cpm_ssn_6', 'kda_ssn_6', 'k_ssn_6', 'd_ssn_6', 'a_ssn_6', 'wr_ssn_6', 'games_ssn_6',
         'season_champ_7', 'cs_ssn_7', 'cpm_ssn_7', 'kda_ssn_7', 'k_ssn_7', 'd_ssn_7', 'a_ssn_7', 'wr_ssn_7', 'games_ssn_7'] +

        # 7-day recent champion detail columns.
        ['7d_champ_1', '7d_total_1', '7d_WR_1', '7d_champ_2', '7d_total_2', '7d_WR_2',
         '7d_champ_3', '7d_total_3', '7d_WR_3'] +
        ['7d_W_1', '7d_L_1', '7d_W_2', '7d_L_2', '7d_W_3', '7d_L_3'] +

        # Mastery columns.
        ['mastery_champ_1', 'm_lv_1', 'mastery_champ_2', 'm_lv_2', 'mastery_champ_3', 'm_lv_3',
         'mastery_champ_4', 'm_lv_4', 'mastery_champ_5', 'm_lv_5', 'mastery_champ_6', 'm_lv_6',
         'mastery_champ_7', 'm_lv_7', 'mastery_champ_8', 'm_lv_8', 'mastery_champ_9', 'm_lv_9',
         'mastery_champ_10', 'm_lv_10', 'mastery_champ_11', 'm_lv_11', 'mastery_champ_12', 'm_lv_12',
         'mastery_champ_13', 'm_lv_13', 'mastery_champ_14', 'm_lv_14', 'mastery_champ_15', 'm_lv_15',
         'mastery_champ_16', 'm_lv_16'] +

        # Derived per-slot champion score columns and miscellaneous columns.
        ['1_champ_score', '2_champ_score', '3_champ_score', '4_champ_score', '5_champ_score'] +
        ['avg_tier', 'team'] +

        # Wide per-champion score columns (Aatrox..Zyra), already summarized by get_top_champion_scores.
        ["Aatrox", "Ahri", "Akali", "Akshan", "Alistar", "Ambessa", "Amumu", "Anivia", "Annie", "Aphelios",
         "Ashe", "Aurelion Sol", "Aurora", "Azir", "Bard", "Bel'Veth", "Blitzcrank", "Brand", "Braum",
         "Briar", "Caitlyn", "Camille", "Cassiopeia", "Cho'Gath", "Corki", "Darius", "Diana", "Dr. Mundo",
         "Draven", "Ekko", "Elise", "Evelynn", "Ezreal", "Fiddlesticks", "Fiora", "Fizz", "Galio",
         "Gangplank", "Garen", "Gnar", "Gragas", "Graves", "Gwen", "Hecarim", "Heimerdinger", "Hwei",
         "Illaoi", "Irelia", "Ivern", "Janna", "Jarvan IV", "Jax", "Jayce", "Jhin", "Jinx", "K'Sante",
         "Kai'Sa", "Kalista", "Karma", "Karthus", "Kassadin", "Katarina", "Kayle", "Kayn", "Kennen",
         "Kha'Zix", "Kindred", "Kled", "Kog'Maw", "LeBlanc", "Lee Sin", "Leona", "Lillia", "Lissandra",
         "Lucian", "Lulu", "Lux", "Malphite", "Malzahar", "Maokai", "Master Yi", "Milio", "Miss Fortune",
         "Mordekaiser", "Morgana", "Naafiri", "Nami", "Nasus", "Nautilus", "Neeko", "Nidalee", "Nilah",
         "Nocturne", "Nunu & Willump", "Olaf", "Orianna", "Ornn", "Pantheon", "Poppy", "Pyke", "Qiyana",
         "Quinn", "Rakan", "Rammus", "Rek'Sai", "Rell", "Renata Glasc", "Renekton", "Rengar", "Riven",
         "Rumble", "Ryze", "Samira", "Sejuani", "Senna", "Seraphine", "Sett", "Shaco", "Shen", "Shyvana",
         "Singed", "Sion", "Sivir", "Skarner", "Smolder", "Sona", "Soraka", "Swain", "Sylas", "Syndra",
         "Tahm Kench", "Taliyah", "Talon", "Taric", "Teemo", "Thresh", "Tristana", "Trundle", "Tryndamere",
         "Twisted Fate", "Twitch", "Udyr", "Urgot", "Varus", "Vayne", "Veigar", "Vel'Koz", "Vex", "Vi",
         "Viego", "Viktor", "Vladimir", "Volibear", "Warwick", "Wukong", "Xayah", "Xerath", "Xin Zhao",
         "Yasuo", "Yone", "Yorick", "Yuumi", "Zac", "Zed", "Zeri", "Ziggs", "Zilean", "Zoe", "Zyra"]
    )

    # Only drop what actually exists in the frame.
    columns_to_remove = [col for col in columns_to_remove if col in df.columns]
    df = df.drop(columns=columns_to_remove)

    print(f"Removed {len(columns_to_remove)} columns")
    print(f"Remaining columns: {len(df.columns)}")

    return df

def apply_feature_engineering(df, n=5):
    """
    Run the feature-engineering pipeline on a converted training DataFrame.
    """
    df = df.copy()

    # Order matters: role/champion summaries are built before columns are dropped and dtypes shrunk.
    transformations = [
        calculate_champ_variety_score,
        calculate_playstyle,
        get_most_role_3,
        calculate_role_specialization,
        calculate_champion_loyalty,
        lambda x: get_top_champion_scores(x, n),  # printed as '<lambda>' in the progress log
        remove_unwanted_columns,
        optimize_feature_dtypes
    ]

    for transform in transformations:
        try:
            print(f"Applying {transform.__name__}...")
            df = transform(df)
        except Exception as e:
            print(f"Error in {transform.__name__}: {str(e)}")
            raise

    return df
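
# Final step sketch (continuing from the convert_df example above; output path is hypothetical):
#
#     features = apply_feature_engineering(train_df, n=5)
#     features.to_csv("util/data/feature_eng_stats.csv", index=False)  # hypothetical output path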