Spaces:
Sleeping
Sleeping
import pandas as pd | |
from datetime import datetime | |
import os | |
import numpy as np | |
from urllib.parse import quote, unquote | |
class ChampionConverter: | |
def __init__(self): | |
self.champions = [ | |
"Aatrox", "Ahri", "Akali", "Akshan", "Alistar", "Ambessa", "Amumu", "Anivia", "Annie", "Aphelios", "Ashe", "Aurelion Sol", | |
"Aurora", "Azir", "Bard", "Bel'Veth", "Blitzcrank", "Brand", "Braum", "Briar", "Caitlyn", "Camille", "Cassiopeia", "Cho'Gath", | |
"Corki", "Darius", "Diana", "Dr. Mundo", "Draven", "Ekko", "Elise", "Evelynn", "Ezreal", "Fiddlesticks", "Fiora", "Fizz", "Galio", | |
"Gangplank", "Garen", "Gnar", "Gragas", "Graves", "Gwen", "Hecarim", "Heimerdinger", "Hwei", "Illaoi", "Irelia", "Ivern", "Janna", | |
"Jarvan IV", "Jax", "Jayce", "Jhin", "Jinx", "K'Sante", "Kai'Sa", "Kalista", "Karma", "Karthus", "Kassadin", "Katarina", "Kayle", | |
"Kayn", "Kennen", "Kha'Zix", "Kindred", "Kled", "Kog'Maw", "LeBlanc", "Lee Sin", "Leona", "Lillia", "Lissandra", "Lucian", "Lulu", | |
"Lux", "Malphite", "Malzahar", "Maokai", "Master Yi", "Milio", "Miss Fortune", "Mordekaiser", "Morgana", "Naafiri", "Nami", "Nasus", | |
"Nautilus", "Neeko", "Nidalee", "Nilah", "Nocturne", "Nunu & Willump", "Olaf", "Orianna", "Ornn", "Pantheon", "Poppy", "Pyke", | |
"Qiyana", "Quinn", "Rakan", "Rammus", "Rek'Sai", "Rell", "Renata Glasc", "Renekton", "Rengar", "Riven", "Rumble", "Ryze", "Samira", | |
"Sejuani", "Senna", "Seraphine", "Sett", "Shaco", "Shen", "Shyvana", "Singed", "Sion", "Sivir", "Skarner", "Smolder", "Sona", | |
"Soraka", "Swain", "Sylas", "Syndra", "Tahm Kench", "Taliyah", "Talon", "Taric", "Teemo", "Thresh", "Tristana", "Trundle", | |
"Tryndamere", "Twisted Fate", "Twitch", "Udyr", "Urgot", "Varus", "Vayne", "Veigar", "Vel'Koz", "Vex", "Vi", "Viego", "Viktor", | |
"Vladimir", "Volibear", "Warwick", "Wukong", "Xayah", "Xerath", "Xin Zhao", "Yasuo", "Yone", "Yorick", "Yuumi", "Zac", "Zed", | |
"Zeri", "Ziggs", "Zilean", "Zoe", "Zyra" | |
] | |
self.champion_to_number = {champion: i for i, champion in enumerate(self.champions, start=1)} | |
self.number_to_champion = {i: champion for i, champion in enumerate(self.champions, start=1)} | |
def champion_to_num(self, champion_name): | |
return self.champion_to_number.get(champion_name, None) | |
def num_to_champion(self, number): | |
return self.number_to_champion.get(number, None) | |
def convert_date(date_str): | |
"""Convert datetime string to Unix timestamp""" | |
try: | |
if pd.isna(date_str): | |
return None | |
return pd.to_datetime(date_str).timestamp() | |
except: | |
return None | |
def convert_to_minutes(time_str): | |
"""Convert time string (e.g., '15m 10s') to minutes (float)""" | |
try: | |
minutes = seconds = 0 | |
parts = time_str.lower().split() | |
for part in parts: | |
if 'm' in part: | |
minutes = float(part.replace('m', '')) | |
elif 's' in part: | |
seconds = float(part.replace('s', '')) | |
return round(minutes + seconds/60, 2) | |
except: | |
return 0.0 | |
def convert_percentage_to_decimal(percentage_str): | |
"""Convert percentage string (e.g., 'P/Kill 43%') to decimal (0.43)""" | |
try: | |
# Extract number from string and convert to decimal | |
num = float(''.join(filter(str.isdigit, percentage_str))) / 100 | |
return round(num, 2) | |
except: | |
return 0.0 | |
def convert_tier_to_number(tier_str): | |
""" | |
Convert tier string to number: | |
Challenger -> 1 | |
Grandmaster -> 2 | |
Master -> 3 | |
Others -> 4 | |
""" | |
tier_map = { | |
'challenger': 1, | |
'grandmaster': 2, | |
'master': 3 | |
} | |
# Convert to lowercase and return mapped value or 4 for any other tier | |
return tier_map.get(tier_str.lower().strip(), 4) | |
def convert_result_to_binary(result_str): | |
""" | |
Convert match result to binary: | |
Victory -> 1 | |
Defeat -> 0 | |
""" | |
return 1 if result_str.lower().strip() == 'victory' else 0 | |
def merge_stats(recent_stats, player_stats, current_time =None): | |
""" | |
Merge recent match stats with player profile stats and save to CSV. | |
Only keeps rows where matches exist in both DataFrames. | |
Args: | |
recent_stats (DataFrame/dict): Recent match statistics | |
player_stats (DataFrame/tuple): Player profile statistics | |
Returns: | |
DataFrame: Combined statistics | |
""" | |
try: | |
if current_time is None: | |
current_time = datetime.utcnow().strftime("%Y-%m-%d") | |
# Convert recent_stats to DataFrame if it's not already | |
if not isinstance(recent_stats, pd.DataFrame): | |
recent_df = pd.DataFrame(recent_stats) | |
else: | |
recent_df = recent_stats | |
# Handle player_stats based on its type | |
if isinstance(player_stats, tuple): | |
# If it's a tuple (merged_df, dfs), use the merged_df | |
player_df = player_stats[0] | |
elif isinstance(player_stats, pd.DataFrame): | |
player_df = player_stats | |
else: | |
raise ValueError("Invalid player_stats format") | |
# Ensure player_id exists in both DataFrames | |
if 'player_id' not in recent_df.columns: | |
recent_df['player_id'] = player_df['player_id'].iloc[0] | |
# Print info before merge | |
print(f"\nBefore merge:") | |
print(f"Recent stats rows: {len(recent_df)}") | |
print(f"Player stats rows: {len(player_df)}") | |
print(f"Unique players in recent stats: {recent_df['player_id'].nunique()}") | |
print(f"Unique players in player stats: {player_df['player_id'].nunique()}") | |
# Merge DataFrames with inner join | |
merged_df = pd.merge( | |
recent_df, | |
player_df, | |
on='player_id', | |
how='inner', # Changed from 'left' to 'inner' | |
suffixes=('', '_profile') | |
) | |
# Print info after merge | |
print(f"\nAfter merge:") | |
print(f"Merged stats rows: {len(merged_df)}") | |
print(f"Unique players in merged stats: {merged_df['player_id'].nunique()}") | |
# Reorder columns to ensure player_id and region are first | |
cols = merged_df.columns.tolist() | |
cols = ['player_id'] + [col for col in cols if col != 'player_id'] | |
if 'region' in cols: | |
cols.remove('region') | |
cols.insert(1, 'region') | |
merged_df = merged_df[cols] | |
# Create directory if it doesn't exist | |
save_dir = "util/data" | |
os.makedirs(save_dir, exist_ok=True) | |
# Save to CSV | |
filename = f"player_stats_merged_{current_time}.csv" | |
filepath = os.path.join(save_dir, filename) | |
merged_df.to_csv(filepath, index=False) | |
print(f"\nSuccessfully saved merged stats to {filepath}") | |
return merged_df | |
except Exception as e: | |
print(f"Error in merge_stats: {e}") | |
return None | |
def filter_leaderboard(df, tiers=None): | |
""" | |
Filter leaderboard DataFrame to keep only specific tiers. | |
Args: | |
df (pandas.DataFrame): Input leaderboard DataFrame | |
tiers (list): List of tiers to keep. Defaults to ["CHALLENGER", "GRANDMASTER"] | |
timestamp (str): Current timestamp in UTC | |
scraper_user (str): Current user's login | |
Returns: | |
pandas.DataFrame: Filtered leaderboard data | |
""" | |
try: | |
# Set default tiers if none provided | |
if tiers is None: | |
tiers = ["CHALLENGER", "GRANDMASTER"] | |
# Convert tiers to uppercase for consistency | |
tiers = [tier.upper() for tier in tiers] | |
# Validate input DataFrame | |
required_cols = ["tier", "summoner", "region"] | |
if not all(col in df.columns for col in required_cols): | |
raise ValueError(f"DataFrame must contain columns: {required_cols}") | |
# Create copy to avoid modifying original DataFrame | |
filtered_df = df.copy() | |
# Convert tier column to uppercase for consistent filtering | |
filtered_df['tier'] = filtered_df['tier'].str.upper() | |
# Filter by specified tiers | |
filtered_df = filtered_df[filtered_df['tier'].isin(tiers)] | |
# Sort by region and tier | |
filtered_df = filtered_df.sort_values(['region', 'tier', 'rank']) | |
# Reset index | |
filtered_df = filtered_df.reset_index(drop=True) | |
# Save to CSV | |
output_file = os.path.join("util", "data", "lb_filtered.csv") | |
os.makedirs(os.path.dirname(output_file), exist_ok=True) | |
filtered_df.to_csv(output_file, index=False) | |
print(f"\nFiltered leaderboard to {len(tiers)} tiers: {', '.join(tiers)}") | |
print(f"Remaining entries: {len(filtered_df)}") | |
print(f"Saved filtered leaderboard to {output_file}") | |
# Print summary statistics | |
print("\nSummary by region and tier:") | |
summary = filtered_df.groupby(['region', 'tier']).size().unstack(fill_value=0) | |
print(summary) | |
return filtered_df | |
except Exception as e: | |
print(f"Error filtering leaderboard: {e}") | |
return None | |
def format_summoner_name(summoner): | |
""" | |
Format summoner name for URL usage | |
Parameters: | |
summoner: str - Original summoner name | |
Returns: | |
str - Formatted summoner name | |
""" | |
if not summoner: | |
raise ValueError("Summoner name cannot be empty") | |
# Remove leading/trailing whitespace | |
summoner = summoner.strip() | |
# Replace spaces and special characters | |
formatted_summoner = summoner.replace(" ", "-").replace("#", "-") | |
# Handle other special characters through URL encoding | |
formatted_summoner = quote(formatted_summoner) | |
return formatted_summoner | |
def convert_to_displayname(name): | |
""" | |
Convert a summoner name to display format | |
Examples: | |
marthinsurya-NA -> marthinsurya #NA | |
toplane%20kid-EUW77 -> toplane kid #EUW77 | |
Walid-Georgey-EUW -> Walid Georgey #EUW | |
Current%20User-KR -> Current User #KR | |
""" | |
try: | |
if not name: | |
return "" | |
# First decode URL encoding | |
decoded = unquote(name) | |
# Remove any trailing hyphens | |
decoded = decoded.rstrip('-') | |
# Split by last hyphen to separate name and region | |
if '-' in decoded: | |
parts = decoded.rsplit('-', 1) | |
base_name = parts[0] # Everything before last hyphen | |
region = parts[1] | |
# Replace remaining hyphens in base_name with spaces | |
base_name = base_name.replace('-', ' ') | |
# Clean up any double spaces | |
base_name = ' '.join(filter(None, base_name.split())) | |
return f"{base_name} #{region}" | |
return decoded.replace('-', ' ') | |
except Exception as e: | |
print(f"Error converting name '{name}': {e}") | |
return name | |
def get_player_list(leaderboard=None): | |
""" | |
Convert leaderboard data into proper player list format for API calls. | |
Args: | |
leaderboard (DataFrame): Input leaderboard DataFrame containing summoner and region | |
Returns: | |
DataFrame: Formatted player list with region and username columns | |
""" | |
try: | |
if leaderboard is None: | |
leaderboard_file = os.path.join("util", "data", "lb_filtered.csv") | |
leaderboard = pd.read_csv(leaderboard_file) | |
# Rename summoner column to username | |
leaderboard = leaderboard.rename(columns={'summoner': 'username'}) | |
# Select only region and username columns in correct order | |
player_list = leaderboard[['region', 'username']] | |
print(f"Successfully processed {len(player_list)} players") | |
return player_list | |
except Exception as e: | |
print(f"Error processing leaderboard: {e}") | |
return None | |
def process_kda_perfect(df): | |
""" | |
Process KDA values in the DataFrame, replacing 'Perfect' with appropriate values. | |
""" | |
try: | |
# Create a copy to avoid modifying the original dataframe | |
df = df.copy() | |
# Function to safely convert to numeric | |
def safe_convert(x): | |
if isinstance(x, (int, float)): | |
return x | |
if isinstance(x, str) and x.lower() == 'perfect': | |
return 6 | |
try: | |
return float(x) | |
except: | |
return None | |
# 1. Process KDA_1, KDA_2, KDA_3 | |
for col in ['KDA_1', 'KDA_2', 'KDA_3']: | |
if col in df.columns: | |
df[col] = df[col].apply(safe_convert) | |
# 2. Process kda_ssn_1 to kda_ssn_7 | |
for i in range(1, 8): | |
col = f'kda_ssn_{i}' | |
if col in df.columns: | |
perfect_mask = df[col].astype(str).str.contains('perfect', case=False) | |
if perfect_mask.any(): | |
kills_col, assists_col = f'k_ssn_{i}', f'a_ssn_{i}' | |
if kills_col in df.columns and assists_col in df.columns: | |
df.loc[perfect_mask, col] = df.loc[perfect_mask].apply( | |
lambda row: pd.to_numeric(row[kills_col], errors='coerce') + | |
pd.to_numeric(row[assists_col], errors='coerce'), | |
axis=1 | |
) | |
else: | |
df.loc[perfect_mask, col] = 6 | |
df[col] = pd.to_numeric(df[col], errors='coerce') | |
# 3. Process kda_ratio_profile | |
if 'kda_ratio_profile' in df.columns: | |
perfect_mask = df['kda_ratio_profile'].astype(str).str.contains('perfect', case=False) | |
if perfect_mask.any(): | |
df.loc[perfect_mask, 'kda_ratio_profile'] = df.loc[perfect_mask].apply( | |
lambda row: pd.to_numeric(row['avg_kills'], errors='coerce') + | |
pd.to_numeric(row['avg_assists'], errors='coerce'), | |
axis=1 | |
) | |
df['kda_ratio_profile'] = pd.to_numeric(df['kda_ratio_profile'], errors='coerce') | |
# 4. Process remaining kda_ratio columns | |
other_cols = [col for col in df.columns if 'kda_ratio' in col.lower() | |
and col != 'kda_ratio_profile' | |
and col not in [f'kda_ssn_{i}' for i in range(1, 8)]] | |
for col in other_cols: | |
perfect_mask = df[col].astype(str).str.contains('perfect', case=False) | |
if perfect_mask.any(): | |
prefix = col.split('kda_ratio')[0] | |
kills_col, assists_col = f"{prefix}kills", f"{prefix}assists" | |
if kills_col in df.columns and assists_col in df.columns: | |
df.loc[perfect_mask, col] = df.loc[perfect_mask].apply( | |
lambda row: pd.to_numeric(row[kills_col], errors='coerce') + | |
pd.to_numeric(row[assists_col], errors='coerce'), | |
axis=1 | |
) | |
else: | |
df.loc[perfect_mask, col] = 6 | |
df[col] = pd.to_numeric(df[col], errors='coerce') | |
return df | |
except Exception as e: | |
print(f"Error in process_kda_perfect: {str(e)}") | |
return df | |
def check_mixed_types(df): | |
""" | |
Check and print dataframe column types, inconsistencies, and basic statistics | |
""" | |
# Get type information | |
dtype_info = pd.DataFrame({ | |
'dtype': df.dtypes, | |
'non_null': df.count(), | |
'null_count': df.isnull().sum(), | |
'unique_values': [df[col].nunique() for col in df.columns] | |
}) | |
# Add sample of unique values for each column | |
dtype_info['sample_values'] = [df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist() | |
if len(df[col].dropna()) > 0 else [] | |
for col in df.columns] | |
# Check for mixed types in object columns | |
mixed_type_cols = [] | |
for col in df.select_dtypes(include=['object']): | |
types = df[col].apply(type).unique() | |
if len(types) > 1: | |
mixed_type_cols.append({ | |
'column': col, | |
'types': [t.__name__ for t in types], | |
'samples': df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist() | |
}) | |
print("=== DataFrame Overview ===") | |
print(f"Shape: {df.shape}") | |
print("\n=== Data Types Summary ===") | |
print(df.dtypes.value_counts()) | |
if mixed_type_cols: | |
print("\n=== Mixed Type Columns ===") | |
for col_info in mixed_type_cols: | |
print(f"\nColumn: {col_info['column']}") | |
print(f"Types found: {col_info['types']}") | |
print(f"Sample values: {col_info['samples']}") | |
return dtype_info | |
def check_nan_float(df, column_name): | |
float_mask = df[column_name].apply(lambda x: isinstance(x, float)) | |
is_nan_mask = df[column_name].isna() | |
# Check if all float values are NaN | |
all_floats_are_nan = (float_mask == is_nan_mask).all() | |
print(f"Are all float values NaN? {all_floats_are_nan}") | |
# Double check by comparing counts | |
print(f"Number of float values: {float_mask.sum()}") | |
print(f"Number of NaN values: {is_nan_mask.sum()}") | |
def convert_team_colors(df): | |
""" | |
Convert 'team' column values from 'blue'/'red' to 1/2 | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame with 'team' column | |
Returns: | |
pandas.DataFrame: DataFrame with converted team values | |
""" | |
df = df.copy() | |
if 'team' not in df.columns: | |
raise ValueError("Column 'team' not found in DataFrame") | |
# Create mapping dictionary | |
team_mapping = { | |
'blue': 1, | |
'red': 2 | |
} | |
# Convert team colors to numbers | |
df['team'] = df['team'].map(team_mapping, na_action='ignore') | |
return df | |
def convert_region(df): | |
""" | |
Convert 'region' column values to numeric: | |
kr -> 1 | |
euw -> 2 | |
vn -> 3 | |
na -> 4 | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame with 'region' column | |
Returns: | |
pandas.DataFrame: DataFrame with converted region values | |
""" | |
df = df.copy() | |
if 'region' not in df.columns: | |
raise ValueError("Column 'region' not found in DataFrame") | |
# Create mapping dictionary | |
region_mapping = { | |
'kr': 1, | |
'euw': 2, | |
'vn': 3, | |
'na': 4 | |
} | |
# Convert regions to numbers, keeping NA as NA | |
df['region'] = df['region'].map(region_mapping, na_action='ignore') | |
return df | |
def convert_champion_columns(df): | |
""" | |
Convert all champion-related columns to numbers using ChampionConverter | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame | |
Returns: | |
pandas.DataFrame: DataFrame with converted champion values | |
""" | |
df = df.copy() | |
# Initialize champion converter | |
converter = ChampionConverter() | |
# Get all champion-related columns | |
champion_columns = [col for col in df.columns if 'champ' in col.lower()] | |
for col in champion_columns: | |
# Convert champion names to numbers | |
df[col] = df[col].map(converter.champion_to_num, na_action='ignore') | |
return df | |
def convert_date_column(df): | |
""" | |
Convert date column from string format to Unix timestamp | |
Handles missing values (NaT, None, NaN) | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame with 'date' column | |
Returns: | |
pandas.DataFrame: DataFrame with converted date values | |
""" | |
df = df.copy() | |
if 'date' not in df.columns: | |
raise ValueError("Column 'date' not found in DataFrame") | |
# Convert dates to timestamps | |
df['date'] = df['date'].apply(convert_date) | |
return df | |
def convert_role_columns(df): | |
""" | |
Convert role columns to numbers: | |
TOP -> 1, MID -> 2, ADC -> 3, JUNGLE -> 4, SUPPORT -> 5 | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame | |
Returns: | |
pandas.DataFrame: DataFrame with converted role values | |
""" | |
df = df.copy() | |
# Define role mapping | |
role_mapping = { | |
'TOP': 1, | |
'MID': 2, | |
'ADC': 3, | |
'JUNGLE': 4, | |
'SUPPORT': 5 | |
} | |
# Role columns to convert | |
role_columns = ['most_role_1', 'most_role_2'] | |
for col in role_columns: | |
if col in df.columns: | |
# Convert roles to numbers | |
df[col] = df[col].map(role_mapping, na_action='ignore') | |
else: | |
print(f"Warning: Column {col} not found in DataFrame") | |
return df | |
def convert_id_columns(df): | |
""" | |
Drop ID-related columns (player_id, teammates1-4, oppmates1-5) | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame | |
Returns: | |
pandas.DataFrame: DataFrame with ID columns dropped | |
""" | |
df = df.copy() | |
# Specific ID columns to drop | |
id_columns = ( | |
['player_id', 'region_profile'] + | |
[f'teammates{i}' for i in range(1, 5)] + # teammates1 to teammates4 | |
[f'oppmates{i}' for i in range(1, 6)] # oppmates1 to oppmates5 | |
) | |
# Verify columns exist and drop them | |
existing_columns = [col for col in id_columns if col in df.columns] | |
if len(existing_columns) != len(id_columns): | |
missing = set(id_columns) - set(existing_columns) | |
print(f"Note: Some columns were not found in DataFrame: {missing}") | |
# Drop the columns | |
df = df.drop(columns=existing_columns) | |
return df | |
def remove_match_stats(df): | |
""" | |
Remove match-specific statistics to prevent future data leakage. | |
Parameters: | |
df (pandas.DataFrame): Input DataFrame | |
Returns: | |
pandas.DataFrame: DataFrame with match-specific columns removed | |
""" | |
# List of columns that contain match-specific information | |
match_stat_columns = [ | |
'level', # Champion level | |
'result', # Match outcome (target variable) | |
'match_length_mins',# Match duration | |
'kill', # Kills in the match | |
'death', # Deaths in the match | |
'assist', # Assists in the match | |
'kda_ratio', # KDA ratio for the match | |
'kill_participation',# Kill participation in the match | |
'laning', # Laning phase performance | |
'cs', # Creep score in the match | |
'cs_per_min' # CS per minute in the match | |
] | |
# Create a copy of the dataframe | |
df_clean = df.copy() | |
# Remove match-specific columns | |
columns_to_drop = [col for col in match_stat_columns if col in df_clean.columns] | |
df_clean = df_clean.drop(columns=columns_to_drop) | |
return df_clean | |
def convert_df(df): | |
""" | |
Master function to handle all conversions for training DataFrame | |
Includes: | |
- Team color conversion (blue/red to 1/2) | |
- Region conversion (kr/euw/vn/na to 1/2/3/4) | |
- Champion conversion (champion names to numbers) | |
- Date conversion (string to Unix timestamp) | |
- Role conversion (TOP/MID/ADC/JUNGLE/SUPPORT to 1/2/3/4/5) | |
- Drop ID columns (player_id, teammates1-4, oppmates1-5, region_profile) | |
Parameters: | |
df (pandas.DataFrame): Input training DataFrame | |
Returns: | |
pandas.DataFrame: Processed DataFrame with all conversions | |
""" | |
df = df.copy() | |
# Drop rows where champion is NA | |
initial_rows = len(df) | |
df = df.dropna(subset=['champion']) | |
rows_dropped = initial_rows - len(df) | |
print(f"Dropped {rows_dropped} rows with NA champion values") | |
# Sequential conversions | |
conversions = [ | |
convert_team_colors, # Convert blue/red to 1/2 | |
convert_region, # Convert kr/euw/vn/na to 1/2/3/4 | |
convert_champion_columns, # Convert champion names to numbers | |
convert_date_column, # Convert dates to timestamps | |
convert_role_columns, # Convert roles to 1-5 | |
convert_id_columns, # Drop ID-related columns | |
remove_match_stats # Remove match-specific columns | |
] | |
## Apply each conversion function in sequence | |
for convert_func in conversions: | |
try: | |
print(f"Applying {convert_func.__name__}...") | |
df = convert_func(df) | |
except Exception as e: | |
print(f"Error in {convert_func.__name__}: {str(e)}") | |
raise | |
return df | |
def get_top_champion_scores(df, n=5): | |
""" | |
Get top n champion scores from a DataFrame | |
Parameters: | |
df: pandas DataFrame containing champion scores | |
n: number of top champions to return (default 5) | |
Returns: | |
pandas DataFrame with original data plus top n champion scores and their names | |
""" | |
try: | |
converter = ChampionConverter() | |
df = df.copy() | |
# Get all champion columns (from Aatrox to Zyra) | |
champion_start = df.columns.get_loc('Aatrox') | |
champion_end = df.columns.get_loc('Zyra') + 1 | |
champion_cols = df.columns[champion_start:champion_end] | |
# Convert scores to numeric, replacing non-numeric values with 0 | |
champion_scores = df[champion_cols].apply(pd.to_numeric, errors='coerce').fillna(0) | |
# Get indices of top n values for each row | |
top_n_indices = champion_scores.apply(lambda x: pd.Series(x.nlargest(n).index), axis=1) | |
top_n_values = champion_scores.apply(lambda x: pd.Series(x.nlargest(n).values), axis=1) | |
# Create new columns for champion names and scores | |
for i in range(n): | |
# Champion scores | |
df[f'{i+1}_champ_score'] = top_n_values.iloc[:, i].astype(float) | |
# Champion names (converted to numbers) | |
champ_names = top_n_indices.iloc[:, i] | |
df[f'{i+1}_champ_name'] = champ_names.map( | |
lambda x: int(converter.champion_to_num(x)) if pd.notnull(x) else -1 | |
) | |
return df | |
except Exception as e: | |
print(f"Error in get_top_champion_scores: {str(e)}") | |
# Return original DataFrame with default values in case of error | |
for i in range(1, n + 1): | |
df[f'{i}_champ_score'] = 0.0 | |
df[f'{i}_champ_name'] = -1 | |
return df | |
def check_datatypes(df): | |
datatype= pd.DataFrame({ | |
'dtype': df.dtypes, | |
'unique_values': df.nunique() | |
}) | |
print(datatype) | |
return datatype | |
def calculate_champ_variety_score(df): | |
df = df.copy() # Create a copy to avoid warnings | |
# Create a list of champion columns we want to check | |
champ_columns = [ | |
'most_champ_1', 'most_champ_2', 'most_champ_3', | |
'7d_champ_1', '7d_champ_2', '7d_champ_3' | |
] | |
# Filter to only include columns that exist in the DataFrame | |
existing_columns = [col for col in champ_columns if col in df.columns] | |
# Function to count unique non-NaN values | |
def count_unique_champions(row): | |
# Get all values that are not NaN | |
valid_champions = row[existing_columns].dropna() | |
# Count unique values | |
return len(set(valid_champions)) | |
# Calculate the score for each row | |
df['champ_variety_score'] = df.apply(count_unique_champions, axis=1) | |
return df | |
def calculate_playstyle(df): | |
df = df.copy() | |
# Playstyle categorization (0-5) | |
conditions = [ | |
# 0: Assassin/Carry (high kills, high KDA, high kill participation) | |
(df['avg_kills'] > df['avg_assists']) & | |
(df['kda_ratio_profile'] > 3) & | |
(df['kill_participation_profile'] > 0.6), | |
# 1: Support/Utility (high assists, good KDA, high kill participation) | |
(df['avg_assists'] > df['avg_kills']) & | |
(df['kda_ratio_profile'] > 2.5) & | |
(df['kill_participation_profile'] > 0.55), | |
# 2: Tank/Initiator (moderate deaths, high assists, high kill participation) | |
(df['avg_deaths'] > 3) & | |
(df['avg_assists'] > df['avg_kills']) & | |
(df['kill_participation_profile'] > 0.5), | |
# 3: Split-pusher (lower kill participation, good KDA) | |
(df['kill_participation_profile'] < 0.5) & | |
(df['kda_ratio_profile'] > 2), | |
# 4: Aggressive/Fighter (high kills and deaths, high kill participation) | |
(df['avg_kills'] > 3) & | |
(df['avg_deaths'] > 4) & | |
(df['kill_participation_profile'] > 0.55) | |
] | |
values = [0, 1, 2, 3, 4] # Numeric values for each playstyle | |
df['playstyle'] = np.select(conditions, values, default=5) | |
return df | |
def get_most_role_3(df): | |
df = df.copy() | |
# Role mapping | |
role_mapping = { | |
'TOP': 1, | |
'MID': 2, | |
'ADC': 3, | |
'JUNGLE': 4, | |
'SUPPORT': 5 | |
} | |
def get_third_role_info(row): | |
# Create dictionary of role values excluding most_role_1 and most_role_2 | |
role_values = { | |
'TOP': row['TOP'], | |
'JUNGLE': row['JUNGLE'], | |
'MID': row['MID'], | |
'ADC': row['ADC'], | |
'SUPPORT': row['SUPPORT'] | |
} | |
# Remove most_role_1 and most_role_2 from consideration | |
role_values.pop(row['most_role_1'], None) | |
role_values.pop(row['most_role_2'], None) | |
# Find highest remaining role and its value | |
if role_values: | |
third_role, third_value = max(role_values.items(), key=lambda x: x[1]) | |
return role_mapping[third_role], third_value | |
return 0, 0.0 # Default values if no third role found | |
# Add both most_role_3 and most_role_3_value | |
df[['most_role_3', 'most_role_3_value']] = df.apply(get_third_role_info, axis=1, result_type='expand') | |
return df | |
def calculate_role_specialization(df): | |
df = df.copy() | |
# Define conditions for role specialization | |
conditions = [ | |
# 0: Pure Specialist (one dominant role) | |
(df['most_role_1_value'] > 0.6), | |
# 1: Strong Dual Role (two significant roles) | |
(df['most_role_1_value'] <= 0.6) & | |
(df['most_role_2_value'] >= 0.3), | |
# 2: Primary Role with Backups (moderate first role, has backups) | |
(df['most_role_1_value'] <= 0.6) & | |
(df['most_role_2_value'] < 0.3) & | |
(df['most_role_1_value'] > 0.3) & | |
(df['most_role_3_value'] > 0.1), # Has a viable third role | |
# 3: Role Swapper (moderate first role, low others) | |
(df['most_role_1_value'] <= 0.6) & | |
(df['most_role_2_value'] < 0.3) & | |
(df['most_role_1_value'] > 0.3) & | |
(df['most_role_3_value'] <= 0.1), # No viable third role | |
# 4: True Flex (plays multiple roles evenly) | |
(df['most_role_1_value'] <= 0.3) & | |
(df['most_role_1_value'] > 0) & | |
(df['most_role_3_value'] >= 0.15) # Significant third role | |
] | |
# 5 will be No Preference/Undefined (very low values or missing data) | |
values = [0, 1, 2, 3, 4] # Numeric values for each category | |
df['role_specialization'] = np.select(conditions, values, default=5) | |
return df | |
def calculate_champion_loyalty(df): | |
df = df.copy() | |
#print("df.dtypes: \n", df.dtypes, "\n") | |
def get_loyalty_scores(row): | |
print("========================== Inside: get_loyalty_scores() ===================\n") | |
try: | |
# Get champions lists, handle potential NaN/None values (only top 2) | |
recent_champs = [ | |
row['most_champ_1'] if pd.notna(row['most_champ_1']) else None, | |
row['most_champ_2'] if pd.notna(row['most_champ_2']) else None | |
] | |
# Include all 7 season champions | |
season_champs = [] | |
season_games = [] | |
for i in range(1, 8): | |
champ = row[f'season_champ_{i}'] if pd.notna(row[f'season_champ_{i}']) else None | |
games = row[f'games_ssn_{i}'] if pd.notna(row[f'games_ssn_{i}']) else 0 | |
if champ is not None: | |
season_champs.append(champ) | |
season_games.append(games) | |
# Add individual champion loyalty flags (only top 2) | |
champ_loyalty_flags = { | |
'recent_champ_1_loyal': 1 if (pd.notna(row['most_champ_1']) and | |
row['most_champ_1'] in season_champs) else 0, | |
'recent_champ_2_loyal': 1 if (pd.notna(row['most_champ_2']) and | |
row['most_champ_2'] in season_champs) else 0 | |
} | |
# Remove None values from recent champions | |
recent_champs = [c for c in recent_champs if c is not None] | |
# If no valid champions, return defaults | |
if not recent_champs or not season_champs: | |
return { | |
'loyalty_score': 0, | |
'confidence_score': 0, | |
**champ_loyalty_flags | |
} | |
recent_games = [ | |
(int(row['W_1']) + int(row['L_1'])) if pd.notna(row['most_champ_1']) else 0, | |
(int(row['W_2']) + int(row['L_2'])) if pd.notna(row['most_champ_2']) else 0 | |
] | |
season_games = [int(x) if isinstance(x, str) and x.isdigit() else 0 for x in season_games] | |
#print(f"recent_games was: {recent_games}, types: {[type(x) for x in recent_games]}") | |
#print(f"season_games was: {season_games}, types: {[type(x) for x in season_games]}") | |
#print("\nSumming recent games... \n") | |
total_recent_games = sum(recent_games) | |
#print("total_recent_games: ", total_recent_games, "\n") | |
total_season_games = sum(season_games) | |
#print("total_season_games: ", total_season_games, "\n") | |
#print("End of summing recent games... \n Total recent_games = ", total_recent_games, "\n total_season_games: ", total_season_games, "\n \n \n") | |
if total_recent_games == 0: | |
return { | |
'loyalty_score': 0, | |
'confidence_score': 0, | |
**champ_loyalty_flags | |
} | |
# Calculate overlap score with enhanced weights | |
loyalty_score = 0 | |
for idx, champ in enumerate(recent_champs): | |
if champ in season_champs: | |
season_idx = season_champs.index(champ) | |
recent_weight = recent_games[idx] / total_recent_games | |
season_weight = season_games[season_idx] / total_season_games | |
position_weight = 1.7 if idx == 0 else 1.3 # Adjusted weights for 2 champions | |
seasonal_position_weight = 1.3 if season_idx < 3 else 1.0 | |
combined_weight = ( | |
recent_weight * 0.6 + | |
season_weight * 0.4 | |
) * position_weight * seasonal_position_weight | |
loyalty_score += combined_weight | |
#print("Start calculate confidence score...\n") | |
# Calculate confidence score (adjusted for 2 champions) | |
confidence_score = 0 | |
confidence_score += 0.5 if pd.notna(row['most_champ_1']) else 0 # Increased weight for main | |
confidence_score += 0.2 if pd.notna(row['most_champ_2']) else 0 # Increased weight for second | |
confidence_score += sum(0.1 for i in range(1, 4) if pd.notna(row[f'season_champ_{i}'])) | |
confidence_score += sum(0.05 for i in range(4, 8) if pd.notna(row[f'season_champ_{i}'])) | |
#print("...END calculate confidence score\n") | |
recent_games = sum((row[f'W_{i}'] + row[f'L_{i}']) if pd.notna(row[f'most_champ_{i}']) else 0 | |
for i in range(1, 3)) # Only top 2 | |
confidence_score += min(0.1, recent_games / 100) | |
print(f"loyalty_score, confidence score: [{loyalty_score}], [{confidence_score}] \n") | |
print("===================== exiting: get_loyalty_scores()===================") | |
return { | |
'loyalty_score': round(min(loyalty_score, 1.0), 3), | |
'confidence_score': round(min(confidence_score, 1.0), 3), | |
**champ_loyalty_flags | |
} | |
except Exception as e: | |
print(f"Error calculating loyalty scores: {e}") | |
return { | |
'loyalty_score': 0, | |
'confidence_score': 0, | |
'recent_champ_1_loyal': 0, | |
'recent_champ_2_loyal': 0 | |
} | |
# Apply calculations and expand results to columns | |
results = df.apply(get_loyalty_scores, axis=1) | |
# Convert results to new columns | |
df['champion_loyalty_score'] = results.apply(lambda x: x['loyalty_score']) | |
df['loyalty_confidence_score'] = results.apply(lambda x: x['confidence_score']) | |
df['recent_champ_1_loyal'] = results.apply(lambda x: x['recent_champ_1_loyal']) | |
df['recent_champ_2_loyal'] = results.apply(lambda x: x['recent_champ_2_loyal']) | |
return df | |
def optimize_feature_dtypes(df): | |
""" | |
Optimize data types for feature columns using unsigned integers for non-negative values | |
""" | |
df = df.copy() | |
# Very small range integers (< 10 unique values) to uint8 (0 to 255) | |
category_cols = { | |
'region': 4, # 4 unique values | |
'team': 2, # 2 unique values | |
'champ_variety_score': 6, # 6 unique values | |
'playstyle': 6, # 6 unique values | |
'most_role_1': 5, # 5 unique values | |
'most_role_2': 5, # 5 unique values | |
'most_role_3': 5, # 5 unique values | |
'role_specialization': 5, # 5 unique values | |
'recent_champ_1_loyal':2, # 2 unique values | |
'recent_champ_2_loyal':2 # 2 unique values | |
} | |
for col, n_unique in category_cols.items(): | |
if col in df.columns: | |
if df[col].isna().any(): | |
# For columns with NaN, ensure proper handling | |
df[col] = df[col].astype('category') | |
# Fill NaN with a new category if needed | |
df[col] = df[col].cat.add_categories(['Unknown']).fillna('Unknown') | |
else: | |
df[col] = df[col].astype('category') # Regular unsigned integer | |
# Medium range integers (< 200 unique values) to UInt8 (0 to 255) | |
champion_cols = [ | |
'champion', # 168 unique | |
'team_champ1', # 149 unique | |
'team_champ2', # 154 unique | |
'team_champ3', # 143 unique | |
'team_champ4', # 140 unique | |
'opp_champ1', # 144 unique | |
'opp_champ2', # 82 unique | |
'opp_champ3', # 145 unique | |
'opp_champ4', # 119 unique | |
'opp_champ5', # 110 unique | |
'most_champ_1', # 138 unique | |
'most_champ_2', # 134 unique | |
'season_champ1', # 139 unique | |
'season_champ2', # 129 unique | |
'season_champ3', # 132 unique | |
'1_champ_name', # 114 unique | |
'2_champ_name', # 114 unique | |
'3_champ_name', # 112 unique | |
'4_champ_name', # 111 unique | |
'5_champ_name' # 113 unique | |
] | |
for col in champion_cols: | |
if col in df.columns: | |
df[col] = df[col].astype('UInt8') # All champion IDs can fit in UInt8 | |
# Float32 columns (performance metrics and ratios) | |
float32_cols = [ | |
'most_role_1_value', # 15 unique | |
'most_role_2_value', # 11 unique | |
'most_role_3_value', # 15 unique | |
'avg_kills', # 92 unique | |
'avg_deaths', # 58 unique | |
'avg_assists', # 132 unique | |
'kda_ratio_profile', # 286 unique | |
'kill_participation_profile', # 37 unique | |
'WR_1', # 64 unique | |
'WR_2', # 23 unique | |
'WR_3', # 10 unique | |
'champion_loyalty_score', # 156 unique | |
'loyalty_confidence_score' # 5 unique | |
] | |
for col in float32_cols: | |
if col in df.columns: | |
df[col] = df[col].astype('float32') | |
return df | |
def remove_unwanted_columns(df): | |
""" | |
Removes specified columns from the DataFrame | |
Args: | |
df (pd.DataFrame): Input DataFrame | |
Returns: | |
pd.DataFrame: DataFrame with specified columns removed | |
""" | |
df = df.copy() | |
# Define columns to remove | |
columns_to_remove = ( | |
# Time and basic stats | |
['date'] + | |
['total_games', 'wins', 'losses', 'win_rate'] + | |
['WR_1', 'WR_2', 'WR_3'] + | |
['most_champ_3'] + | |
['W_1', 'L_1', 'KDA_1', 'W_2', 'L_2', 'KDA_2', 'W_3', 'L_3', 'KDA_3'] + | |
# Roles | |
['TOP', 'JUNGLE', 'MID', 'ADC', 'SUPPORT'] + | |
# Season and weekly stats | |
['cs_ssn_1', 'cpm_ssn_1', 'kda_ssn_1', 'k_ssn_1', 'd_ssn_1', 'a_ssn_1', 'wr_ssn_1', 'games_ssn_1', | |
'cs_ssn_2', 'cpm_ssn_2', 'kda_ssn_2', 'k_ssn_2', 'd_ssn_2', 'a_ssn_2', 'wr_ssn_2', 'games_ssn_2', | |
'cs_ssn_3', 'cpm_ssn_3', 'kda_ssn_3', 'k_ssn_3', 'd_ssn_3', 'a_ssn_3', 'wr_ssn_3', 'games_ssn_3', | |
'season_champ_4', 'cs_ssn_4', 'cpm_ssn_4', 'kda_ssn_4', 'k_ssn_4', 'd_ssn_4', 'a_ssn_4', 'wr_ssn_4', 'games_ssn_4', | |
'season_champ_5', 'cs_ssn_5', 'cpm_ssn_5', 'kda_ssn_5', 'k_ssn_5', 'd_ssn_5', 'a_ssn_5', 'wr_ssn_5', 'games_ssn_5', | |
'season_champ_6', 'cs_ssn_6', 'cpm_ssn_6', 'kda_ssn_6', 'k_ssn_6', 'd_ssn_6', 'a_ssn_6', 'wr_ssn_6', 'games_ssn_6', | |
'season_champ_7', 'cs_ssn_7', 'cpm_ssn_7', 'kda_ssn_7', 'k_ssn_7', 'd_ssn_7', 'a_ssn_7', 'wr_ssn_7', 'games_ssn_7'] + | |
# Weekly stats | |
['7d_champ_1', '7d_total_1', '7d_WR_1', '7d_champ_2', '7d_total_2', '7d_WR_2', | |
'7d_champ_3', '7d_total_3', '7d_WR_3'] + | |
['7d_W_1', '7d_L_1', '7d_W_2', '7d_L_2', '7d_W_3', '7d_L_3'] + | |
# Mastery stats | |
['mastery_champ_1', 'm_lv_1', 'mastery_champ_2', 'm_lv_2', 'mastery_champ_3', 'm_lv_3', | |
'mastery_champ_4', 'm_lv_4', 'mastery_champ_5', 'm_lv_5', 'mastery_champ_6', 'm_lv_6', | |
'mastery_champ_7', 'm_lv_7', 'mastery_champ_8', 'm_lv_8', 'mastery_champ_9', 'm_lv_9', | |
'mastery_champ_10', 'm_lv_10', 'mastery_champ_11', 'm_lv_11', 'mastery_champ_12', 'm_lv_12', | |
'mastery_champ_13', 'm_lv_13', 'mastery_champ_14', 'm_lv_14', 'mastery_champ_15', 'm_lv_15', | |
'mastery_champ_16', 'm_lv_16'] + | |
# Champion scores and others | |
['1_champ_score', '2_champ_score', '3_champ_score', '4_champ_score', '5_champ_score'] + | |
['avg_tier', 'team'] + | |
# Champions individual score | |
["Aatrox", "Ahri", "Akali", "Akshan", "Alistar", "Ambessa", "Amumu", "Anivia", "Annie", "Aphelios", | |
"Ashe", "Aurelion Sol", "Aurora", "Azir", "Bard", "Bel'Veth", "Blitzcrank", "Brand", "Braum", | |
"Briar", "Caitlyn", "Camille", "Cassiopeia", "Cho'Gath", "Corki", "Darius", "Diana", "Dr. Mundo", | |
"Draven", "Ekko", "Elise", "Evelynn", "Ezreal", "Fiddlesticks", "Fiora", "Fizz", "Galio", | |
"Gangplank", "Garen", "Gnar", "Gragas", "Graves", "Gwen", "Hecarim", "Heimerdinger", "Hwei", | |
"Illaoi", "Irelia", "Ivern", "Janna", "Jarvan IV", "Jax", "Jayce", "Jhin", "Jinx", "K'Sante", | |
"Kai'Sa", "Kalista", "Karma", "Karthus", "Kassadin", "Katarina", "Kayle", "Kayn", "Kennen", | |
"Kha'Zix", "Kindred", "Kled", "Kog'Maw", "LeBlanc", "Lee Sin", "Leona", "Lillia", "Lissandra", | |
"Lucian", "Lulu", "Lux", "Malphite", "Malzahar", "Maokai", "Master Yi", "Milio", "Miss Fortune", | |
"Mordekaiser", "Morgana", "Naafiri", "Nami", "Nasus", "Nautilus", "Neeko", "Nidalee", "Nilah", | |
"Nocturne", "Nunu & Willump", "Olaf", "Orianna", "Ornn", "Pantheon", "Poppy", "Pyke", "Qiyana", | |
"Quinn", "Rakan", "Rammus", "Rek'Sai", "Rell", "Renata Glasc", "Renekton", "Rengar", "Riven", | |
"Rumble", "Ryze", "Samira", "Sejuani", "Senna", "Seraphine", "Sett", "Shaco", "Shen", "Shyvana", | |
"Singed", "Sion", "Sivir", "Skarner", "Smolder", "Sona", "Soraka", "Swain", "Sylas", "Syndra", | |
"Tahm Kench", "Taliyah", "Talon", "Taric", "Teemo", "Thresh", "Tristana", "Trundle", "Tryndamere", | |
"Twisted Fate", "Twitch", "Udyr", "Urgot", "Varus", "Vayne", "Veigar", "Vel'Koz", "Vex", "Vi", | |
"Viego", "Viktor", "Vladimir", "Volibear", "Warwick", "Wukong", "Xayah", "Xerath", "Xin Zhao", | |
"Yasuo", "Yone", "Yorick", "Yuumi", "Zac", "Zed", "Zeri", "Ziggs", "Zilean", "Zoe", "Zyra"] | |
) | |
# Remove columns that exist in the DataFrame | |
columns_to_remove = [col for col in columns_to_remove if col in df.columns] | |
# Drop the columns | |
df = df.drop(columns=columns_to_remove) | |
# Print info about removed columns | |
print(f"Removed {len(columns_to_remove)} columns") | |
print(f"Remaining columns: {len(df.columns)}") | |
return df | |
def apply_feature_engineering(df, n=5): | |
""" | |
Performs feature engineering pipeline | |
""" | |
df = df.copy() | |
# Engineering pipeline | |
transformations = [ | |
calculate_champ_variety_score, | |
calculate_playstyle, | |
get_most_role_3, | |
calculate_role_specialization, | |
calculate_champion_loyalty, | |
lambda x: get_top_champion_scores(x, n), # Add top 5 champions | |
remove_unwanted_columns, | |
optimize_feature_dtypes | |
] | |
for transform in transformations: | |
try: | |
print(f"Applying {transform.__name__}...") | |
df = transform(df) | |
except Exception as e: | |
print(f"Error in {transform.__name__}: {str(e)}") | |
raise | |
return df | |