James McCool
Refactor player similarity score calculation in predict_dupes function
1fe4ec0
raw
history blame
22.1 kB
import streamlit as st
import numpy as np
import pandas as pd
import time
from fuzzywuzzy import process
import math
from difflib import SequenceMatcher
def calculate_weighted_ownership(row_ownerships):
"""
Calculate weighted ownership based on the formula:
(AVERAGE of (each value's average with overall average)) * count - (max - min)
Args:
row_ownerships: Series containing ownership values in percentage form (e.g., 24.2213 for 24.2213%)
Returns:
float: Calculated weighted ownership value
"""
# Drop NaN values and convert percentages to decimals
row_ownerships = row_ownerships.dropna() / 100
# Get the mean of all ownership values
row_mean = row_ownerships.mean()
# Calculate average of each value with the overall mean
value_means = [(val + row_mean) / 2 for val in row_ownerships]
# Take average of all those means
avg_of_means = sum(value_means) / len(row_ownerships)
# Multiply by count of values
weighted = avg_of_means * (len(row_ownerships) * 1)
# Subtract (max - min)
weighted = weighted - (row_ownerships.max() - row_ownerships.min())
# Convert back to percentage form to match input format
return weighted * 10000
def calculate_player_similarity_score(portfolio, player_columns):
"""
Calculate a similarity score that measures how different each row is from all other rows
based on actual player selection. Converts players to numeric IDs for faster comparison.
Higher scores indicate more unique/different lineups.
Args:
portfolio: DataFrame containing the portfolio data
player_columns: List of column names containing player names
Returns:
Series: Similarity scores for each row
"""
# Extract player data
player_data = portfolio[player_columns].fillna('')
# Get all unique players and create a mapping to numeric IDs
all_players = set()
for col in player_columns:
unique_vals = player_data[col].unique()
for val in unique_vals:
if isinstance(val, str) and val.strip() != '':
all_players.add(val)
# Create player ID mapping
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
# Convert each row to a list of player IDs
row_ids = []
for _, row in player_data.iterrows():
# Get player IDs for this row, sorted for consistency
player_ids = sorted([player_to_id[str(val)] for val in row.values
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id])
row_ids.append(player_ids)
# Calculate similarity scores using Jaccard distance on player ID sets
similarity_scores = []
for i in range(len(portfolio)):
distances = []
for j in range(len(portfolio)):
if i != j:
# Convert to sets for Jaccard calculation
set_i = set(row_ids[i])
set_j = set(row_ids[j])
# Calculate Jaccard distance
if len(set_i) == 0 and len(set_j) == 0:
# Both lineups are empty
distance = 0.0
elif len(set_i) == 0 or len(set_j) == 0:
# One lineup is empty, other is not
distance = 1.0
else:
# Jaccard distance = 1 - (intersection / union)
intersection = len(set_i & set_j)
union = len(set_i | set_j)
distance = 1 - (intersection / union)
distances.append(distance)
# Average distance to all other lineups
avg_distance = np.mean(distances) if distances else 0
similarity_scores.append(avg_distance)
# Normalize to 0-1 scale where 1 = most unique/different
similarity_scores = np.array(similarity_scores)
if similarity_scores.max() > similarity_scores.min():
similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
return similarity_scores
def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
if strength_var == 'Weak':
dupes_multiplier = .75
percentile_multiplier = .90
elif strength_var == 'Average':
dupes_multiplier = 1.00
percentile_multiplier = 1.00
elif strength_var == 'Sharp':
dupes_multiplier = 1.25
percentile_multiplier = 1.10
max_ownership = max(maps_dict['own_map'].values()) / 100
average_ownership = np.mean(list(maps_dict['own_map'].values())) / 100
if site_var == 'Fanduel':
if type_var == 'Showdown':
dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank']
own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own']
calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
# Get the original player columns (first 5 columns excluding salary, median, Own)
player_columns = [col for col in portfolio.columns[:5] if col not in ['salary', 'median', 'Own']]
flex_ownerships = pd.concat([
portfolio.iloc[:,1].map(maps_dict['own_map']),
portfolio.iloc[:,2].map(maps_dict['own_map']),
portfolio.iloc[:,3].map(maps_dict['own_map']),
portfolio.iloc[:,4].map(maps_dict['own_map'])
])
flex_rank = flex_ownerships.rank(pct=True)
# Assign ranks back to individual columns using the same rank scale
portfolio['CPT_Own_percent_rank'] = portfolio.iloc[:,0].map(maps_dict['cpt_own_map']).rank(pct=True)
portfolio['FLEX1_Own_percent_rank'] = portfolio.iloc[:,1].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX2_Own_percent_rank'] = portfolio.iloc[:,2].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX3_Own_percent_rank'] = portfolio.iloc[:,3].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX4_Own_percent_rank'] = portfolio.iloc[:,4].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['CPT_Own'] = portfolio.iloc[:,0].map(maps_dict['cpt_own_map']) / 100
portfolio['FLEX1_Own'] = portfolio.iloc[:,1].map(maps_dict['own_map']) / 100
portfolio['FLEX2_Own'] = portfolio.iloc[:,2].map(maps_dict['own_map']) / 100
portfolio['FLEX3_Own'] = portfolio.iloc[:,3].map(maps_dict['own_map']) / 100
portfolio['FLEX4_Own'] = portfolio.iloc[:,4].map(maps_dict['own_map']) / 100
portfolio['own_product'] = (portfolio[own_columns].product(axis=1))
portfolio['own_average'] = (portfolio['Own'].max() * .33) / 100
portfolio['own_sum'] = portfolio[own_columns].sum(axis=1)
portfolio['avg_own_rank'] = portfolio[dup_count_columns].mean(axis=1)
# Calculate dupes formula
portfolio['dupes_calc'] = (portfolio['own_product'] * portfolio['avg_own_rank']) * Contest_Size + ((portfolio['salary'] - (60000 - portfolio['Own'])) / 100) - ((60000 - portfolio['salary']) / 100)
portfolio['dupes_calc'] = portfolio['dupes_calc'] * dupes_multiplier
# Round and handle negative values
portfolio['Dupes'] = np.where(
np.round(portfolio['dupes_calc'], 0) <= 0,
0,
np.round(portfolio['dupes_calc'], 0) - 1
)
elif type_var == 'Classic':
num_players = len([col for col in portfolio.columns if col not in ['salary', 'median', 'Own']])
dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
# Get the original player columns (first num_players columns excluding salary, median, Own)
player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
for i in range(1, num_players + 1):
portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
portfolio['own_product'] = (portfolio[own_columns].product(axis=1))
portfolio['own_average'] = (portfolio['Own'].max() * .33) / 100
portfolio['own_sum'] = portfolio[own_columns].sum(axis=1)
portfolio['avg_own_rank'] = portfolio[dup_count_columns].mean(axis=1)
portfolio['dupes_calc'] = (portfolio['own_product'] * portfolio['avg_own_rank']) * Contest_Size + ((portfolio['salary'] - (60000 - portfolio['Own'])) / 100) - ((60000 - portfolio['salary']) / 100)
portfolio['dupes_calc'] = portfolio['dupes_calc'] * dupes_multiplier
# Round and handle negative values
portfolio['Dupes'] = np.where(
np.round(portfolio['dupes_calc'], 0) <= 0,
0,
np.round(portfolio['dupes_calc'], 0) - 1
)
elif site_var == 'Draftkings':
if type_var == 'Showdown':
dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
# Get the original player columns (first 6 columns excluding salary, median, Own)
player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
flex_ownerships = pd.concat([
portfolio.iloc[:,1].map(maps_dict['own_map']),
portfolio.iloc[:,2].map(maps_dict['own_map']),
portfolio.iloc[:,3].map(maps_dict['own_map']),
portfolio.iloc[:,4].map(maps_dict['own_map']),
portfolio.iloc[:,5].map(maps_dict['own_map'])
])
flex_rank = flex_ownerships.rank(pct=True)
# Assign ranks back to individual columns using the same rank scale
portfolio['CPT_Own_percent_rank'] = portfolio.iloc[:,0].map(maps_dict['cpt_own_map']).rank(pct=True)
portfolio['FLEX1_Own_percent_rank'] = portfolio.iloc[:,1].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX2_Own_percent_rank'] = portfolio.iloc[:,2].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX3_Own_percent_rank'] = portfolio.iloc[:,3].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX4_Own_percent_rank'] = portfolio.iloc[:,4].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX5_Own_percent_rank'] = portfolio.iloc[:,5].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['CPT_Own'] = portfolio.iloc[:,0].map(maps_dict['cpt_own_map']) / 100
portfolio['FLEX1_Own'] = portfolio.iloc[:,1].map(maps_dict['own_map']) / 100
portfolio['FLEX2_Own'] = portfolio.iloc[:,2].map(maps_dict['own_map']) / 100
portfolio['FLEX3_Own'] = portfolio.iloc[:,3].map(maps_dict['own_map']) / 100
portfolio['FLEX4_Own'] = portfolio.iloc[:,4].map(maps_dict['own_map']) / 100
portfolio['FLEX5_Own'] = portfolio.iloc[:,5].map(maps_dict['own_map']) / 100
portfolio['own_product'] = (portfolio[own_columns].product(axis=1))
portfolio['own_average'] = (portfolio['Own'].max() * .33) / 100
portfolio['own_sum'] = portfolio[own_columns].sum(axis=1)
portfolio['avg_own_rank'] = portfolio[dup_count_columns].mean(axis=1)
# Calculate dupes formula
portfolio['dupes_calc'] = (portfolio['own_product'] * portfolio['avg_own_rank']) * Contest_Size + ((portfolio['salary'] - (50000 - portfolio['Own'])) / 100) - ((50000 - portfolio['salary']) / 100)
portfolio['dupes_calc'] = portfolio['dupes_calc'] * dupes_multiplier
# Round and handle negative values
portfolio['Dupes'] = np.where(
np.round(portfolio['dupes_calc'], 0) <= 0,
0,
np.round(portfolio['dupes_calc'], 0) - 1
)
elif type_var == 'Classic':
if sport_var == 'CS2':
dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
# Get the original player columns (first 6 columns excluding salary, median, Own)
player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
flex_ownerships = pd.concat([
portfolio.iloc[:,1].map(maps_dict['own_map']),
portfolio.iloc[:,2].map(maps_dict['own_map']),
portfolio.iloc[:,3].map(maps_dict['own_map']),
portfolio.iloc[:,4].map(maps_dict['own_map']),
portfolio.iloc[:,5].map(maps_dict['own_map'])
])
flex_rank = flex_ownerships.rank(pct=True)
# Assign ranks back to individual columns using the same rank scale
portfolio['CPT_Own_percent_rank'] = portfolio.iloc[:,0].map(maps_dict['cpt_own_map']).rank(pct=True)
portfolio['FLEX1_Own_percent_rank'] = portfolio.iloc[:,1].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX2_Own_percent_rank'] = portfolio.iloc[:,2].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX3_Own_percent_rank'] = portfolio.iloc[:,3].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX4_Own_percent_rank'] = portfolio.iloc[:,4].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['FLEX5_Own_percent_rank'] = portfolio.iloc[:,5].map(maps_dict['own_map']).map(lambda x: flex_rank[flex_ownerships == x].iloc[0])
portfolio['CPT_Own'] = portfolio.iloc[:,0].map(maps_dict['cpt_own_map']) / 100
portfolio['FLEX1_Own'] = portfolio.iloc[:,1].map(maps_dict['own_map']) / 100
portfolio['FLEX2_Own'] = portfolio.iloc[:,2].map(maps_dict['own_map']) / 100
portfolio['FLEX3_Own'] = portfolio.iloc[:,3].map(maps_dict['own_map']) / 100
portfolio['FLEX4_Own'] = portfolio.iloc[:,4].map(maps_dict['own_map']) / 100
portfolio['FLEX5_Own'] = portfolio.iloc[:,5].map(maps_dict['own_map']) / 100
portfolio['own_product'] = (portfolio[own_columns].product(axis=1))
portfolio['own_average'] = (portfolio['Own'].max() * .33) / 100
portfolio['own_sum'] = portfolio[own_columns].sum(axis=1)
portfolio['avg_own_rank'] = portfolio[dup_count_columns].mean(axis=1)
# Calculate dupes formula
portfolio['dupes_calc'] = (portfolio['own_product'] * portfolio['avg_own_rank']) * Contest_Size + ((portfolio['salary'] - (50000 - portfolio['Own'])) / 100) - ((50000 - portfolio['salary']) / 100)
portfolio['dupes_calc'] = portfolio['dupes_calc'] * dupes_multiplier
# Round and handle negative values
portfolio['Dupes'] = np.where(
np.round(portfolio['dupes_calc'], 0) <= 0,
0,
np.round(portfolio['dupes_calc'], 0) - 1
)
elif sport_var != 'CS2':
num_players = len([col for col in portfolio.columns if col not in ['salary', 'median', 'Own']])
dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
# Get the original player columns (first num_players columns excluding salary, median, Own)
player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
for i in range(1, num_players + 1):
portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
portfolio['own_product'] = (portfolio[own_columns].product(axis=1))
portfolio['own_average'] = (portfolio['Own'].max() * .33) / 100
portfolio['own_sum'] = portfolio[own_columns].sum(axis=1)
portfolio['avg_own_rank'] = portfolio[dup_count_columns].mean(axis=1)
portfolio['dupes_calc'] = (portfolio['own_product'] * portfolio['avg_own_rank']) * Contest_Size + ((portfolio['salary'] - (50000 - portfolio['Own'])) / 100) - ((50000 - portfolio['salary']) / 100)
portfolio['dupes_calc'] = portfolio['dupes_calc'] * dupes_multiplier
# Round and handle negative values
portfolio['Dupes'] = np.where(
np.round(portfolio['dupes_calc'], 0) <= 0,
0,
np.round(portfolio['dupes_calc'], 0) - 1
)
portfolio['Dupes'] = np.round(portfolio['Dupes'], 0)
portfolio['own_ratio'] = np.where(
portfolio[own_columns].isin([max_ownership]).any(axis=1),
portfolio['own_sum'] / portfolio['own_average'],
(portfolio['own_sum'] - max_ownership) / portfolio['own_average']
)
percentile_cut_scalar = portfolio['median'].max() # Get scalar value
if type_var == 'Classic':
if sport_var == 'CS2':
own_ratio_nerf = 2
elif sport_var != 'CS2':
own_ratio_nerf = 1.5
elif type_var == 'Showdown':
own_ratio_nerf = 1.5
portfolio['Finish_percentile'] = portfolio.apply(
lambda row: .0005 if (row['own_ratio'] - own_ratio_nerf) / ((10 * (row['median'] / percentile_cut_scalar)) / 2) < .0005
else (row['own_ratio'] - own_ratio_nerf) / ((10 * (row['median'] / percentile_cut_scalar)) / 2),
axis=1
)
portfolio['Ref_Proj'] = portfolio['median'].max()
portfolio['Max_Proj'] = portfolio['Ref_Proj'] + 10
portfolio['Min_Proj'] = portfolio['Ref_Proj'] - 10
portfolio['Avg_Ref'] = (portfolio['Max_Proj'] + portfolio['Min_Proj']) / 2
portfolio['Win%'] = (((portfolio['median'] / portfolio['Avg_Ref']) - (0.1 + ((portfolio['Ref_Proj'] - portfolio['median'])/100))) / (Contest_Size / 1000)) / 10
max_allowed_win = (1 / Contest_Size) * 5
portfolio['Win%'] = portfolio['Win%'] / portfolio['Win%'].max() * max_allowed_win
portfolio['Finish_percentile'] = portfolio['Finish_percentile'] + .005 + (.005 * (Contest_Size / 10000))
portfolio['Finish_percentile'] = portfolio['Finish_percentile'] * percentile_multiplier
portfolio['Win%'] = portfolio['Win%'] * (1 - portfolio['Finish_percentile'])
portfolio['low_own_count'] = portfolio[own_columns].apply(lambda row: (row < 0.10).sum(), axis=1)
portfolio['Finish_percentile'] = portfolio.apply(lambda row: row['Finish_percentile'] if row['low_own_count'] <= 0 else row['Finish_percentile'] / row['low_own_count'], axis=1)
portfolio['Lineup Edge'] = portfolio['Win%'] * ((.5 - portfolio['Finish_percentile']) * (Contest_Size / 2.5))
portfolio['Lineup Edge'] = portfolio.apply(lambda row: row['Lineup Edge'] / (row['Dupes'] + 1) if row['Dupes'] > 0 else row['Lineup Edge'], axis=1)
portfolio['Lineup Edge'] = portfolio['Lineup Edge'] - portfolio['Lineup Edge'].mean()
portfolio['Weighted Own'] = portfolio[own_columns].apply(calculate_weighted_ownership, axis=1)
portfolio['Geomean'] = np.power((portfolio[own_columns] * 100).product(axis=1), 1 / len(own_columns))
# Calculate similarity score based on actual player selection
portfolio['Similarity Score'] = calculate_player_similarity_score(portfolio, player_columns)
portfolio = portfolio.drop(columns=dup_count_columns)
portfolio = portfolio.drop(columns=own_columns)
portfolio = portfolio.drop(columns=calc_columns)
return portfolio