James McCool committed on
Commit · dd908a8
1 Parent(s): 42199ca

Add player similarity score calculation to predict_dupes function

This update introduces a new function, calculate_player_similarity_score, which computes a similarity score for each lineup based on player selections (not ownership values). Scores are normalized to a 0-1 scale, where higher values indicate more unique lineups. Additionally, player_columns is now defined dynamically from the portfolio structure to ensure accurate processing of player data.

- global_func/predict_dupes.py: +69 -0
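As a quick illustration of the new function (added in the diff below), here is a minimal usage sketch; the roster columns and player names are hypothetical, chosen only for illustration:

import pandas as pd
from global_func.predict_dupes import calculate_player_similarity_score

# Three toy lineups; in practice the portfolio carries one column per roster slot.
portfolio = pd.DataFrame({
    'CPT':   ['Mahomes', 'Mahomes', 'Allen'],
    'FLEX1': ['Kelce',   'Hill',    'Diggs'],
})

scores = calculate_player_similarity_score(portfolio, ['CPT', 'FLEX1'])
# The third lineup shares no players with the others, so after min-max
# normalization it scores 1.0 (most unique); the first two score 0.0.
print(scores)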
global_func/predict_dupes.py CHANGED

@@ -37,6 +37,67 @@ def calculate_weighted_ownership(row_ownerships):
     # Convert back to percentage form to match input format
     return weighted * 10000

+def calculate_player_similarity_score(portfolio, player_columns):
+    """
+    Calculate a similarity score that measures how different each row is from all other rows
+    based on actual player selection (not ownership values).
+    Higher scores indicate more unique/different lineups.
+
+    Args:
+        portfolio: DataFrame containing the portfolio data
+        player_columns: List of column names containing player names
+
+    Returns:
+        Series: Similarity scores for each row
+    """
+    # Extract player data and create a matrix where each row represents a lineup
+    player_data = portfolio[player_columns].fillna('')
+
+    # Get all unique players across all lineups
+    all_players = set()
+    for col in player_columns:
+        all_players.update(player_data[col].unique())
+    all_players = sorted(list(all_players))
+
+    # Create a binary matrix: 1 if player is in lineup, 0 if not
+    binary_matrix = np.zeros((len(portfolio), len(all_players)))
+
+    for i, row in player_data.iterrows():
+        for j, player in enumerate(all_players):
+            if player in row.values:
+                binary_matrix[i, j] = 1
+
+    # Calculate Jaccard distance between all pairs of lineups
+    # Jaccard distance = 1 - (intersection / union)
+    similarity_scores = []
+
+    for i in range(len(portfolio)):
+        distances = []
+        for j in range(len(portfolio)):
+            if i != j:
+                # Calculate intersection and union
+                intersection = np.sum((binary_matrix[i] == 1) & (binary_matrix[j] == 1))
+                union = np.sum((binary_matrix[i] == 1) | (binary_matrix[j] == 1))
+
+                # Avoid division by zero
+                if union == 0:
+                    jaccard_distance = 1.0  # Completely different if both are empty
+                else:
+                    jaccard_distance = 1 - (intersection / union)
+
+                distances.append(jaccard_distance)
+
+        # Average distance to all other lineups
+        avg_distance = np.mean(distances) if distances else 0
+        similarity_scores.append(avg_distance)
+
+    # Normalize to 0-1 scale where 1 = most unique/different
+    similarity_scores = np.array(similarity_scores)
+    if similarity_scores.max() > similarity_scores.min():
+        similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
+
+    return similarity_scores
+
 def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
     if strength_var == 'Weak':
         dupes_multiplier = .75
@@ -54,6 +115,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, streng
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:5] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -95,6 +157,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, streng
         dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
         own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
         for i in range(1, num_players + 1):
             portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
             portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
@@ -118,6 +181,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, streng
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -162,6 +226,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, streng
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -206,6 +271,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, streng
         dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
         own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
         for i in range(1, num_players + 1):
             portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
             portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
@@ -263,6 +329,9 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, streng
     portfolio['Lineup Edge'] = portfolio['Lineup Edge'] - portfolio['Lineup Edge'].mean()
     portfolio['Weighted Own'] = portfolio[own_columns].apply(calculate_weighted_ownership, axis=1)
     portfolio['Geomean'] = np.power((portfolio[own_columns] * 100).product(axis=1), 1 / len(own_columns))
+
+    portfolio['Similarity Score'] = calculate_player_similarity_score(portfolio, player_columns)
+
     portfolio = portfolio.drop(columns=dup_count_columns)
     portfolio = portfolio.drop(columns=own_columns)
     portfolio = portfolio.drop(columns=calc_columns)
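A note on the design: the pairwise Jaccard loop is O(n²) with Python-level iteration, which can get slow for large portfolios. A minimal vectorized sketch of the same computation, assuming scipy is available (it is not shown as a dependency in this diff):

import numpy as np
from scipy.spatial.distance import pdist, squareform

def similarity_scores_vectorized(binary_matrix):
    # Pairwise Jaccard distances over boolean lineup-membership rows,
    # expanded from condensed to square form.
    dist = squareform(pdist(binary_matrix.astype(bool), metric='jaccard'))
    n = dist.shape[0]
    if n < 2:
        return np.zeros(n)
    # Mean distance to every other lineup (the diagonal is zero).
    avg = dist.sum(axis=1) / (n - 1)
    # Same min-max normalization as the committed function.
    if avg.max() > avg.min():
        avg = (avg - avg.min()) / (avg.max() - avg.min())
    return avg

One behavioral caveat: for two all-empty rows scipy's jaccard metric returns 0.0, while the committed loop treats that case as distance 1.0, so results can differ on degenerate inputs.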