James McCool committed
Commit dd908a8 · 1 Parent(s): 42199ca

Add player similarity score calculation to predict_dupes function


This update introduces a new function, calculate_player_similarity_score, which computes a similarity score for each lineup based on its player selections. Scores are normalized to a 0-1 scale, where higher values indicate more unique lineups. In addition, player_columns is now defined dynamically from the portfolio's leading columns in each site/type branch, excluding non-player columns such as salary, median, and Own, so the similarity calculation operates only on actual player selections.
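For intuition: each lineup's score is its average Jaccard distance to every other lineup in the portfolio, min-max normalized afterward. A minimal sketch of the underlying distance on two hypothetical five-player lineups (player names are invented for illustration, not taken from the repository):

# Jaccard distance = 1 - |intersection| / |union|, as used in the new function
lineup_a = {'Player A', 'Player B', 'Player C', 'Player D', 'Player E'}
lineup_b = {'Player A', 'Player B', 'Player C', 'Player F', 'Player G'}

shared = len(lineup_a & lineup_b)   # 3 players appear in both lineups
total = len(lineup_a | lineup_b)    # 7 distinct players across the pair
distance = 1 - shared / total       # 1 - 3/7, roughly 0.571

print(round(distance, 3))           # 0.571

Two identical lineups would score 0.0 and two lineups sharing no players would score 1.0, which is what makes the averaged distance a usable uniqueness measure.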

Files changed (1)
  1. global_func/predict_dupes.py +69 -0
global_func/predict_dupes.py CHANGED
@@ -37,6 +37,67 @@ def calculate_weighted_ownership(row_ownerships):
     # Convert back to percentage form to match input format
     return weighted * 10000
 
+def calculate_player_similarity_score(portfolio, player_columns):
+    """
+    Calculate a similarity score that measures how different each row is from all other rows
+    based on actual player selection (not ownership values).
+    Higher scores indicate more unique/different lineups.
+
+    Args:
+        portfolio: DataFrame containing the portfolio data
+        player_columns: List of column names containing player names
+
+    Returns:
+        Series: Similarity scores for each row
+    """
+    # Extract player data and create a matrix where each row represents a lineup
+    player_data = portfolio[player_columns].fillna('')
+
+    # Get all unique players across all lineups
+    all_players = set()
+    for col in player_columns:
+        all_players.update(player_data[col].unique())
+    all_players = sorted(list(all_players))
+
+    # Create a binary matrix: 1 if player is in lineup, 0 if not
+    binary_matrix = np.zeros((len(portfolio), len(all_players)))
+
+    for i, row in player_data.iterrows():
+        for j, player in enumerate(all_players):
+            if player in row.values:
+                binary_matrix[i, j] = 1
+
+    # Calculate Jaccard distance between all pairs of lineups
+    # Jaccard distance = 1 - (intersection / union)
+    similarity_scores = []
+
+    for i in range(len(portfolio)):
+        distances = []
+        for j in range(len(portfolio)):
+            if i != j:
+                # Calculate intersection and union
+                intersection = np.sum((binary_matrix[i] == 1) & (binary_matrix[j] == 1))
+                union = np.sum((binary_matrix[i] == 1) | (binary_matrix[j] == 1))
+
+                # Avoid division by zero
+                if union == 0:
+                    jaccard_distance = 1.0  # Completely different if both are empty
+                else:
+                    jaccard_distance = 1 - (intersection / union)
+
+                distances.append(jaccard_distance)
+
+        # Average distance to all other lineups
+        avg_distance = np.mean(distances) if distances else 0
+        similarity_scores.append(avg_distance)
+
+    # Normalize to 0-1 scale where 1 = most unique/different
+    similarity_scores = np.array(similarity_scores)
+    if similarity_scores.max() > similarity_scores.min():
+        similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
+
+    return similarity_scores
+
 def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
     if strength_var == 'Weak':
         dupes_multiplier = .75
@@ -54,6 +115,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:5] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -95,6 +157,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
         own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
         for i in range(1, num_players + 1):
             portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
             portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
@@ -118,6 +181,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -162,6 +226,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -206,6 +271,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
         own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
         for i in range(1, num_players + 1):
             portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
             portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
@@ -263,6 +329,9 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
     portfolio['Lineup Edge'] = portfolio['Lineup Edge'] - portfolio['Lineup Edge'].mean()
     portfolio['Weighted Own'] = portfolio[own_columns].apply(calculate_weighted_ownership, axis=1)
     portfolio['Geomean'] = np.power((portfolio[own_columns] * 100).product(axis=1), 1 / len(own_columns))
+
+    portfolio['Similarity Score'] = calculate_player_similarity_score(portfolio, player_columns)
+
     portfolio = portfolio.drop(columns=dup_count_columns)
     portfolio = portfolio.drop(columns=own_columns)
     portfolio = portfolio.drop(columns=calc_columns)
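
As a quick sanity check of the new column, here is a hypothetical standalone call against a toy portfolio (the column names and single-letter players are invented; inside predict_dupes itself, player_columns is derived per site/type branch as shown above, and this assumes the repository root is on the import path):

import pandas as pd
from global_func.predict_dupes import calculate_player_similarity_score

# Toy portfolio: lineups 1 and 2 differ by one player; lineup 3 shares none
portfolio = pd.DataFrame({
    'CPT':   ['A', 'A', 'V'],
    'FLEX1': ['B', 'B', 'W'],
    'FLEX2': ['C', 'C', 'X'],
    'FLEX3': ['D', 'D', 'Y'],
    'FLEX4': ['E', 'F', 'Z'],
})
player_columns = ['CPT', 'FLEX1', 'FLEX2', 'FLEX3', 'FLEX4']

scores = calculate_player_similarity_score(portfolio, player_columns)
print(scores)  # after min-max normalization: [0. 0. 1.]; lineup 3 is the most unique

Note that because the final scores are min-max normalized, the least unique lineup in any portfolio is always pinned to 0 and the most unique to 1, so the values are comparative within a portfolio rather than absolute.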