James McCool committed on
Commit
039bb05
·
1 Parent(s): 1fe4ec0

Optimize player similarity score calculation in predict_dupes function

Browse files

This update optimizes the calculate_player_similarity_score function by using vectorized operations to calculate Jaccard distances, significantly improving performance. Player selections are converted to binary vectors so that all pairwise distances can be computed at once with matrix operations, resulting in faster and more accurate similarity score calculations for lineups.

Files changed (1) hide show
  1. global_func/predict_dupes.py +37 -37
global_func/predict_dupes.py CHANGED
@@ -41,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
41
  def calculate_player_similarity_score(portfolio, player_columns):
42
  """
43
  Calculate a similarity score that measures how different each row is from all other rows
44
- based on actual player selection. Converts players to numeric IDs for faster comparison.
45
  Higher scores indicate more unique/different lineups.
46
 
47
  Args:
@@ -65,46 +65,46 @@ def calculate_player_similarity_score(portfolio, player_columns):
65
  # Create player ID mapping
66
  player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
67
 
68
- # Convert each row to a list of player IDs
69
- row_ids = []
70
- for _, row in player_data.iterrows():
71
- # Get player IDs for this row, sorted for consistency
72
- player_ids = sorted([player_to_id[str(val)] for val in row.values
73
- if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id])
74
- row_ids.append(player_ids)
75
 
76
- # Calculate similarity scores using Jaccard distance on player ID sets
77
- similarity_scores = []
 
 
78
 
79
- for i in range(len(portfolio)):
80
- distances = []
81
- for j in range(len(portfolio)):
82
- if i != j:
83
- # Convert to sets for Jaccard calculation
84
- set_i = set(row_ids[i])
85
- set_j = set(row_ids[j])
86
-
87
- # Calculate Jaccard distance
88
- if len(set_i) == 0 and len(set_j) == 0:
89
- # Both lineups are empty
90
- distance = 0.0
91
- elif len(set_i) == 0 or len(set_j) == 0:
92
- # One lineup is empty, other is not
93
- distance = 1.0
94
- else:
95
- # Jaccard distance = 1 - (intersection / union)
96
- intersection = len(set_i & set_j)
97
- union = len(set_i | set_j)
98
- distance = 1 - (intersection / union)
99
-
100
- distances.append(distance)
101
-
102
- # Average distance to all other lineups
103
- avg_distance = np.mean(distances) if distances else 0
104
- similarity_scores.append(avg_distance)
 
 
105
 
106
  # Normalize to 0-1 scale where 1 = most unique/different
107
- similarity_scores = np.array(similarity_scores)
108
  if similarity_scores.max() > similarity_scores.min():
109
  similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
110
 
 
41
  def calculate_player_similarity_score(portfolio, player_columns):
42
  """
43
  Calculate a similarity score that measures how different each row is from all other rows
44
+ based on actual player selection. Optimized for speed using vectorized operations.
45
  Higher scores indicate more unique/different lineups.
46
 
47
  Args:
 
65
  # Create player ID mapping
66
  player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
67
 
68
+ # Convert each row to a binary vector (1 if player is present, 0 if not)
69
+ n_players = len(all_players)
70
+ n_rows = len(portfolio)
71
+ binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
 
 
 
72
 
73
+ for i, (_, row) in enumerate(player_data.iterrows()):
74
+ for val in row.values:
75
+ if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
76
+ binary_matrix[i, player_to_id[str(val)]] = 1
77
 
78
+ # Vectorized Jaccard distance calculation
79
+ # Use matrix operations to compute all pairwise distances at once
80
+ similarity_scores = np.zeros(n_rows)
81
+
82
+ # Compute intersection and union matrices
83
+ # intersection[i,j] = number of players in common between row i and row j
84
+ # union[i,j] = total number of unique players between row i and row j
85
+ intersection_matrix = np.dot(binary_matrix, binary_matrix.T)
86
+
87
+ # For union, we need: |A ∪ B| = |A| + |B| - |A ∩ B|
88
+ row_sums = np.sum(binary_matrix, axis=1)
89
+ union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
90
+
91
+ # Calculate Jaccard distance: 1 - (intersection / union)
92
+ # Avoid division by zero
93
+ with np.errstate(divide='ignore', invalid='ignore'):
94
+ jaccard_similarity = np.divide(intersection_matrix, union_matrix,
95
+ out=np.zeros_like(intersection_matrix, dtype=float),
96
+ where=union_matrix != 0)
97
+
98
+ # Convert similarity to distance and calculate average distance for each row
99
+ jaccard_distance = 1 - jaccard_similarity
100
+
101
+ # For each row, calculate average distance to all other rows
102
+ # Exclude self-comparison (diagonal elements)
103
+ np.fill_diagonal(jaccard_distance, 0)
104
+ row_counts = n_rows - 1 # Exclude self
105
+ similarity_scores = np.sum(jaccard_distance, axis=1) / row_counts
106
 
107
  # Normalize to 0-1 scale where 1 = most unique/different
 
108
  if similarity_scores.max() > similarity_scores.min():
109
  similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
110