James McCool
committed on
Commit
·
039bb05
1
Parent(s):
1fe4ec0
Optimize player similarity score calculation in predict_dupes function
Browse files
This update enhances the calculate_player_similarity_score function by implementing vectorized operations for calculating Jaccard distances, significantly improving performance. Converting player selections to binary vectors allows efficient pairwise distance computation, resulting in faster similarity score calculations for lineups.
- global_func/predict_dupes.py +37 -37
global_func/predict_dupes.py
CHANGED
@@ -41,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
|
|
41 |
def calculate_player_similarity_score(portfolio, player_columns):
|
42 |
"""
|
43 |
Calculate a similarity score that measures how different each row is from all other rows
|
44 |
-
based on actual player selection.
|
45 |
Higher scores indicate more unique/different lineups.
|
46 |
|
47 |
Args:
|
@@ -65,46 +65,46 @@ def calculate_player_similarity_score(portfolio, player_columns):
|
|
65 |
# Create player ID mapping
|
66 |
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
67 |
|
68 |
-
# Convert each row to a
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
player_ids = sorted([player_to_id[str(val)] for val in row.values
|
73 |
-
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id])
|
74 |
-
row_ids.append(player_ids)
|
75 |
|
76 |
-
|
77 |
-
|
|
|
|
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
105 |
|
106 |
# Normalize to 0-1 scale where 1 = most unique/different
|
107 |
-
similarity_scores = np.array(similarity_scores)
|
108 |
if similarity_scores.max() > similarity_scores.min():
|
109 |
similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
|
110 |
|
|
|
41 |
def calculate_player_similarity_score(portfolio, player_columns):
|
42 |
"""
|
43 |
Calculate a similarity score that measures how different each row is from all other rows
|
44 |
+
based on actual player selection. Optimized for speed using vectorized operations.
|
45 |
Higher scores indicate more unique/different lineups.
|
46 |
|
47 |
Args:
|
|
|
65 |
# Create player ID mapping
|
66 |
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
67 |
|
68 |
+
# Convert each row to a binary vector (1 if player is present, 0 if not)
|
69 |
+
n_players = len(all_players)
|
70 |
+
n_rows = len(portfolio)
|
71 |
+
binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
|
|
|
|
|
|
|
72 |
|
73 |
+
for i, (_, row) in enumerate(player_data.iterrows()):
|
74 |
+
for val in row.values:
|
75 |
+
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
|
76 |
+
binary_matrix[i, player_to_id[str(val)]] = 1
|
77 |
|
78 |
+
# Vectorized Jaccard distance calculation
|
79 |
+
# Use matrix operations to compute all pairwise distances at once
|
80 |
+
similarity_scores = np.zeros(n_rows)
|
81 |
+
|
82 |
+
# Compute intersection and union matrices
|
83 |
+
# intersection[i,j] = number of players in common between row i and row j
|
84 |
+
# union[i,j] = total number of unique players between row i and row j
|
85 |
+
intersection_matrix = np.dot(binary_matrix, binary_matrix.T)
|
86 |
+
|
87 |
+
# For union, we need: |A ∪ B| = |A| + |B| - |A ∩ B|
|
88 |
+
row_sums = np.sum(binary_matrix, axis=1)
|
89 |
+
union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
|
90 |
+
|
91 |
+
# Calculate Jaccard distance: 1 - (intersection / union)
|
92 |
+
# Avoid division by zero
|
93 |
+
with np.errstate(divide='ignore', invalid='ignore'):
|
94 |
+
jaccard_similarity = np.divide(intersection_matrix, union_matrix,
|
95 |
+
out=np.zeros_like(intersection_matrix, dtype=float),
|
96 |
+
where=union_matrix != 0)
|
97 |
+
|
98 |
+
# Convert similarity to distance and calculate average distance for each row
|
99 |
+
jaccard_distance = 1 - jaccard_similarity
|
100 |
+
|
101 |
+
# For each row, calculate average distance to all other rows
|
102 |
+
# Exclude self-comparison (diagonal elements)
|
103 |
+
np.fill_diagonal(jaccard_distance, 0)
|
104 |
+
row_counts = n_rows - 1 # Exclude self
|
105 |
+
similarity_scores = np.sum(jaccard_distance, axis=1) / row_counts
|
106 |
|
107 |
# Normalize to 0-1 scale where 1 = most unique/different
|
|
|
108 |
if similarity_scores.max() > similarity_scores.min():
|
109 |
similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
|
110 |
|