James McCool
committed on
Commit
·
1fe4ec0
1
Parent(s):
0b88791
Refactor player similarity score calculation in predict_dupes function
Browse files
This update enhances the calculate_player_similarity_score function by converting player selections to numeric IDs for faster comparison. The similarity score calculation now uses Jaccard distance instead of SequenceMatcher, improving efficiency and accuracy in measuring lineup differences. Additionally, the logic for handling player data has been streamlined for better performance.
- global_func/predict_dupes.py +37 -13
global_func/predict_dupes.py
CHANGED
@@ -41,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
|
|
41 |
def calculate_player_similarity_score(portfolio, player_columns):
|
42 |
"""
|
43 |
Calculate a similarity score that measures how different each row is from all other rows
|
44 |
-
based on actual player selection. Converts
|
45 |
Higher scores indicate more unique/different lineups.
|
46 |
|
47 |
Args:
|
@@ -51,28 +51,52 @@ def calculate_player_similarity_score(portfolio, player_columns):
|
|
51 |
Returns:
|
52 |
Series: Similarity scores for each row
|
53 |
"""
|
54 |
-
# Extract player data
|
55 |
player_data = portfolio[player_columns].fillna('')
|
56 |
|
57 |
-
#
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
for _, row in player_data.iterrows():
|
60 |
-
#
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
|
65 |
-
# Calculate similarity scores using
|
66 |
similarity_scores = []
|
67 |
|
68 |
for i in range(len(portfolio)):
|
69 |
distances = []
|
70 |
for j in range(len(portfolio)):
|
71 |
if i != j:
|
72 |
-
#
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
distances.append(distance)
|
77 |
|
78 |
# Average distance to all other lineups
|
|
|
41 |
def calculate_player_similarity_score(portfolio, player_columns):
|
42 |
"""
|
43 |
Calculate a similarity score that measures how different each row is from all other rows
|
44 |
+
based on actual player selection. Converts players to numeric IDs for faster comparison.
|
45 |
Higher scores indicate more unique/different lineups.
|
46 |
|
47 |
Args:
|
|
|
51 |
Returns:
|
52 |
Series: Similarity scores for each row
|
53 |
"""
|
54 |
+
# Extract player data
|
55 |
player_data = portfolio[player_columns].fillna('')
|
56 |
|
57 |
+
# Get all unique players and create a mapping to numeric IDs
|
58 |
+
all_players = set()
|
59 |
+
for col in player_columns:
|
60 |
+
unique_vals = player_data[col].unique()
|
61 |
+
for val in unique_vals:
|
62 |
+
if isinstance(val, str) and val.strip() != '':
|
63 |
+
all_players.add(val)
|
64 |
+
|
65 |
+
# Create player ID mapping
|
66 |
+
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
67 |
+
|
68 |
+
# Convert each row to a list of player IDs
|
69 |
+
row_ids = []
|
70 |
for _, row in player_data.iterrows():
|
71 |
+
# Get player IDs for this row, sorted for consistency
|
72 |
+
player_ids = sorted([player_to_id[str(val)] for val in row.values
|
73 |
+
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id])
|
74 |
+
row_ids.append(player_ids)
|
75 |
|
76 |
+
# Calculate similarity scores using Jaccard distance on player ID sets
|
77 |
similarity_scores = []
|
78 |
|
79 |
for i in range(len(portfolio)):
|
80 |
distances = []
|
81 |
for j in range(len(portfolio)):
|
82 |
if i != j:
|
83 |
+
# Convert to sets for Jaccard calculation
|
84 |
+
set_i = set(row_ids[i])
|
85 |
+
set_j = set(row_ids[j])
|
86 |
+
|
87 |
+
# Calculate Jaccard distance
|
88 |
+
if len(set_i) == 0 and len(set_j) == 0:
|
89 |
+
# Both lineups are empty
|
90 |
+
distance = 0.0
|
91 |
+
elif len(set_i) == 0 or len(set_j) == 0:
|
92 |
+
# One lineup is empty, other is not
|
93 |
+
distance = 1.0
|
94 |
+
else:
|
95 |
+
# Jaccard distance = 1 - (intersection / union)
|
96 |
+
intersection = len(set_i & set_j)
|
97 |
+
union = len(set_i | set_j)
|
98 |
+
distance = 1 - (intersection / union)
|
99 |
+
|
100 |
distances.append(distance)
|
101 |
|
102 |
# Average distance to all other lineups
|