James McCool
committed on
Commit
·
4390bf0
1
Parent(s):
c283108
Refactor player similarity score calculation in predict_dupes function
Browse files
This update enhances the calculate_player_similarity_score function by replacing the previous Jaccard distance method with difflib's SequenceMatcher for improved accuracy in measuring lineup similarity. Each lineup is now represented as a single string (its non-empty player names, sorted and joined with "|"), allowing for a more efficient comparison process. This change simplifies the logic and ensures consistent handling of player selections.
- global_func/predict_dupes.py +16 -38
global_func/predict_dupes.py
CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
|
|
4 |
import time
|
5 |
from fuzzywuzzy import process
|
6 |
import math
|
|
|
7 |
|
8 |
def calculate_weighted_ownership(row_ownerships):
|
9 |
"""
|
@@ -40,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
|
|
40 |
def calculate_player_similarity_score(portfolio, player_columns):
|
41 |
"""
|
42 |
Calculate a similarity score that measures how different each row is from all other rows
|
43 |
-
based on actual player selection
|
44 |
Higher scores indicate more unique/different lineups.
|
45 |
|
46 |
Args:
|
@@ -50,52 +51,29 @@ def calculate_player_similarity_score(portfolio, player_columns):
|
|
50 |
Returns:
|
51 |
Series: Similarity scores for each row
|
52 |
"""
|
53 |
-
# Extract player data and
|
54 |
player_data = portfolio[player_columns].fillna('')
|
55 |
|
56 |
-
#
|
57 |
-
|
58 |
-
for
|
59 |
-
#
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
all_players.add(val)
|
64 |
|
65 |
-
#
|
66 |
-
all_players = sorted(list(all_players))
|
67 |
-
|
68 |
-
# If no valid players found, return zeros
|
69 |
-
if len(all_players) == 0:
|
70 |
-
return np.zeros(len(portfolio))
|
71 |
-
|
72 |
-
# Create a binary matrix: 1 if player is in lineup, 0 if not
|
73 |
-
binary_matrix = np.zeros((len(portfolio), len(all_players)))
|
74 |
-
|
75 |
-
for i, row in player_data.iterrows():
|
76 |
-
for j, player in enumerate(all_players):
|
77 |
-
if player in row.values:
|
78 |
-
binary_matrix[i, j] = 1
|
79 |
-
|
80 |
-
# Calculate Jaccard distance between all pairs of lineups
|
81 |
-
# Jaccard distance = 1 - (intersection / union)
|
82 |
similarity_scores = []
|
83 |
|
84 |
for i in range(len(portfolio)):
|
85 |
distances = []
|
86 |
for j in range(len(portfolio)):
|
87 |
if i != j:
|
88 |
-
#
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
if union == 0:
|
94 |
-
jaccard_distance = 1.0 # Completely different if both are empty
|
95 |
-
else:
|
96 |
-
jaccard_distance = 1 - (intersection / union)
|
97 |
-
|
98 |
-
distances.append(jaccard_distance)
|
99 |
|
100 |
# Average distance to all other lineups
|
101 |
avg_distance = np.mean(distances) if distances else 0
|
|
|
4 |
import time
|
5 |
from fuzzywuzzy import process
|
6 |
import math
|
7 |
+
from difflib import SequenceMatcher
|
8 |
|
9 |
def calculate_weighted_ownership(row_ownerships):
|
10 |
"""
|
|
|
41 |
def calculate_player_similarity_score(portfolio, player_columns):
|
42 |
"""
|
43 |
Calculate a similarity score that measures how different each row is from all other rows
|
44 |
+
based on actual player selection. Converts each row to a string and uses SequenceMatcher.
|
45 |
Higher scores indicate more unique/different lineups.
|
46 |
|
47 |
Args:
|
|
|
51 |
Returns:
|
52 |
Series: Similarity scores for each row
|
53 |
"""
|
54 |
+
# Extract player data and convert each row to a string
|
55 |
player_data = portfolio[player_columns].fillna('')
|
56 |
|
57 |
+
# Convert each row to a string representation
|
58 |
+
row_strings = []
|
59 |
+
for _, row in player_data.iterrows():
|
60 |
+
# Sort the players to ensure consistent ordering
|
61 |
+
players = sorted([str(val) for val in row.values if str(val).strip() != ''])
|
62 |
+
row_string = '|'.join(players) # Use pipe as separator
|
63 |
+
row_strings.append(row_string)
|
|
|
64 |
|
65 |
+
# Calculate similarity scores using SequenceMatcher
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
similarity_scores = []
|
67 |
|
68 |
for i in range(len(portfolio)):
|
69 |
distances = []
|
70 |
for j in range(len(portfolio)):
|
71 |
if i != j:
|
72 |
+
# Use SequenceMatcher to compare the two row strings
|
73 |
+
similarity_ratio = SequenceMatcher(None, row_strings[i], row_strings[j]).ratio()
|
74 |
+
# Convert similarity to distance (1 - similarity)
|
75 |
+
distance = 1 - similarity_ratio
|
76 |
+
distances.append(distance)
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
# Average distance to all other lineups
|
79 |
avg_distance = np.mean(distances) if distances else 0
|