James McCool committed on
Commit
1fe4ec0
·
1 Parent(s): 0b88791

Refactor player similarity score calculation in predict_dupes function

Browse files

This update enhances the calculate_player_similarity_score function by converting player selections to numeric IDs for faster comparison. The similarity score calculation now uses Jaccard distance instead of SequenceMatcher, improving efficiency and accuracy in measuring lineup differences. Additionally, the logic for handling player data has been streamlined for better performance.

Files changed (1) hide show
  1. global_func/predict_dupes.py +37 -13
global_func/predict_dupes.py CHANGED
@@ -41,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
41
  def calculate_player_similarity_score(portfolio, player_columns):
42
  """
43
  Calculate a similarity score that measures how different each row is from all other rows
44
- based on actual player selection. Converts each row to a string and uses SequenceMatcher.
45
  Higher scores indicate more unique/different lineups.
46
 
47
  Args:
@@ -51,28 +51,52 @@ def calculate_player_similarity_score(portfolio, player_columns):
51
  Returns:
52
  Series: Similarity scores for each row
53
  """
54
- # Extract player data and convert each row to a string
55
  player_data = portfolio[player_columns].fillna('')
56
 
57
- # Convert each row to a string representation
58
- row_strings = []
 
 
 
 
 
 
 
 
 
 
 
59
  for _, row in player_data.iterrows():
60
- # Sort the players to ensure consistent ordering
61
- players = sorted([str(val) for val in row.values if str(val).strip() != ''])
62
- row_string = '|'.join(players) # Use pipe as separator
63
- row_strings.append(row_string)
64
 
65
- # Calculate similarity scores using SequenceMatcher
66
  similarity_scores = []
67
 
68
  for i in range(len(portfolio)):
69
  distances = []
70
  for j in range(len(portfolio)):
71
  if i != j:
72
- # Use SequenceMatcher to compare the two row strings
73
- similarity_ratio = SequenceMatcher(None, row_strings[i], row_strings[j]).ratio()
74
- # Convert similarity to distance (1 - similarity)
75
- distance = 1 - similarity_ratio
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  distances.append(distance)
77
 
78
  # Average distance to all other lineups
 
41
  def calculate_player_similarity_score(portfolio, player_columns):
42
  """
43
  Calculate a similarity score that measures how different each row is from all other rows
44
+ based on actual player selection. Converts players to numeric IDs for faster comparison.
45
  Higher scores indicate more unique/different lineups.
46
 
47
  Args:
 
51
  Returns:
52
  Series: Similarity scores for each row
53
  """
54
+ # Extract player data
55
  player_data = portfolio[player_columns].fillna('')
56
 
57
+ # Get all unique players and create a mapping to numeric IDs
58
+ all_players = set()
59
+ for col in player_columns:
60
+ unique_vals = player_data[col].unique()
61
+ for val in unique_vals:
62
+ if isinstance(val, str) and val.strip() != '':
63
+ all_players.add(val)
64
+
65
+ # Create player ID mapping
66
+ player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
67
+
68
+ # Convert each row to a list of player IDs
69
+ row_ids = []
70
  for _, row in player_data.iterrows():
71
+ # Get player IDs for this row, sorted for consistency
72
+ player_ids = sorted([player_to_id[str(val)] for val in row.values
73
+ if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id])
74
+ row_ids.append(player_ids)
75
 
76
+ # Calculate similarity scores using Jaccard distance on player ID sets
77
  similarity_scores = []
78
 
79
  for i in range(len(portfolio)):
80
  distances = []
81
  for j in range(len(portfolio)):
82
  if i != j:
83
+ # Convert to sets for Jaccard calculation
84
+ set_i = set(row_ids[i])
85
+ set_j = set(row_ids[j])
86
+
87
+ # Calculate Jaccard distance
88
+ if len(set_i) == 0 and len(set_j) == 0:
89
+ # Both lineups are empty
90
+ distance = 0.0
91
+ elif len(set_i) == 0 or len(set_j) == 0:
92
+ # One lineup is empty, other is not
93
+ distance = 1.0
94
+ else:
95
+ # Jaccard distance = 1 - (intersection / union)
96
+ intersection = len(set_i & set_j)
97
+ union = len(set_i | set_j)
98
+ distance = 1 - (intersection / union)
99
+
100
  distances.append(distance)
101
 
102
  # Average distance to all other lineups