James McCool committed on
Commit
4390bf0
·
1 Parent(s): c283108

Refactor player similarity score calculation in predict_dupes function

Browse files

This update enhances the calculate_player_similarity_score function by replacing the previous Jaccard distance method with SequenceMatcher for improved accuracy in measuring lineup similarity. Each lineup is now represented as a string, allowing for a more efficient comparison process. This change simplifies the logic and ensures consistent handling of player selections.

Files changed (1) hide show
  1. global_func/predict_dupes.py +16 -38
global_func/predict_dupes.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  import time
5
  from fuzzywuzzy import process
6
  import math
 
7
 
8
  def calculate_weighted_ownership(row_ownerships):
9
  """
@@ -40,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
40
  def calculate_player_similarity_score(portfolio, player_columns):
41
  """
42
  Calculate a similarity score that measures how different each row is from all other rows
43
- based on actual player selection (not ownership values).
44
  Higher scores indicate more unique/different lineups.
45
 
46
  Args:
@@ -50,52 +51,29 @@ def calculate_player_similarity_score(portfolio, player_columns):
50
  Returns:
51
  Series: Similarity scores for each row
52
  """
53
- # Extract player data and create a matrix where each row represents a lineup
54
  player_data = portfolio[player_columns].fillna('')
55
 
56
- # Get all unique players across all lineups
57
- all_players = set()
58
- for col in player_columns:
59
- # Only add string values (player names), skip numeric values
60
- unique_vals = player_data[col].unique()
61
- for val in unique_vals:
62
- if isinstance(val, str) and val.strip() != '':
63
- all_players.add(val)
64
 
65
- # Convert to sorted list
66
- all_players = sorted(list(all_players))
67
-
68
- # If no valid players found, return zeros
69
- if len(all_players) == 0:
70
- return np.zeros(len(portfolio))
71
-
72
- # Create a binary matrix: 1 if player is in lineup, 0 if not
73
- binary_matrix = np.zeros((len(portfolio), len(all_players)))
74
-
75
- for i, row in player_data.iterrows():
76
- for j, player in enumerate(all_players):
77
- if player in row.values:
78
- binary_matrix[i, j] = 1
79
-
80
- # Calculate Jaccard distance between all pairs of lineups
81
- # Jaccard distance = 1 - (intersection / union)
82
  similarity_scores = []
83
 
84
  for i in range(len(portfolio)):
85
  distances = []
86
  for j in range(len(portfolio)):
87
  if i != j:
88
- # Calculate intersection and union
89
- intersection = np.sum((binary_matrix[i] == 1) & (binary_matrix[j] == 1))
90
- union = np.sum((binary_matrix[i] == 1) | (binary_matrix[j] == 1))
91
-
92
- # Avoid division by zero
93
- if union == 0:
94
- jaccard_distance = 1.0 # Completely different if both are empty
95
- else:
96
- jaccard_distance = 1 - (intersection / union)
97
-
98
- distances.append(jaccard_distance)
99
 
100
  # Average distance to all other lineups
101
  avg_distance = np.mean(distances) if distances else 0
 
4
  import time
5
  from fuzzywuzzy import process
6
  import math
7
+ from difflib import SequenceMatcher
8
 
9
  def calculate_weighted_ownership(row_ownerships):
10
  """
 
41
  def calculate_player_similarity_score(portfolio, player_columns):
42
  """
43
  Calculate a similarity score that measures how different each row is from all other rows
44
+ based on actual player selection. Converts each row to a string and uses SequenceMatcher.
45
  Higher scores indicate more unique/different lineups.
46
 
47
  Args:
 
51
  Returns:
52
  Series: Similarity scores for each row
53
  """
54
+ # Extract player data and convert each row to a string
55
  player_data = portfolio[player_columns].fillna('')
56
 
57
+ # Convert each row to a string representation
58
+ row_strings = []
59
+ for _, row in player_data.iterrows():
60
+ # Sort the players to ensure consistent ordering
61
+ players = sorted([str(val) for val in row.values if str(val).strip() != ''])
62
+ row_string = '|'.join(players) # Use pipe as separator
63
+ row_strings.append(row_string)
 
64
 
65
+ # Calculate similarity scores using SequenceMatcher
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  similarity_scores = []
67
 
68
  for i in range(len(portfolio)):
69
  distances = []
70
  for j in range(len(portfolio)):
71
  if i != j:
72
+ # Use SequenceMatcher to compare the two row strings
73
+ similarity_ratio = SequenceMatcher(None, row_strings[i], row_strings[j]).ratio()
74
+ # Convert similarity to distance (1 - similarity)
75
+ distance = 1 - similarity_ratio
76
+ distances.append(distance)
 
 
 
 
 
 
77
 
78
  # Average distance to all other lineups
79
  avg_distance = np.mean(distances) if distances else 0