Spaces:

Multichem-PD
/

DFS_Portfolio_Manager

Running

James McCool committed 6 days ago

Commit

1fe4ec0

1 Parent(s): 0b88791

Refactor player similarity score calculation in predict_dupes function

This update enhances the calculate_player_similarity_score function by converting player selections to numeric IDs for faster comparison. The similarity score calculation now uses Jaccard distance instead of SequenceMatcher, improving efficiency and accuracy in measuring lineup differences. Additionally, the logic for handling player data has been streamlined for better performance.

Files changed (1) hide show

global_func/predict_dupes.py +37 -13

global_func/predict_dupes.py CHANGED Viewed

@@ -41,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
 def calculate_player_similarity_score(portfolio, player_columns):
     """
     Calculate a similarity score that measures how different each row is from all other rows
-    based on actual player selection. Converts each row to a string and uses SequenceMatcher.
     Higher scores indicate more unique/different lineups.
     Args:
@@ -51,28 +51,52 @@ def calculate_player_similarity_score(portfolio, player_columns):
     Returns:
         Series: Similarity scores for each row
     """
-    # Extract player data and convert each row to a string
     player_data = portfolio[player_columns].fillna('')
-    # Convert each row to a string representation
-    row_strings = []
     for _, row in player_data.iterrows():
-        # Sort the players to ensure consistent ordering
-        players = sorted([str(val) for val in row.values if str(val).strip() != ''])
-        row_string = '|'.join(players)  # Use pipe as separator
-        row_strings.append(row_string)
-    # Calculate similarity scores using SequenceMatcher
     similarity_scores = []
     for i in range(len(portfolio)):
         distances = []
         for j in range(len(portfolio)):
             if i != j:
-                # Use SequenceMatcher to compare the two row strings
-                similarity_ratio = SequenceMatcher(None, row_strings[i], row_strings[j]).ratio()
-                # Convert similarity to distance (1 - similarity)
-                distance = 1 - similarity_ratio
                 distances.append(distance)
         # Average distance to all other lineups

 def calculate_player_similarity_score(portfolio, player_columns):
     """
     Calculate a similarity score that measures how different each row is from all other rows
+    based on actual player selection. Converts players to numeric IDs for faster comparison.
     Higher scores indicate more unique/different lineups.
     Args:
     Returns:
         Series: Similarity scores for each row
     """
+    # Extract player data
     player_data = portfolio[player_columns].fillna('')
+    # Get all unique players and create a mapping to numeric IDs
+    all_players = set()
+    for col in player_columns:
+        unique_vals = player_data[col].unique()
+        for val in unique_vals:
+            if isinstance(val, str) and val.strip() != '':
+                all_players.add(val)
+    # Create player ID mapping
+    player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
+    # Convert each row to a list of player IDs
+    row_ids = []
     for _, row in player_data.iterrows():
+        # Get player IDs for this row, sorted for consistency
+        player_ids = sorted([player_to_id[str(val)] for val in row.values
+                           if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id])
+        row_ids.append(player_ids)
+    # Calculate similarity scores using Jaccard distance on player ID sets
     similarity_scores = []
     for i in range(len(portfolio)):
         distances = []
         for j in range(len(portfolio)):
             if i != j:
+                # Convert to sets for Jaccard calculation
+                set_i = set(row_ids[i])
+                set_j = set(row_ids[j])
+                # Calculate Jaccard distance
+                if len(set_i) == 0 and len(set_j) == 0:
+                    # Both lineups are empty
+                    distance = 0.0
+                elif len(set_i) == 0 or len(set_j) == 0:
+                    # One lineup is empty, other is not
+                    distance = 1.0
+                else:
+                    # Jaccard distance = 1 - (intersection / union)
+                    intersection = len(set_i & set_j)
+                    union = len(set_i | set_j)
+                    distance = 1 - (intersection / union)
                 distances.append(distance)
         # Average distance to all other lineups