James McCool committed on
Commit
91e473e
·
1 Parent(s): 5de7ed9

Refactor name matching logic and update dependencies

Browse files

- Replaced the fuzzywuzzy library with rapidfuzz for improved performance in name matching operations.
- Removed the deprecated find_name_mismatches function and introduced a new get_contest_names function to streamline the retrieval of unique player names from contest data.
- Enhanced the load_contest_file function to utilize the new name matching logic, ensuring consistent player name handling across the application.
- Maintained existing functionality while improving code clarity and efficiency.

app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  st.set_page_config(layout="wide")
3
  import numpy as np
4
  import pandas as pd
5
- from fuzzywuzzy import process
6
  from collections import Counter
7
  from pymongo.mongo_client import MongoClient
8
  from pymongo.server_api import ServerApi
 
2
  st.set_page_config(layout="wide")
3
  import numpy as np
4
  import pandas as pd
5
+ from rapidfuzz import process, fuzz
6
  from collections import Counter
7
  from pymongo.mongo_client import MongoClient
8
  from pymongo.server_api import ServerApi
global_func/find_name_mismatches.py DELETED
@@ -1,99 +0,0 @@
1
- import streamlit as st
2
- from fuzzywuzzy import process
3
-
4
- def find_name_mismatches(contest_df, projections_df, ownership_df, fpts_df):
5
-
6
- name_columns = [col for col in contest_df.columns if not col in ['BaseName', 'EntryCount']]
7
-
8
- if 'player_names' not in projections_df.columns:
9
- st.error("No 'player_names' column found in projections file")
10
- return contest_df, projections_df
11
-
12
- # Get unique player names from portfolio and projections
13
- portfolio_players = set()
14
- for col in name_columns:
15
- portfolio_players.update(contest_df[col].unique())
16
- projection_players = set(projections_df['player_names'].unique())
17
- portfolio_players_list = list(portfolio_players)
18
- projection_players_list = list(projection_players)
19
-
20
- # Find players in portfolio that are missing from projections
21
- players_missing_from_projections = list(projection_players - portfolio_players)
22
-
23
- # Automatically handle 90%+ matches before starting interactive process
24
- auto_matches = {}
25
- players_to_process = []
26
- for player in players_missing_from_projections:
27
- if not isinstance(player, str):
28
- st.warning(f"Skipping non-string value: {player}")
29
- continue
30
- closest_matches = process.extract(player, portfolio_players_list, limit=1)
31
- if closest_matches[0][1] >= 95: # If high confidence match found
32
- match_name = closest_matches[0][0]
33
- auto_matches[player] = match_name
34
- st.success(f"Automatically matched '{player}' with '{match_name}' ({closest_matches[0][1]}% match)")
35
- elif closest_matches[0][1] >= 75:
36
- players_to_process.append(player)
37
- else:
38
- st.warning(f"No match found for '{player}'")
39
-
40
- if players_to_process:
41
- st.warning(f"Found {len(players_to_process)} players that need manual matching")
42
-
43
- # Create a form for batch processing
44
- with st.form("name_matching_form"):
45
- # Create tabs for each player
46
- tabs = st.tabs([f"Player {i+1}" for i in range(len(players_to_process))])
47
-
48
- # Dictionary to store selections
49
- selections = {}
50
-
51
- # Populate each tab
52
- for i, player in enumerate(players_to_process):
53
- with tabs[i]:
54
- st.write(f"**Projection Name:** {player}")
55
-
56
- # Find the top 3 closest matches
57
- closest_matches = process.extract(player, portfolio_players_list, limit=3)
58
-
59
- # Create radio buttons for selection
60
- options = [f"{match[0]} ({match[1]}%)" for match in closest_matches]
61
- options.append("None of these")
62
-
63
- selections[player] = st.radio(
64
- f"Select correct match:",
65
- options,
66
- key=f"radio_{player}"
67
- )
68
-
69
- # Submit button for the entire form
70
- submitted = st.form_submit_button("Apply All Changes")
71
-
72
- if submitted:
73
- # Process automatic matches
74
- for projection_name, contest_name in auto_matches.items():
75
- for col in name_columns:
76
- contest_df[col] = contest_df[col].replace(contest_name, projection_name)
77
- ownership_df['Player'] = ownership_df['Player'].replace(contest_name, projection_name)
78
- fpts_df['Player'] = fpts_df['Player'].replace(contest_name, projection_name)
79
-
80
- # Process manual selections
81
- for projection_name, selection in selections.items():
82
- if selection != "None of these":
83
- selected_name = selection.split(" (")[0]
84
- for col in name_columns:
85
- contest_df[col] = contest_df[col].replace(selected_name, projection_name)
86
- ownership_df['Player'] = ownership_df['Player'].replace(selected_name, projection_name)
87
- fpts_df['Player'] = fpts_df['Player'].replace(selected_name, projection_name)
88
- st.success(f"Replaced '{selected_name}' with '{projection_name}'")
89
- st.success("All changes applied successfully!")
90
- return contest_df, projections_df, ownership_df, fpts_df
91
- else:
92
- st.success("All players have been automatically matched!")
93
- # Apply automatic matches
94
- for projection_name, contest_name in auto_matches.items():
95
- for col in name_columns:
96
- contest_df[col] = contest_df[col].replace(contest_name, projection_name)
97
- ownership_df['Player'] = ownership_df['Player'].replace(contest_name, projection_name)
98
- fpts_df['Player'] = fpts_df['Player'].replace(contest_name, projection_name)
99
- return contest_df, projections_df, ownership_df, fpts_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
global_func/get_contest_names.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import time
5
+ from rapidfuzz import process, fuzz
6
+
7
def get_contest_names(contest_frame, exclude_columns=('BaseName', 'EntryCount')):
    """
    Get all unique names from the contest dataframe's player columns.

    Every column whose label is not listed in ``exclude_columns`` is treated
    as a player column.

    Args:
        contest_frame: DataFrame containing contest data. ``None`` or a frame
            without any player columns yields an empty list.
        exclude_columns: Iterable of column labels that do not hold player
            names and are skipped. Defaults to 'BaseName' and 'EntryCount'.

    Returns:
        list: Unique, non-null values found in the player columns, in
        first-seen order when the columns are scanned row by row.
    """
    # Nothing to scan: return an empty result instead of failing on `.columns`.
    if contest_frame is None:
        return []

    excluded = set(exclude_columns)
    player_columns = [col for col in contest_frame.columns if col not in excluded]
    if not player_columns:
        return []

    # Flatten row-major so the result order follows the lineups top to bottom.
    flat_values = contest_frame[player_columns].to_numpy().ravel()

    # pd.unique keeps first-seen order; missing slots (NaN/None) are dropped after.
    unique_names = pd.unique(flat_values)
    unique_names = unique_names[~pd.isna(unique_names)]

    return list(unique_names)
global_func/load_contest_file.py CHANGED
@@ -1,5 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
 
 
3
 
4
  def load_contest_file(upload, helper = None, sport = None):
5
  if sport == 'MLB':
@@ -52,6 +54,23 @@ def load_contest_file(upload, helper = None, sport = None):
52
  df_helper = helper_df[['Player', 'Salary', 'Team']]
53
 
54
  print('Made it through helper')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  # Create separate dataframes for different player attributes
57
  if helper is not None:
@@ -95,7 +114,7 @@ def load_contest_file(upload, helper = None, sport = None):
95
  cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
96
  elif sport == 'GOLF':
97
  cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
98
-
99
  print('Made it through check_lineups')
100
 
101
  # Get unique entry names
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from get_contest_names import get_contest_names
4
+ from rapidfuzz import process, fuzz
5
 
6
  def load_contest_file(upload, helper = None, sport = None):
7
  if sport == 'MLB':
 
54
  df_helper = helper_df[['Player', 'Salary', 'Team']]
55
 
56
  print('Made it through helper')
57
+
58
+ contest_names = df.Player.unique()
59
+ helper_names = helper_df.Player.unique()
60
+
61
+ contest_match_dict = {}
62
+ for names in helper_names:
63
+ match = process.extractOne(
64
+ names,
65
+ contest_names,
66
+ score_cutoff = 85
67
+ )
68
+ if match:
69
+ contest_match_dict[names] = match[0]
70
+ else:
71
+ contest_match_dict[names] = names
72
+
73
+ df_helper['Player'] = df_helper['Player'].map(contest_match_dict)
74
 
75
  # Create separate dataframes for different player attributes
76
  if helper is not None:
 
114
  cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
115
  elif sport == 'GOLF':
116
  cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
117
+
118
  print('Made it through check_lineups')
119
 
120
  # Get unique entry names