James McCool
committed on
Commit
·
91e473e
1
Parent(s):
5de7ed9
Refactor name matching logic and update dependencies
Browse files
- Replaced the fuzzywuzzy library with rapidfuzz for improved performance in name matching operations.
- Removed the deprecated find_name_mismatches function and introduced a new get_contest_names function to streamline the retrieval of unique player names from contest data.
- Enhanced the load_contest_file function to utilize the new name matching logic, ensuring consistent player name handling across the application.
- Maintained existing functionality while improving code clarity and efficiency.
- app.py +1 -1
- global_func/find_name_mismatches.py +0 -99
- global_func/get_contest_names.py +26 -0
- global_func/load_contest_file.py +20 -1
app.py
CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
|
|
2 |
st.set_page_config(layout="wide")
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
-
from
|
6 |
from collections import Counter
|
7 |
from pymongo.mongo_client import MongoClient
|
8 |
from pymongo.server_api import ServerApi
|
|
|
2 |
st.set_page_config(layout="wide")
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
+
from rapidfuzz import process, fuzz
|
6 |
from collections import Counter
|
7 |
from pymongo.mongo_client import MongoClient
|
8 |
from pymongo.server_api import ServerApi
|
global_func/find_name_mismatches.py
DELETED
@@ -1,99 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from fuzzywuzzy import process
|
3 |
-
|
4 |
-
def _replace_player_name(contest_df, ownership_df, fpts_df, name_columns, old_name, new_name):
    """Rename ``old_name`` to ``new_name`` in the lineup columns and both 'Player' columns."""
    for col in name_columns:
        contest_df[col] = contest_df[col].replace(old_name, new_name)
    ownership_df['Player'] = ownership_df['Player'].replace(old_name, new_name)
    fpts_df['Player'] = fpts_df['Player'].replace(old_name, new_name)


def find_name_mismatches(contest_df, projections_df, ownership_df, fpts_df):
    """
    Reconcile contest player names with the names used in the projections file.

    Every projection name that does not appear verbatim in the contest lineup
    columns is fuzzy-matched against the contest names:

    * score >= 95: the contest name is replaced automatically;
    * 75 <= score < 95: the user picks the correct name in a Streamlit form;
    * score < 75 (or no candidate at all): a warning is shown and nothing changes.

    Replacements rewrite the contest name to the projection name in every
    lineup column of ``contest_df`` and in the 'Player' column of
    ``ownership_df`` and ``fpts_df``.

    Args:
        contest_df: Contest lineups; every column except 'BaseName' and
            'EntryCount' is treated as a player-name column.
        projections_df: Projections; must contain a 'player_names' column.
        ownership_df: Frame with a 'Player' column to keep in sync.
        fpts_df: Frame with a 'Player' column to keep in sync.

    Returns:
        tuple: ``(contest_df, projections_df, ownership_df, fpts_df)``.
        ``None`` is returned while the manual-matching form is displayed but
        has not been submitted yet (no replacement is applied in that case).
    """
    name_columns = [col for col in contest_df.columns if col not in ['BaseName', 'EntryCount']]

    if 'player_names' not in projections_df.columns:
        st.error("No 'player_names' column found in projections file")
        # Return the same 4-tuple shape as every other exit so callers can
        # unpack the result uniformly (this path used to return only two frames).
        return contest_df, projections_df, ownership_df, fpts_df

    # Collect the unique names on each side
    portfolio_players = set()
    for col in name_columns:
        portfolio_players.update(contest_df[col].unique())
    projection_players = set(projections_df['player_names'].unique())

    # Only string values are usable as fuzzy-match candidates; NaN and other
    # non-string cells are left out of the candidate list.
    portfolio_players_list = [name for name in portfolio_players if isinstance(name, str)]

    # Projection names that have no exact counterpart among the contest names
    players_missing_from_projections = list(projection_players - portfolio_players)

    # Resolve high-confidence (95+) matches up front; queue 75-94 for manual review
    auto_matches = {}
    players_to_process = []
    for player in players_missing_from_projections:
        if not isinstance(player, str):
            st.warning(f"Skipping non-string value: {player}")
            continue
        closest_matches = process.extract(player, portfolio_players_list, limit=1)
        if not closest_matches:
            # No candidates at all (e.g. empty contest) -- nothing to index into
            st.warning(f"No match found for '{player}'")
            continue
        if closest_matches[0][1] >= 95:  # If high confidence match found
            match_name = closest_matches[0][0]
            auto_matches[player] = match_name
            st.success(f"Automatically matched '{player}' with '{match_name}' ({closest_matches[0][1]}% match)")
        elif closest_matches[0][1] >= 75:
            players_to_process.append(player)
        else:
            st.warning(f"No match found for '{player}'")

    if players_to_process:
        st.warning(f"Found {len(players_to_process)} players that need manual matching")

        # Create a form for batch processing
        with st.form("name_matching_form"):
            # One tab per player that needs a manual decision
            tabs = st.tabs([f"Player {i+1}" for i in range(len(players_to_process))])

            # Projection name -> option label chosen by the user
            selections = {}

            for i, player in enumerate(players_to_process):
                with tabs[i]:
                    st.write(f"**Projection Name:** {player}")

                    # Offer the top 3 closest contest names
                    closest_matches = process.extract(player, portfolio_players_list, limit=3)
                    options = [f"{match[0]} ({match[1]}%)" for match in closest_matches]
                    options.append("None of these")

                    selections[player] = st.radio(
                        "Select correct match:",
                        options,
                        key=f"radio_{player}"
                    )

            # Submit button for the entire form
            submitted = st.form_submit_button("Apply All Changes")

            if submitted:
                # Process automatic matches
                for projection_name, contest_name in auto_matches.items():
                    _replace_player_name(contest_df, ownership_df, fpts_df,
                                         name_columns, contest_name, projection_name)

                # Process manual selections
                for projection_name, selection in selections.items():
                    if selection != "None of these":
                        # Strip only the trailing " (score%)" suffix so names
                        # that themselves contain " (" are kept intact.
                        selected_name = selection.rsplit(" (", 1)[0]
                        _replace_player_name(contest_df, ownership_df, fpts_df,
                                             name_columns, selected_name, projection_name)
                        st.success(f"Replaced '{selected_name}' with '{projection_name}'")
                st.success("All changes applied successfully!")
                return contest_df, projections_df, ownership_df, fpts_df

        # Form shown but not submitted yet: nothing has been applied.
        return None
    else:
        st.success("All players have been automatically matched!")
        # Apply automatic matches
        for projection_name, contest_name in auto_matches.items():
            _replace_player_name(contest_df, ownership_df, fpts_df,
                                 name_columns, contest_name, projection_name)
        return contest_df, projections_df, ownership_df, fpts_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
global_func/get_contest_names.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import time
|
5 |
+
from rapidfuzz import process, fuzz
|
6 |
+
|
7 |
+
def get_contest_names(contest_frame, non_player_columns=('BaseName', 'EntryCount')):
    """
    Get all unique names from the contest dataframe's player columns.

    Args:
        contest_frame: DataFrame containing contest data
        non_player_columns: Column labels that do not hold player names and
            are therefore skipped. Defaults to ('BaseName', 'EntryCount').

    Returns:
        list: Unique, non-null player names in order of first appearance
        (scanning row by row, left to right).
    """
    # Every column not explicitly excluded is treated as a player column
    player_columns = [col for col in contest_frame.columns
                      if col not in non_player_columns]

    # Row-major flatten so first-appearance order follows rows, then columns
    names = contest_frame[player_columns].to_numpy().ravel()

    # Drop missing cells, then de-duplicate (pd.unique keeps encounter order)
    names = names[~pd.isna(names)]

    return list(pd.unique(names))
|
global_func/load_contest_file.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
|
|
3 |
|
4 |
def load_contest_file(upload, helper = None, sport = None):
|
5 |
if sport == 'MLB':
|
@@ -52,6 +54,23 @@ def load_contest_file(upload, helper = None, sport = None):
|
|
52 |
df_helper = helper_df[['Player', 'Salary', 'Team']]
|
53 |
|
54 |
print('Made it through helper')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
# Create separate dataframes for different player attributes
|
57 |
if helper is not None:
|
@@ -95,7 +114,7 @@ def load_contest_file(upload, helper = None, sport = None):
|
|
95 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
96 |
elif sport == 'GOLF':
|
97 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
98 |
-
|
99 |
print('Made it through check_lineups')
|
100 |
|
101 |
# Get unique entry names
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
from get_contest_names import get_contest_names
|
4 |
+
from rapidfuzz import process, fuzz
|
5 |
|
6 |
def load_contest_file(upload, helper = None, sport = None):
|
7 |
if sport == 'MLB':
|
|
|
54 |
df_helper = helper_df[['Player', 'Salary', 'Team']]
|
55 |
|
56 |
print('Made it through helper')
|
57 |
+
|
58 |
+
contest_names = df.Player.unique()
|
59 |
+
helper_names = helper_df.Player.unique()
|
60 |
+
|
61 |
+
contest_match_dict = {}
|
62 |
+
for names in helper_names:
|
63 |
+
match = process.extractOne(
|
64 |
+
names,
|
65 |
+
contest_names,
|
66 |
+
score_cutoff = 85
|
67 |
+
)
|
68 |
+
if match:
|
69 |
+
contest_match_dict[names] = match[0]
|
70 |
+
else:
|
71 |
+
contest_match_dict[names] = names
|
72 |
+
|
73 |
+
df_helper['Player'] = df_helper['Player'].map(contest_match_dict)
|
74 |
|
75 |
# Create separate dataframes for different player attributes
|
76 |
if helper is not None:
|
|
|
114 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
115 |
elif sport == 'GOLF':
|
116 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
117 |
+
|
118 |
print('Made it through check_lineups')
|
119 |
|
120 |
# Get unique entry names
|