James McCool
Refactor name matching logic in `find_name_mismatches.py` to improve data handling
2c1be4a
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import time | |
from fuzzywuzzy import process | |
def _replace_player_name(contest_df, name_columns, old_name, new_name):
    """Replace every occurrence of ``old_name`` with ``new_name`` in ``name_columns`` (in place)."""
    for col in name_columns:
        contest_df[col] = contest_df[col].replace(old_name, new_name)


def find_name_mismatches(contest_df, projections_df):
    """Reconcile player names in a contest/portfolio frame with a projections frame.

    Names that appear in the player columns of ``contest_df`` (every column
    except ``'BaseName'`` and ``'EntryCount'``) but not in
    ``projections_df['player_names']`` are fuzzy-matched against the projection
    names. A match scoring 100 is applied automatically; every other name is
    queued in ``st.session_state.players_to_process`` and presented one at a
    time with a radio selector of the top three candidates.

    Confirming a selection stores the updated frame in
    ``st.session_state['contest_df']``, advances
    ``st.session_state.current_player_index`` and calls ``st.rerun()``, so the
    function does not return on that path.

    Args:
        contest_df: DataFrame whose non-``BaseName``/``EntryCount`` columns
            hold player names.
        projections_df: DataFrame expected to contain a ``'player_names'``
            column.

    Returns:
        Tuple ``(contest_df, projections_df)`` of copies of the inputs, with
        ``contest_df`` carrying any replacements made during this call. If the
        ``'player_names'`` column is missing, an error is shown and the
        unmodified copies are returned.
    """
    # Work on copies so the caller's dataframes are never mutated.
    projections_df = projections_df.copy()
    contest_df = contest_df.copy()

    name_columns = [col for col in contest_df.columns if col not in ('BaseName', 'EntryCount')]

    if 'player_names' not in projections_df.columns:
        st.error("No 'player_names' column found in projections file")
        return contest_df, projections_df

    # Get unique player names from portfolio and projections
    portfolio_players = set()
    for col in name_columns:
        portfolio_players.update(contest_df[col].unique())
    projection_players = set(projections_df['player_names'].unique())
    # Only strings are offered as fuzzy-match candidates, mirroring the
    # non-string skip applied to portfolio names below (e.g. NaN cells).
    projection_players_list = [name for name in projection_players if isinstance(name, str)]

    # Find players in portfolio that are missing from projections
    players_missing_from_projections = list(portfolio_players - projection_players)

    # Automatically handle 100% matches before starting interactive process
    players_to_process = []
    for player in players_missing_from_projections:
        if not isinstance(player, str):
            st.warning(f"Skipping non-string value: {player}")
            continue
        closest_matches = process.extract(player, projection_players_list, limit=1)
        # An empty candidate list yields no matches; guard before indexing.
        if closest_matches and closest_matches[0][1] == 100:
            match_name = closest_matches[0][0]
            _replace_player_name(contest_df, name_columns, player, match_name)
            st.success(f"Automatically matched '{player}' with '{match_name}' (100% match)")
        else:
            players_to_process.append(player)

    # Initialize session state for tracking the current player. Both keys are
    # checked so a partially populated session cannot raise AttributeError.
    # NOTE(review): once these keys exist the queue is not rebuilt from new
    # inputs, and after completion it is reset to an empty list below --
    # confirm this is the intended behavior when a new file is loaded.
    if ('current_player_index' not in st.session_state
            or 'players_to_process' not in st.session_state):
        st.session_state.current_player_index = 0
        st.session_state.players_to_process = players_to_process

    # Display results
    if players_missing_from_projections:
        st.warning("Players in portfolio but missing from projections")

        # Display remaining players
        remaining_players = st.session_state.players_to_process[st.session_state.current_player_index:]
        st.info(f"Remaining players to process ({len(remaining_players)}):\n" +
                "\n".join(f"- {player}" for player in remaining_players))

        if st.session_state.current_player_index < len(st.session_state.players_to_process):
            current_player = st.session_state.players_to_process[st.session_state.current_player_index]

            # Find the top 3 closest matches
            closest_matches = process.extract(current_player, projection_players_list, limit=3)

            st.write(f"**Missing Player {st.session_state.current_player_index + 1} of {len(st.session_state.players_to_process)}:** {current_player}")

            # Map each radio label back to its candidate name so the selection
            # never has to be parsed out of the label text (names may
            # themselves contain " (").
            name_by_label = {f"{match[0]} ({match[1]}%)": match[0] for match in closest_matches}
            options = list(name_by_label)
            options.append("None of these")

            selected_option = st.radio(
                "Select correct match:",
                options,
                key=f"radio_{current_player}"
            )

            if st.button("Confirm Selection"):
                if selected_option in name_by_label:
                    selected_name = name_by_label[selected_option]
                    _replace_player_name(contest_df, name_columns, current_player, selected_name)
                    st.success(f"Replaced '{current_player}' with '{selected_name}'")
                    st.session_state['contest_df'] = contest_df

                # Move to next player
                st.session_state.current_player_index += 1
                st.rerun()
        else:
            st.success("All players have been processed!")
            # Reset the index for future runs
            st.session_state.current_player_index = 0
            st.session_state.players_to_process = []
    else:
        st.success("All portfolio players found in projections!")

    return contest_df, projections_df