DFS_Contest_Analyzer / global_func /find_csv_mismatches.py
James McCool
Add functionality for player name cleaning and CSV mismatch detection
d04558f
raw
history blame
4.28 kB
import streamlit as st
import numpy as np
import pandas as pd
from fuzzywuzzy import process
def find_csv_mismatches(csv_df: pd.DataFrame, projections_df: pd.DataFrame) -> pd.DataFrame:
    """Align the ``Name`` column of a CSV frame with the projections' player names.

    Names that appear in ``csv_df['Name']`` but not in
    ``projections_df['player_names']`` are fuzzy-matched against the
    projection names:

    * a candidate scoring exactly 100 is applied automatically;
    * every other name is queued in ``st.session_state`` and resolved one at
      a time through a Streamlit radio/confirm widget. A confirmed choice is
      written to ``st.session_state['csv_file']`` before ``st.rerun()`` is
      called.

    Args:
        csv_df: Frame expected to contain a ``Name`` column.
        projections_df: Frame expected to contain a ``player_names`` column.

    Returns:
        A copy of ``csv_df`` with the replacements made during this run.
        If either required column is missing, an error is shown and the
        unmodified copy is returned. Neither input frame is mutated.
    """
    # Work on copies so the caller's frames are never mutated.
    csv_df = csv_df.copy()
    projections_df = projections_df.copy()

    if 'Name' not in csv_df.columns:
        st.error("No 'Name' column found in CSV file")
        return csv_df
    if 'player_names' not in projections_df.columns:
        st.error("No 'player_names' column found in projections file")
        return csv_df

    # Unique, non-null names on each side.
    csv_players = set(csv_df['Name'].dropna().unique())
    projection_players = set(projections_df['player_names'].dropna().unique())

    # Fuzzy-match candidates are the *projection* names (the canonical
    # spelling the CSV is rewritten to). Only strings are kept because the
    # matcher operates on strings; sorting keeps tie-breaking deterministic.
    projection_players_list = sorted(
        name for name in projection_players if isinstance(name, str)
    )

    # Names present in the CSV that have no exact counterpart in projections.
    players_missing_from_projections = sorted(
        csv_players - projection_players, key=str
    )

    # Automatically handle 100% matches before starting the interactive process.
    players_to_process = []
    for player in players_missing_from_projections:
        if not isinstance(player, str):
            st.warning(f"Skipping non-string value: {player}")
            continue

        closest_matches = process.extract(player, projection_players_list, limit=1)
        # `closest_matches` is empty when there are no candidates at all.
        if closest_matches and closest_matches[0][1] == 100:
            match_name = closest_matches[0][0]
            # Rewrite the CSV name to the projection spelling.
            csv_df.loc[csv_df['Name'] == player, 'Name'] = match_name
            st.success(f"Automatically matched '{player}' with '{match_name}' (100% match)")
        else:
            players_to_process.append(player)

    # Initialise the interactive queue only once; later reruns keep walking
    # the same list via the stored index.
    # NOTE(review): after the reset at the end of this function the keys still
    # exist, so a later call with different data will not rebuild the queue --
    # confirm whether the caller clears these keys when a new file is loaded.
    if 'csv_current_player_index' not in st.session_state:
        st.session_state.csv_current_player_index = 0
        st.session_state.csv_players_to_process = players_to_process

    # Display results
    if players_missing_from_projections:
        st.warning("Players in CSV but missing from projections")

        queue = st.session_state.csv_players_to_process
        position = st.session_state.csv_current_player_index

        # Display remaining players
        remaining_players = queue[position:]
        st.info(f"Remaining players to process ({len(remaining_players)}):\n" +
                "\n".join(f"- {player}" for player in remaining_players))

        if position < len(queue):
            current_player = queue[position]

            # Find the top 3 closest matches
            closest_matches = process.extract(current_player, projection_players_list, limit=3)

            st.write(f"**Missing Player {position + 1} of {len(queue)}:** {current_player}")

            # Map each radio label back to its candidate name so the name is
            # never recovered by parsing the label text (names may contain " (").
            option_to_name = {
                f"{match[0]} ({match[1]}%)": match[0] for match in closest_matches
            }
            options = list(option_to_name)
            options.append("None of these")

            selected_option = st.radio(
                "Select correct match:",
                options,
                key=f"csv_radio_{current_player}"
            )

            if st.button("Confirm Selection", key="csv_confirm"):
                if selected_option != "None of these":
                    selected_name = option_to_name[selected_option]
                    # Update CSV DataFrame
                    csv_df.loc[csv_df['Name'] == current_player, 'Name'] = selected_name
                    st.success(f"Replaced '{current_player}' with '{selected_name}'")
                    st.session_state['csv_file'] = csv_df
                # Move to next player (also when "None of these" was chosen)
                st.session_state.csv_current_player_index += 1
                st.rerun()
        else:
            st.success("All players have been processed!")
            # Reset the index for future runs
            st.session_state.csv_current_player_index = 0
            st.session_state.csv_players_to_process = []
    else:
        st.success("All CSV players found in projections!")

    return csv_df