import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr

# Loose type aliases used throughout this module.
PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str, Tuple[str, int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit


def string_match_array(to_match: array, choices: array, index_name: str, matched_name: str) -> PandasDataFrame:
    '''
    Fuzzy match every string in to_match against choices and return a dataframe
    of the single best match per input string.

    Relies on _create_frame (defined elsewhere in this project) to build the
    output dataframe from the {name: (match, score, index)} mapping.
    '''
    temp = {name: process.extractOne(name, choices) for name in to_match}
    return _create_frame(matched_results=temp, index_name=index_name, matched_name=matched_name)


# Fuzzy match algorithm
def create_fuzzy_matched_col(df: PandasDataFrame, orig_match_address_series: PandasSeries, pred_match_address_series: PandasSeries, fuzzy_method: str = "WRatio", match_score=95):
    '''
    Score each row's original address against its predicted address and append
    two columns to df: '<orig col>_fuzz_score' (the rapidfuzz score, NaN when
    both strings are empty) and '<orig col>_fuzz_match' (score >= match_score,
    NaN where the score is NaN).

    fuzzy_method names a scorer attribute on rapidfuzz.fuzz (e.g. "WRatio").
    NOTE(fix): this parameter was previously written as the bare annotation
    `fuzzy_method:"WRatio"` with no default, which made it a required argument;
    it is now a real default, which is backward compatible for all callers.
    '''
    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():
        predict_string = df[pred_match_address_series][orig_index]
        if (orig_string == '') and (predict_string == ''):
            # Nothing to compare on either side - leave the score missing
            results.append(np.nan)
        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer=getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    # Where no score could be computed the match flag is also unknown
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df


def string_match_by_post_code_multiple(match_address_series: PandasSeries, reference_address_series: PandasSeries, search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress()) -> MatchedResults:
    '''
    Matches by Series values; for example idx is post code and values address. Search field is reduced by comparing same post codes address reference_address_series.

    Default scorer is fuzz.Wratio. This tries to weight the different algorithms to give the best score. Choice of ratio type seems to make a big difference. Looking at this link: https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ and this one: https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings

    Returns one long dataframe of (postcode, search address, reference address,
    fuzzy score) rows, concatenated across all postcodes.
    NOTE: search_limit is currently unused inside the matching (the head()
    truncation was retired when multiple results per address were introduced);
    the parameter is retained for interface compatibility.
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
        '''Run cdist for one postcode's search addresses vs its reference addresses.'''

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
            '''Flatten the cdist score matrix into long-format rows.'''
            results = []

            # Iterate through the matched score matrix and store results in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))

            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])
            return matched_out

        try:
            if isinstance(reference_addresses, str):
                # reference_addresses can be a str -> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
            else:
                # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)

            # Transform results into a dataframe (shared by both branches above)
            matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)

        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out

    def apply_fuzzy_matching(postcode_match: str, search_addresses: PandasSeries, reference_addresses: PandasSeries, scorer: callable, search_limit: int) -> tuple:
        '''Wrapper around do_one_match that degrades to an empty frame on KeyError.'''
        try:
            return do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
        except KeyError:
            return pd.DataFrame()

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    # Reset both series to dataframes keyed by postcode, with a positional index column
    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0, len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0, len(reference_address_df)))

    # Apply the match functions to each address, one postcode block at a time
    scorer = getattr(fuzz, scorer_name)
    match_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
        postcode_match_list = [postcode_match]

        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]
            if isinstance(reference_addresses, str):
                # reference_addresses can be a str -> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Collect per-postcode result frames for a single concat at the end
        match_list.append(matched)

    out_frame = pd.concat(match_list)

    return out_frame


def _create_fuzzy_match_results_output(results: PandasDataFrame, search_df_after_stand: PandasDataFrame, ref_df_cleaned: PandasDataFrame, ref_df_after_stand: PandasDataFrame, fuzzy_match_limit: int, search_df_cleaned: PandasDataFrame, search_df_key_field: str, new_join_col: str, standardise: bool, blocker_col: str):
    '''
    Take fuzzy match outputs, create shortlist dataframes, rearrange, return
    diagnostics and shortlist dataframes for export.
    '''
    ## Diagnostics
    diag_shortlist, diag_best_match = refine_export_results(results_df=results,
                                                            matched_df=search_df_after_stand,
                                                            ref_list_df=ref_df_after_stand,
                                                            fuzzy_match_limit=fuzzy_match_limit,
                                                            blocker_col=blocker_col)

    ## Fuzzy search results
    match_results_cols = ['search_orig_address', 'reference_orig_address', 'ref_index', 'full_match',
                          'full_number_match', 'flat_number_match', 'room_number_match', 'block_number_match',
                          'unit_number_match', 'property_number_match', 'close_postcode_match',
                          'house_court_name_match', 'fuzzy_score_match', "fuzzy_score", "wratio_score",
                          'property_number_search', 'property_number_reference', 'flat_number_search',
                          'flat_number_reference', 'room_number_search', 'room_number_reference',
                          'unit_number_search', 'unit_number_reference', 'block_number_search',
                          'block_number_reference', 'house_court_name_search', 'house_court_name_reference',
                          "search_mod_address", 'reference_mod_address', 'Postcode']

    # Join results data onto the original housing list to create the full output
    search_df_cleaned_join_cols = [search_df_key_field, "full_address", "postcode"]

    match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
        diag_best_match[match_results_cols], how="left", left_on="full_address", right_on="search_orig_address")

    match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis=1).rename(columns={"full_address": "search_orig_address"})

    # Join UPRN back onto the data from reference data
    joined_ref_cols = ["fulladdress", "Reference file"]
    joined_ref_cols.extend(new_join_col)

    print("joined_ref_cols: ", joined_ref_cols)

    # Keep only columns that exist in reference dataset
    joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]

    match_results_output = pd.merge(match_results_output, ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"),
                                    how="left", left_on="reference_orig_address", right_on="fulladdress").drop("fulladdress", axis=1)

    # Convert long keys to string to avoid data loss
    match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
    match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
    match_results_output["standardised_address"] = standardise

    match_results_output = match_results_output.sort_values(search_df_key_field, ascending=True)

    return match_results_output, diag_shortlist, diag_best_match


def create_diag_shortlist(results_df: PandasDataFrame, matched_col: str, fuzzy_match_limit: int, blocker_col: str, fuzzy_col: str = "fuzzy_score", search_mod_address: str = "search_mod_address", resolve_tie_breaks: bool = True, no_number_fuzzy_match_limit: int = no_number_fuzzy_match_limit) -> PandasDataFrame:
    '''
    Create a shortlist of the best matches from a list of suggested matches.

    Keeps, for each search address, every candidate sharing the highest fuzzy
    score, derives per-component match flags (property/flat/room/block/unit/
    house-court-name), a close-postcode flag, and an overall full_match flag,
    optionally resolves score ties with a secondary fuzz.ratio pass, and
    returns the shortlist sorted best-first per search address.
    '''
    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)
    results_df = pd.merge(results_df, results_max_fuzzy_score, how="left", on=matched_col)
    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])]

    # Fuzzy match limit for records with no numbers in it is 0.95 or the provided fuzzy_match_limit, whichever is higher
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)

    # Replace fuzzy_score_match values for addresses with no numbers in them
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False

    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis=1)

    # Following considers full matches to be those that match on property number and flat number, and the postcode is relatively close.
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match is currently considered only a match between property number and flat number
    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    ### Postcodes need to be close together, so all the characters should match apart from the last two
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ", "").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ", "").str[:-2]

    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)

    diag_shortlist = diag_shortlist.rename(columns={"reference_list_address": "reference_mod_address"})

    ### Dealing with tie breaks ##
    # Do a backup simple Wratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer=fuzz.ratio, fuzzy_col=fuzzy_col):
            # Single-pair cdist; [0][0] extracts the lone score from the 1x1 matrix
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset=[search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how="left")

    # Guarantee the column exists even when no tie break ran
    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = ''

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"], ascending=[True, False, False, False, False])

    return diag_shortlist


def refine_export_results(results_df: PandasDataFrame, matched_df: PandasDataFrame, ref_list_df: PandasDataFrame, matched_col="fuzzy_match_search_address", ref_list_col="fuzzy_match_reference_address", final_matched_address_col="search_address_stand", final_ref_address_col="ref_address_stand", orig_matched_address_col="full_address", orig_ref_address_col="fulladdress", fuzzy_match_limit=fuzzy_match_limit, blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the
    'matched results' according the score limit specified by the user and
    exports results list, matched and unmatched files.

    Returns (diag_shortlist, diag_best_match): the full ordered shortlist and
    the single best candidate per search address.
    '''
    # Rename score column
    results_df = results_df.rename(columns={"score": "fuzzy_score"})

    # Remove empty addresses
    results_df = results_df[results_df[matched_col] != 0]

    ### Join property number and flat/room number etc. onto results_df
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number", "flat_number", "room_number", "block_number", "unit_number", 'house_court_name', orig_ref_address_col, "Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col: 'reference_list_address'})
    results_df = results_df.merge(ref_list_df, how="left", left_on=ref_list_col, right_on="reference_list_address")

    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col, "property_number", "flat_number", "room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col: "search_orig_address", final_matched_address_col: 'search_mod_address'})
    results_df = results_df.merge(matched_df, how="left", left_on=matched_col, right_on="search_mod_address", suffixes=("_reference", "_search"))

    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address', 'reference_orig_address', 'ref_index', 'full_match',
                          'full_number_match', 'flat_number_match', 'room_number_match', 'block_number_match',
                          'unit_number_match', 'house_court_name_match', 'property_number_match',
                          'close_postcode_match', 'fuzzy_score_match', "fuzzy_score", "wratio_score",
                          'property_number_search', 'property_number_reference', 'flat_number_search',
                          'flat_number_reference', 'room_number_search', 'room_number_reference',
                          'block_number_search', 'block_number_reference', 'unit_number_search',
                          'unit_number_reference', 'house_court_name_search', 'house_court_name_reference',
                          "search_mod_address", 'reference_mod_address', 'postcode', 'Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")

    return diag_shortlist, diag_best_match


def join_to_orig_df(match_results_output: PandasDataFrame, search_df: PandasDataFrame, search_df_key_field: str, new_join_col: List[str]) -> PandasDataFrame:
    '''
    Following the fuzzy match, join the match results back to the original
    search dataframe to create a results dataframe.
    '''
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again
    match_results_output_success = match_results_output_success.rename(columns={
        "reference_orig_address": "Reference matched address",
        "full_match": "Matched with reference address",
        'uprn': 'UPRN'
    }, errors="ignore")

    ref_df_after_stand_cols = ["ref_index", "Reference matched address", "Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)

    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on=search_df_key_field, how="left", suffixes=('', '_y'))
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], how="left", on=search_df_key_field, suffixes=('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column
    if "Reference matched address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)

    if "Matched with reference address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))

    if "Reference file_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)

    if "UPRN_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful
    # NOTE(fix): the last five names below were previously missing commas, so
    # Python concatenated them into one bogus column name and none of them was
    # ever dropped (silently, because errors="ignore").
    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y',
                                                              'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y',
                                                              "full_address_search", "postcode_search", "full_address_1", "full_address_2",
                                                              "full_address", "address_stand", "property_number",
                                                              "prop_number", "flat_number", "apart_number", "first_sec_number", "room_number"], axis=1, errors="ignore")

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)

    # NOTE(review): with regex=False, DataFrame.replace only replaces cells whose
    # entire value equals ".0" / "nan" - it does not strip ".0" suffixes from
    # longer strings. Confirm this is the intended behaviour.
    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(".0", "", regex=False).replace("nan", "", regex=False)

    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)

    return results_for_orig_df_join