File size: 24,757 Bytes
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4300019
 
 
 
 
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36bca81
 
 
 
dd1cbb4
 
 
 
 
 
 
 
 
 
 
4300019
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36bca81
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit

def string_match_array(to_match:array, choices:array,
                      index_name:str, matched_name:str) -> PandasDataFrame:
    """Fuzzy-match each string in to_match against choices and frame the results.

    For every name in to_match the single best candidate from choices is found
    with rapidfuzz's extractOne; the resulting mapping is handed to
    _create_frame to build the output dataframe.
    """
    best_matches = {}
    for name in to_match:
        best_matches[name] = process.extractOne(name, choices)

    return _create_frame(matched_results=best_matches, index_name=index_name,
                        matched_name=matched_name)

# Fuzzy match algorithm
def create_fuzzy_matched_col(df: pd.DataFrame, orig_match_address_series: str, pred_match_address_series: str, fuzzy_method: str = "WRatio", match_score: int = 95) -> pd.DataFrame:
    '''
    Score the similarity of two address columns row by row and add the results to df.

    Parameters:
        df: dataframe containing both address columns.
        orig_match_address_series: name of the column holding the original address.
        pred_match_address_series: name of the column holding the predicted address.
        fuzzy_method: name of a rapidfuzz.fuzz scorer (e.g. "WRatio", "ratio").
            NOTE: this was previously a bare string annotation, not a default,
            forcing callers to always pass it - it is now a real default.
        match_score: minimum score for the match flag to be True.

    Returns df with two new columns added in place:
        <orig col>_fuzz_score - the fuzzy score, NaN when both strings are empty.
        <orig col>_fuzz_match - True/False against match_score, NaN when no score.
    '''
    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():

        predict_string = df[pred_match_address_series][orig_index]

        if (orig_string == '') and (predict_string == ''):
            # Nothing to compare - leave the score blank rather than scoring 100
            results.append(np.nan)

        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer= getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    # Where no score could be computed the match flag is also unknown, not False
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df

def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
                              search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
    '''
    Fuzzy match search addresses against reference addresses, blocking on postcode.

    Both input Series are expected to be indexed by postcode with standardised
    address strings as values. For each unique postcode in the search series the
    candidate pool is restricted to reference addresses sharing that postcode,
    and every search/reference pair is scored with the rapidfuzz scorer named by
    scorer_name (default "token_set_ratio").

    Choice of ratio type seems to make a big difference. Looking at this link:
    https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    and this one: 
    https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings    

    Returns a dataframe with one row per scored pair and columns
    postcode_search, fuzzy_match_search_address, fuzzy_match_reference_address
    and fuzzy_score.

    NOTE(review): search_limit is accepted but the head(search_limit) cap is
    commented out below, so it currently has no effect - confirm intended.
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
        # Score every search address against every reference address for one
        # postcode and return the scores as a long-format dataframe.

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
            # Flatten the score matrix from process.cdist into one row per
            # (search address, reference address) pair.

            # Create a list to store the results
            results = []

            # Iterate through the matched dataframe and store results in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))

            # Create a dataframe from the results list
            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])

            return matched_out

        try:
            if isinstance(reference_addresses, str):  # reference_addresses can be a str-> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)

                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            else:  # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1) 

                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)

            # Keep only the top search_limit number of results - doesn't work anymore when working with multiple results
            #matched_out = matched_out.head(search_limit)

        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out
 
    def apply_fuzzy_matching(postcode_match:str, search_addresses:PandasSeries, reference_addresses:PandasSeries, scorer:callable, search_limit:int)-> tuple:
        # Thin wrapper around do_one_match that returns an empty dataframe on
        # KeyError instead of raising.
        try:
            matched = do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
            return matched
        except KeyError:
            matched = pd.DataFrame() #[("NA", 0)] # for _ in range(1, search_limit + 1)]
            return matched

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    # Move the postcode index into a regular column and add a positional index
    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0,len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0,len(reference_address_df)))

    
    # Apply the match functions to each address
    scorer = getattr(fuzz, scorer_name)                  
    results = {}
    #counter = 0

    index_list = []
    match_list = []
    search_addresses_list = []
    reference_addresses_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):

        postcode_match_list = [postcode_match]
        # Defaults in case the lookups below fail
        search_indexes = pd.Series()
        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            # Restrict both sides to addresses sharing the current postcode
            search_indexes = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "index"]
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]

            if isinstance(reference_addresses, str):  # reference_addresses can be a str-> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Write to output lists
        match_list.extend([matched])
        index_list.extend(search_indexes.tolist())
        search_addresses_list.extend(search_addresses.tolist())
        reference_addresses_list.extend(reference_addresses.tolist())

    # Combine the per-postcode score frames into a single output dataframe
    out_frame = pd.concat(match_list)

    return out_frame

def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_stand:PandasDataFrame, ref_df_cleaned:PandasDataFrame, ref_df_after_stand:PandasDataFrame, fuzzy_match_limit:int, search_df_cleaned:PandasDataFrame, search_df_key_field:str, new_join_col:str, standardise:bool, blocker_col:str):

        '''
        Take fuzzy match outputs, create shortlist dataframes, rearrange, return diagnostics and shortlist dataframes for export.

        Returns a tuple (match_results_output, diag_shortlist, diag_best_match):
        the per-search-record results table joined back onto the cleaned search
        data and reference join columns, plus the two diagnostic frames from
        refine_export_results.
        '''

        ## Diagnostics

        diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
                                      matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
                                      fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
        
        ## Fuzzy search results

        # Columns carried through from the best-match diagnostics into the output
        match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'property_number_match',
        'close_postcode_match',
        'house_court_name_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',  
        'flat_number_search', 'flat_number_reference', 
        'room_number_search', 'room_number_reference',
        'unit_number_search', 'unit_number_reference',
        'block_number_search', 'block_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address','Postcode']

        # Join results data onto the original housing list to create the full output
        search_df_cleaned_join_cols = [search_df_key_field, "full_address","postcode"]

        match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
            diag_best_match[match_results_cols], how = "left", left_on = "full_address", right_on = "search_orig_address")
        
        # Keep the search side's full_address as the canonical search_orig_address
        match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis = 1).rename(columns={"full_address":"search_orig_address"})
        
        # Join UPRN back onto the data from reference data
        joined_ref_cols = ["fulladdress", "Reference file"]
        joined_ref_cols.extend(new_join_col)

        print("joined_ref_cols: ", joined_ref_cols)
        # Keep only columns that exist in reference dataset
        joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]

        match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)

        # Convert long keys to string to avoid data loss
        match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
        match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
        # Record whether the standardised pipeline was used for this run
        match_results_output["standardised_address"] = standardise
    
        match_results_output = match_results_output.sort_values(search_df_key_field, ascending = True)
                
        return match_results_output, diag_shortlist, diag_best_match

def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_match_limit:int, blocker_col:str, fuzzy_col:str="fuzzy_score", search_mod_address:str = "search_mod_address", resolve_tie_breaks:bool=True, no_number_fuzzy_match_limit:int=no_number_fuzzy_match_limit) -> PandasDataFrame:
    '''
    Create a shortlist of the best matches from a list of suggested matches.

    For each search address, keeps every candidate whose fuzzy score equals the
    highest score seen for that address, derives component match flags
    (property/flat/room/block/unit number, house/court name, postcode) and an
    overall full_match flag, then sorts best-first. When resolve_tie_breaks is
    True, candidates with identical fuzzy scores are re-scored with fuzz.ratio
    ("wratio_score") to break ties.
    '''

    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)

    results_df = pd.merge(results_df, results_max_fuzzy_score, how = "left", on = matched_col)

    # Take an explicit copy: the original sliced a view of results_df and then
    # wrote to it with .loc, which raises SettingWithCopyWarning and can
    # silently lose the writes.
    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])].copy()

    # Fuzzy match limit for records with no numbers in it is 0.95 or the provided fuzzy_match_limit, whichever is higher
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)


    # Replace fuzzy_score_match values for addresses with no numbers in them
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False
                            
    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis = 1)

    # Following considers full matches to be those that match on property number and flat number, and the postcode is relatively close.
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match requires every numeric/name component to agree
    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    
    ### Postcodes need to be close together, so all the characters should match apart from the last two 
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ","").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ","").str[:-2]
        
    
    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)
    
    diag_shortlist = diag_shortlist.rename(columns = {"reference_list_address":"reference_mod_address"})

    ### Dealing with tie breaks ##
    # Do a backup simple ratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer = fuzz.ratio, fuzzy_col = fuzzy_col):
            # Re-score one candidate pair with the simple ratio scorer
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset= [search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
                                
            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")

    # Guarantee the column exists even when no tie-breaking was needed
    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = '' 

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([
        search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"],
        ascending = [True, False, False, False, False])          

    return diag_shortlist

def refine_export_results(results_df:PandasDataFrame, 
                           matched_df:PandasDataFrame,
                           ref_list_df:PandasDataFrame,
                           matched_col="fuzzy_match_search_address",
                           ref_list_col="fuzzy_match_reference_address",
                           final_matched_address_col="search_address_stand",
                           final_ref_address_col="ref_address_stand",
                           orig_matched_address_col = "full_address",
                           orig_ref_address_col = "fulladdress",
                           fuzzy_match_limit=fuzzy_match_limit,
                           blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the 'matched results' according
    to the score limit specified by the user and exports results list, matched and unmatched files.

    Returns a tuple (diag_shortlist, diag_best_match): the full shortlist of
    best-scoring candidates per search address, and the single best candidate
    per search address (first row after score-descending ordering).
    '''
       
    # Rename score column produced by the fuzzy matcher
    results_df = results_df.rename(columns = {"score":"fuzzy_score"})
          
    # Remove empty addresses
    results_df = results_df[results_df[matched_col] !=0 ]

    ### Join property number and flat/room number etc. onto results_df
    # Work on a copy: the original assigned "ref_index" directly into the
    # caller's dataframe, mutating it as a side effect.
    ref_list_df = ref_list_df.copy()
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})

    results_df = results_df.merge(ref_list_df, how = "left", left_on = ref_list_col, right_on = "reference_list_address")


    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
    
    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
    
    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'house_court_name_match',
        'property_number_match',
        'close_postcode_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',  
        'flat_number_search', 'flat_number_reference', 
        'room_number_search', 'room_number_reference',
        'block_number_search', 'block_number_reference',
        'unit_number_search', 'unit_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address', 'postcode','Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
   
    return diag_shortlist, diag_best_match

def join_to_orig_df(match_results_output: pd.DataFrame, search_df: pd.DataFrame, search_df_key_field: str, new_join_col: List[str]) -> pd.DataFrame:
    ''' 
    Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.

    Only successful matches (full_match == True) are joined on. Where the join
    columns already exist in search_df, the original values are kept and gaps
    are filled from the newly-joined ('_y' suffixed) columns.
    '''
    # Keep only rows judged a full match
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again 

    match_results_output_success = match_results_output_success.rename(columns={
                                        "reference_orig_address":"Reference matched address",
                                        "full_match":"Matched with reference address",
                                        'uprn':'UPRN'                                                                             
                                     }, errors="ignore")
    
    ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)
 
    
    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on = search_df_key_field, how = "left", suffixes = ('', '_y'))  
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols],how = "left", on = search_df_key_field, suffixes = ('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column

    if "Reference matched address_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)

    if "Matched with reference address_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))

    if "Reference file_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)

    if "UPRN_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful. NOTE: the original list was missing
    # commas between several names, which silently concatenated them into one
    # never-matching column name, so those working columns were never dropped -
    # fixed here.
    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2", "full_address",
                                   "address_stand", "property_number", "prop_number", "flat_number", "apart_number", "first_sec_number", "room_number"], axis = 1, errors = "ignore")

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)   

    # Strip the trailing ".0" that float-cast keys pick up, and blank out "nan".
    # (The previous non-regex replace only matched cells that were exactly ".0".)
    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(r"\.0$", "", regex=True).replace("nan","", regex=False)
    
    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
  
    
    return results_for_orig_df_join