import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr

# Loose type aliases used throughout this module.
PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str, Tuple[str, int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit


def string_match_array(to_match: array, choices: array, index_name: str, matched_name: str) -> PandasDataFrame:
    '''
    Fuzzy match every string in to_match against choices and return a dataframe
    of the single best match per input string.

    Relies on _create_frame (defined elsewhere in this project) to build the
    output dataframe from the {name: (match, score, index)} mapping.
    '''
    temp = {name: process.extractOne(name, choices) for name in to_match}
    return _create_frame(matched_results=temp, index_name=index_name, matched_name=matched_name)


# Fuzzy match algorithm
def create_fuzzy_matched_col(df: PandasDataFrame, orig_match_address_series: PandasSeries, pred_match_address_series: PandasSeries, fuzzy_method: str = "WRatio", match_score=95):
    '''
    Score each row's original address against its predicted address and append
    two columns to df: '<orig col>_fuzz_score' (the rapidfuzz score, NaN when
    both strings are empty) and '<orig col>_fuzz_match' (score >= match_score,
    NaN where the score is NaN).

    fuzzy_method names a scorer attribute on rapidfuzz.fuzz (e.g. "WRatio").
    NOTE(fix): this parameter was previously written as the bare annotation
    `fuzzy_method:"WRatio"` with no default, which made it a required argument;
    it is now a real default, which is backward compatible for all callers.
    '''
    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():
        predict_string = df[pred_match_address_series][orig_index]
        if (orig_string == '') and (predict_string == ''):
            # Nothing to compare on either side - leave the score missing
            results.append(np.nan)
        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer=getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    # Where no score could be computed the match flag is also unknown
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df


def string_match_by_post_code_multiple(match_address_series: PandasSeries, reference_address_series: PandasSeries, search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress()) -> MatchedResults:
    '''
    Matches by Series values; for example idx is post code and values address. Search field is reduced by comparing same post codes address reference_address_series.

    Default scorer is fuzz.Wratio. This tries to weight the different algorithms to give the best score. Choice of ratio type seems to make a big difference. Looking at this link: https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ and this one: https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings

    Returns one long dataframe of (postcode, search address, reference address,
    fuzzy score) rows, concatenated across all postcodes.
    NOTE: search_limit is currently unused inside the matching (the head()
    truncation was retired when multiple results per address were introduced);
    the parameter is retained for interface compatibility.
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
        '''Run cdist for one postcode's search addresses vs its reference addresses.'''

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
            '''Flatten the cdist score matrix into long-format rows.'''
            results = []

            # Iterate through the matched score matrix and store results in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))

            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])
            return matched_out

        try:
            if isinstance(reference_addresses, str):
                # reference_addresses can be a str -> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
            else:
                # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)

            # Transform results into a dataframe (shared by both branches above)
            matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)

        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out

    def apply_fuzzy_matching(postcode_match: str, search_addresses: PandasSeries, reference_addresses: PandasSeries, scorer: callable, search_limit: int) -> tuple:
        '''Wrapper around do_one_match that degrades to an empty frame on KeyError.'''
        try:
            return do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
        except KeyError:
            return pd.DataFrame()

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    # Reset both series to dataframes keyed by postcode, with a positional index column
    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0, len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0, len(reference_address_df)))

    # Apply the match functions to each address, one postcode block at a time
    scorer = getattr(fuzz, scorer_name)
    match_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
        postcode_match_list = [postcode_match]

        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]
            if isinstance(reference_addresses, str):
                # reference_addresses can be a str -> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Collect per-postcode result frames for a single concat at the end
        match_list.append(matched)

    out_frame = pd.concat(match_list)

    return out_frame


def _create_fuzzy_match_results_output(results: PandasDataFrame, search_df_after_stand: PandasDataFrame, ref_df_cleaned: PandasDataFrame, ref_df_after_stand: PandasDataFrame, fuzzy_match_limit: int, search_df_cleaned: PandasDataFrame, search_df_key_field: str, new_join_col: str, standardise: bool, blocker_col: str):
    '''
    Take fuzzy match outputs, create shortlist dataframes, rearrange, return
    diagnostics and shortlist dataframes for export.
    '''
    ## Diagnostics
    diag_shortlist, diag_best_match = refine_export_results(results_df=results,
                                                            matched_df=search_df_after_stand,
                                                            ref_list_df=ref_df_after_stand,
                                                            fuzzy_match_limit=fuzzy_match_limit,
                                                            blocker_col=blocker_col)

    ## Fuzzy search results
    match_results_cols = ['search_orig_address', 'reference_orig_address', 'ref_index', 'full_match',
                          'full_number_match', 'flat_number_match', 'room_number_match', 'block_number_match',
                          'unit_number_match', 'property_number_match', 'close_postcode_match',
                          'house_court_name_match', 'fuzzy_score_match', "fuzzy_score", "wratio_score",
                          'property_number_search', 'property_number_reference', 'flat_number_search',
                          'flat_number_reference', 'room_number_search', 'room_number_reference',
                          'unit_number_search', 'unit_number_reference', 'block_number_search',
                          'block_number_reference', 'house_court_name_search', 'house_court_name_reference',
                          "search_mod_address", 'reference_mod_address', 'Postcode']

    # Join results data onto the original housing list to create the full output
    search_df_cleaned_join_cols = [search_df_key_field, "full_address", "postcode"]

    match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
        diag_best_match[match_results_cols], how="left", left_on="full_address", right_on="search_orig_address")

    match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis=1).rename(columns={"full_address": "search_orig_address"})

    # Join UPRN back onto the data from reference data
    joined_ref_cols = ["fulladdress", "Reference file"]
    joined_ref_cols.extend(new_join_col)

    print("joined_ref_cols: ", joined_ref_cols)

    # Keep only columns that exist in reference dataset
    joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]

    match_results_output = pd.merge(match_results_output, ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"),
                                    how="left", left_on="reference_orig_address", right_on="fulladdress").drop("fulladdress", axis=1)

    # Convert long keys to string to avoid data loss
    match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
    match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
    match_results_output["standardised_address"] = standardise

    match_results_output = match_results_output.sort_values(search_df_key_field, ascending=True)

    return match_results_output, diag_shortlist, diag_best_match


def create_diag_shortlist(results_df: PandasDataFrame, matched_col: str, fuzzy_match_limit: int, blocker_col: str, fuzzy_col: str = "fuzzy_score", search_mod_address: str = "search_mod_address", resolve_tie_breaks: bool = True, no_number_fuzzy_match_limit: int = no_number_fuzzy_match_limit) -> PandasDataFrame:
    '''
    Create a shortlist of the best matches from a list of suggested matches.

    Keeps, for each search address, every candidate sharing the highest fuzzy
    score, derives per-component match flags (property/flat/room/block/unit/
    house-court-name), a close-postcode flag, and an overall full_match flag,
    optionally resolves score ties with a secondary fuzz.ratio pass, and
    returns the shortlist sorted best-first per search address.
    '''
    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)
    results_df = pd.merge(results_df, results_max_fuzzy_score, how="left", on=matched_col)
    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])]

    # Fuzzy match limit for records with no numbers in it is 0.95 or the provided fuzzy_match_limit, whichever is higher
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)

    # Replace fuzzy_score_match values for addresses with no numbers in them
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False

    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis=1)

    # Following considers full matches to be those that match on property number and flat number, and the postcode is relatively close.
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match is currently considered only a match between property number and flat number
    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    ### Postcodes need to be close together, so all the characters should match apart from the last two
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ", "").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ", "").str[:-2]

    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)

    diag_shortlist = diag_shortlist.rename(columns={"reference_list_address": "reference_mod_address"})

    ### Dealing with tie breaks ##
    # Do a backup simple Wratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer=fuzz.ratio, fuzzy_col=fuzzy_col):
            # Single-pair cdist; [0][0] extracts the lone score from the 1x1 matrix
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset=[search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how="left")

    # Guarantee the column exists even when no tie break ran
    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = ''

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"], ascending=[True, False, False, False, False])

    return diag_shortlist


def refine_export_results(results_df: PandasDataFrame, matched_df: PandasDataFrame, ref_list_df: PandasDataFrame, matched_col="fuzzy_match_search_address", ref_list_col="fuzzy_match_reference_address", final_matched_address_col="search_address_stand", final_ref_address_col="ref_address_stand", orig_matched_address_col="full_address", orig_ref_address_col="fulladdress", fuzzy_match_limit=fuzzy_match_limit, blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the
    'matched results' according the score limit specified by the user and
    exports results list, matched and unmatched files.

    Returns (diag_shortlist, diag_best_match): the full ordered shortlist and
    the single best candidate per search address.
    '''
    # Rename score column
    results_df = results_df.rename(columns={"score": "fuzzy_score"})

    # Remove empty addresses
    results_df = results_df[results_df[matched_col] != 0]

    ### Join property number and flat/room number etc. onto results_df
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number", "flat_number", "room_number", "block_number", "unit_number", 'house_court_name', orig_ref_address_col, "Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col: 'reference_list_address'})
    results_df = results_df.merge(ref_list_df, how="left", left_on=ref_list_col, right_on="reference_list_address")

    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col, "property_number", "flat_number", "room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col: "search_orig_address", final_matched_address_col: 'search_mod_address'})
    results_df = results_df.merge(matched_df, how="left", left_on=matched_col, right_on="search_mod_address", suffixes=("_reference", "_search"))

    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address', 'reference_orig_address', 'ref_index', 'full_match',
                          'full_number_match', 'flat_number_match', 'room_number_match', 'block_number_match',
                          'unit_number_match', 'house_court_name_match', 'property_number_match',
                          'close_postcode_match', 'fuzzy_score_match', "fuzzy_score", "wratio_score",
                          'property_number_search', 'property_number_reference', 'flat_number_search',
                          'flat_number_reference', 'room_number_search', 'room_number_reference',
                          'block_number_search', 'block_number_reference', 'unit_number_search',
                          'unit_number_reference', 'house_court_name_search', 'house_court_name_reference',
                          "search_mod_address", 'reference_mod_address', 'postcode', 'Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")

    return diag_shortlist, diag_best_match


def join_to_orig_df(match_results_output: PandasDataFrame, search_df: PandasDataFrame, search_df_key_field: str, new_join_col: List[str]) -> PandasDataFrame:
    '''
    Following the fuzzy match, join the match results back to the original
    search dataframe to create a results dataframe.
    '''
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again
    match_results_output_success = match_results_output_success.rename(columns={
        "reference_orig_address": "Reference matched address",
        "full_match": "Matched with reference address",
        'uprn': 'UPRN'
    }, errors="ignore")

    ref_df_after_stand_cols = ["ref_index", "Reference matched address", "Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)

    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on=search_df_key_field, how="left", suffixes=('', '_y'))
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], how="left", on=search_df_key_field, suffixes=('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column
    if "Reference matched address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)

    if "Matched with reference address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))

    if "Reference file_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)

    if "UPRN_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful
    # NOTE(fix): the last five names below were previously missing commas, so
    # Python concatenated them into one bogus column name and none of them was
    # ever dropped (silently, because errors="ignore").
    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y',
                                                              'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y',
                                                              "full_address_search", "postcode_search", "full_address_1", "full_address_2",
                                                              "full_address", "address_stand", "property_number",
                                                              "prop_number", "flat_number", "apart_number", "first_sec_number", "room_number"], axis=1, errors="ignore")

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)

    # NOTE(review): with regex=False, DataFrame.replace only replaces cells whose
    # entire value equals ".0" / "nan" - it does not strip ".0" suffixes from
    # longer strings. Confirm this is the intended behaviour.
    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(".0", "", regex=False).replace("nan", "", regex=False)

    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)

    return results_for_orig_df_join