# address_matcher/tools/fuzzy_match.py
# (Page metadata from the hosting site, preserved as comments:
#  author: seanpedrickcase — "Some code rearranged. Fixed API call paths for Linux systems",
#  commit 4300019)
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr
PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]
today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")
from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit
def string_match_array(to_match:array, choices:array,
    index_name:str, matched_name:str) -> PandasDataFrame:
    '''
    Fuzzy-match every string in `to_match` against `choices` and return the
    results as a dataframe built by `_create_frame`.

    Parameters:
        to_match: Strings to look up.
        choices: Candidate strings to match against.
        index_name: Name for the index column of the output frame.
        matched_name: Name for the matched-value column of the output frame.
    '''
    best_matches = {}
    for candidate in to_match:
        # process.extractOne returns the single closest choice with its score
        best_matches[candidate] = process.extractOne(candidate, choices)

    return _create_frame(matched_results=best_matches, index_name=index_name,
        matched_name=matched_name)
# Fuzzy match algorithm
def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:PandasSeries, pred_match_address_series:PandasSeries, fuzzy_method: str = "WRatio", match_score=95):
    '''
    Score each row's original address against its predicted address with a
    rapidfuzz scorer and flag rows that meet the match threshold.

    Adds two columns to `df` in place (and also returns it):
    `<orig col>_fuzz_score` (the fuzzy score, NaN when both strings are empty)
    and `<orig col>_fuzz_match` (True when score >= match_score, NaN when the
    score is NaN).

    Parameters:
        df: Dataframe holding both address columns.
        orig_match_address_series: Name of the original-address column.
        pred_match_address_series: Name of the predicted-address column.
        fuzzy_method: Name of a `rapidfuzz.fuzz` scorer (e.g. "WRatio").
            NOTE: the original signature annotated this as the literal
            "WRatio" with no default; it is now a proper default value, which
            is backward-compatible for all existing callers.
        match_score: Minimum score (0-100) counted as a match.
    '''
    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():
        predict_string = df[pred_match_address_series][orig_index]
        # Nothing to compare on either side -> no score
        if (orig_string == '') and (predict_string == ''):
            results.append(np.nan)
        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer=getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    # Rows with no score cannot be classified as a match either way
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df
def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
    search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
    '''
    Fuzzy-match search addresses against reference addresses, blocked by postcode.

    Matches by Series values; for example idx is post code and values address.
    The search field is reduced by comparing only reference addresses that share
    the search address's postcode. Default scorer is fuzz.token_set_ratio
    (any `rapidfuzz.fuzz` scorer name can be passed as `scorer_name`).
    Choice of ratio type seems to make a big difference. Looking at this link:
    https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    and this one:
    https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings

    Returns a dataframe with one row per scored (search, reference) pair:
    postcode_search, fuzzy_match_search_address, fuzzy_match_reference_address,
    fuzzy_score.
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
        # Score every search address against every reference address for one postcode.
        # NOTE(review): search_limit is currently unused here — the head() call below is commented out.

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
            # Create a list to store the results
            results = []
            # Iterate through the matched score matrix and store one row per pair in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))
            # Create a dataframe from the results list
            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])
            return matched_out

        try:
            if isinstance(reference_addresses, str): # reference_addresses can be a str-> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)
            else: # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)
            # Keep only the top search_limit number of results - doesn't work anymore when working with multiple results
            #matched_out = matched_out.head(search_limit)
        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out

    def apply_fuzzy_matching(postcode_match:str, search_addresses:PandasSeries, reference_addresses:PandasSeries, scorer:callable, search_limit:int)-> tuple:
        # Thin wrapper around do_one_match that degrades to an empty frame on KeyError
        try:
            matched = do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
            return matched
        except KeyError:
            matched = pd.DataFrame() #[("NA", 0)] # for _ in range(1, search_limit + 1)]
            return matched

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    # Reshape both Series into dataframes with a 'postcode_search' column and a positional index
    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0,len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0,len(reference_address_df)))

    # Apply the match functions to each address
    scorer = getattr(fuzz, scorer_name)
    results = {}
    #counter = 0
    index_list = []
    match_list = []
    search_addresses_list = []
    reference_addresses_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    # One fuzzy-match pass per unique postcode (progress reported through gradio)
    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
        postcode_match_list = [postcode_match]
        search_indexes = pd.Series()
        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            # Subset search and reference rows to this postcode only
            search_indexes = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "index"]
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]

            if isinstance(reference_addresses, str): # reference_addresses can be a str-> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Write to output lists
        match_list.extend([matched])
        index_list.extend(search_indexes.tolist())
        search_addresses_list.extend(search_addresses.tolist())
        reference_addresses_list.extend(reference_addresses.tolist())

    # Stack the per-postcode result frames into one output frame
    out_frame = pd.concat(match_list)

    return out_frame
def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_stand:PandasDataFrame, ref_df_cleaned:PandasDataFrame, ref_df_after_stand:PandasDataFrame, fuzzy_match_limit:int, search_df_cleaned:PandasDataFrame, search_df_key_field:str, new_join_col:str, standardise:bool, blocker_col:str):
    '''
    Take fuzzy match outputs, create shortlist dataframes, rearrange, return
    diagnostics and shortlist dataframes for export.

    Parameters:
        results: Raw fuzzy-match result rows.
        search_df_after_stand: Standardised search dataframe.
        ref_df_cleaned: Cleaned reference dataframe (source of join-back columns).
        ref_df_after_stand: Standardised reference dataframe.
        fuzzy_match_limit: Minimum fuzzy score counted as a match.
        search_df_cleaned: Cleaned search dataframe (source of key/full_address).
        search_df_key_field: Key column of the search dataframe.
        new_join_col: Reference-id column(s) (e.g. UPRN) to carry across.
        standardise: Whether addresses were standardised (recorded in output).
        blocker_col: Blocking column used ("Postcode" or "Street").

    Returns:
        (match_results_output, diag_shortlist, diag_best_match)
    '''
    ## Diagnostics: refine raw results into an ordered shortlist plus the best match per search address
    diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
        matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
        fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)

    ## Fuzzy search results: columns kept (in order) from the best-match diagnostics
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'property_number_match',
        'close_postcode_match',
        'house_court_name_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',
        'flat_number_search', 'flat_number_reference',
        'room_number_search', 'room_number_reference',
        'unit_number_search', 'unit_number_reference',
        'block_number_search', 'block_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address','Postcode']

    # Join results data onto the original housing list to create the full output
    search_df_cleaned_join_cols = [search_df_key_field, "full_address","postcode"]

    match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
        diag_best_match[match_results_cols], how = "left", left_on = "full_address", right_on = "search_orig_address")

    # Drop the duplicated join columns; the cleaned full_address becomes the canonical search_orig_address
    match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis = 1).rename(columns={"full_address":"search_orig_address"})

    # Join UPRN back onto the data from reference data
    joined_ref_cols = ["fulladdress", "Reference file"]
    joined_ref_cols.extend(new_join_col)

    print("joined_ref_cols: ", joined_ref_cols)

    # Keep only columns that exist in reference dataset
    joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]

    # Deduplicate reference rows on full address before merging so the left join cannot fan out
    match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)

    # Convert long keys to string to avoid data loss
    match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
    match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
    match_results_output["standardised_address"] = standardise

    match_results_output = match_results_output.sort_values(search_df_key_field, ascending = True)

    return match_results_output, diag_shortlist, diag_best_match
def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_match_limit:int, blocker_col:str, fuzzy_col:str="fuzzy_score", search_mod_address:str = "search_mod_address", resolve_tie_breaks:bool=True, no_number_fuzzy_match_limit:int=no_number_fuzzy_match_limit) -> PandasDataFrame:
    '''
    Create a shortlist of the best matches from a list of suggested matches.

    Keeps, for each search address, every candidate that attains the maximum
    fuzzy score, derives the per-component match flags (property/flat/room/
    block/unit/house-court-name), a close-postcode flag, and the overall
    full_match flag, then orders the shortlist best-first.

    Parameters:
        results_df: Candidate match rows (one per search/reference pair).
        matched_col: Column identifying the search address.
        fuzzy_match_limit: Minimum fuzzy score counted as a match.
        blocker_col: Blocking column used; "Street" disables no-number matches.
        fuzzy_col: Column holding the fuzzy score.
        search_mod_address: Column holding the standardised search address.
        resolve_tie_breaks: Break identical-score ties with a secondary scorer.
        no_number_fuzzy_match_limit: Stricter score limit for addresses that
            contain no digits.
    '''
    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)
    results_df = pd.merge(results_df, results_max_fuzzy_score, how = "left", on = matched_col)

    # BUG FIX: take an explicit copy — the .loc assignments below previously
    # targeted a boolean-filtered view of results_df, triggering
    # SettingWithCopyWarning and becoming silent no-ops under pandas
    # copy-on-write semantics.
    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])].copy()

    # Scores at or above the user limit count as fuzzy-score matches
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)

    # Addresses with no numbers in them use no_number_fuzzy_match_limit instead of fuzzy_match_limit
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False

    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis = 1)

    # Per-component equality flags between search and reference address parts
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match requires every numeric/name component to agree
    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    ### Postcodes need to be close together, so all the characters should match apart from the last two
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ","").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ","").str[:-2]

    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)

    diag_shortlist = diag_shortlist.rename(columns = {"reference_list_address":"reference_mod_address"})

    ### Dealing with tie breaks ###
    # Do a backup simple ratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer = fuzz.ratio, fuzzy_col = fuzzy_col):
            # NOTE(review): despite the "wratio" name, the scorer used here is
            # fuzz.ratio, not fuzz.WRatio — confirm which was intended.
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        # Only re-score rows that are genuinely tied on the primary criteria
        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset= [search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")

    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = ''

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([
        search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"],
        ascending = [True, False, False, False, False])

    return diag_shortlist
def refine_export_results(results_df:PandasDataFrame,
    matched_df:PandasDataFrame,
    ref_list_df:PandasDataFrame,
    matched_col="fuzzy_match_search_address",
    ref_list_col="fuzzy_match_reference_address",
    final_matched_address_col="search_address_stand",
    final_ref_address_col="ref_address_stand",
    orig_matched_address_col = "full_address",
    orig_ref_address_col = "fulladdress",
    fuzzy_match_limit=fuzzy_match_limit,
    blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the
    'matched results' according to the score limit specified by the user and
    exports results list, matched and unmatched files.

    Returns:
        (diag_shortlist, diag_best_match) — the ordered shortlist of candidate
        matches, and the single best candidate per search address.
    '''
    # Rename score column
    results_df = results_df.rename(columns = {"score":"fuzzy_score"})

    # Remove empty addresses
    # NOTE(review): this compares the address column against 0, not "" — confirm
    # that empty addresses really arrive as 0 from the upstream match step.
    results_df = results_df[results_df[matched_col] !=0 ]

    ### Join property number and flat/room number etc. onto results_df
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})

    results_df = results_df.merge(ref_list_df, how = "left", left_on = ref_list_col, right_on = "reference_list_address")

    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})

    # Shared component columns get _reference / _search suffixes here
    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))

    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'house_court_name_match',
        'property_number_match',
        'close_postcode_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',
        'flat_number_search', 'flat_number_reference',
        'room_number_search', 'room_number_reference',
        'block_number_search', 'block_number_reference',
        'unit_number_search', 'unit_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address', 'postcode','Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")

    return diag_shortlist, diag_best_match
def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
    '''
    Following the fuzzy match, join the match results back to the original
    search dataframe to create a results dataframe.

    Parameters:
        match_results_output: Match results including the 'full_match' flag;
            only fully-matched rows are joined back.
        search_df: The original search dataframe.
        search_df_key_field: Column (or "index") keying both frames.
        new_join_col: Reference-id columns (e.g. UPRN) to carry across.

    Returns:
        search_df with matched reference details joined on.
    '''
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again
    match_results_output_success = match_results_output_success.rename(columns={
        "reference_orig_address":"Reference matched address",
        "full_match":"Matched with reference address",
        'uprn':'UPRN'
    }, errors="ignore")

    ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)

    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on = search_df_key_field, how = "left", suffixes = ('', '_y'))
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols],how = "left", on = search_df_key_field, suffixes = ('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column
    if "Reference matched address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)
    if "Matched with reference address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
    if "Reference file_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
    if "UPRN_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful.
    # BUG FIX: the original list was missing commas between several names, so
    # Python's implicit string concatenation fused them into one bogus column
    # name ("prop_numberflat_numberapart_numberfirst_sec_numberroom_number")
    # and those columns were never dropped (masked by errors="ignore").
    results_for_orig_df_join = results_for_orig_df_join.drop([
        'Reference matched address_y', 'Matched with reference address_y', 'Reference file_y',
        'search_df_key_field_y', 'UPRN_y', 'index_y',
        "full_address_search", "postcode_search", "full_address_1", "full_address_2", "full_address",
        "address_stand", "property_number", "prop_number", "flat_number", "apart_number",
        "first_sec_number", "room_number"], axis = 1, errors = "ignore")

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)

    # NOTE(review): with regex=False, replace() only clears cells that are
    # exactly ".0" or "nan"; a trailing ".0" inside a longer id is untouched —
    # confirm whether stripping trailing ".0" was the intent.
    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(".0","", regex=False).replace("nan","", regex=False)

    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)

    return results_for_orig_df_join