# address_matcher/tools/fuzzy_match.py
# (Page metadata from the hosting site, preserved as comments:
#  author: seanpedrickcase — "Some code rearranged. Fixed API call paths for Linux systems",
#  commit 4300019)
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr
PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]
today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")
from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit
def string_match_array(to_match:array, choices:array,
    index_name:str, matched_name:str) -> PandasDataFrame:
    '''
    Fuzzy-match every string in `to_match` against `choices` and return the
    results as a dataframe built by `_create_frame`.

    Parameters:
        to_match: Strings to look up.
        choices: Candidate strings to match against.
        index_name: Name for the index column of the output frame.
        matched_name: Name for the matched-value column of the output frame.
    '''
    best_matches = {}
    for candidate in to_match:
        # process.extractOne returns the single closest choice with its score
        best_matches[candidate] = process.extractOne(candidate, choices)

    return _create_frame(matched_results=best_matches, index_name=index_name,
        matched_name=matched_name)
# Fuzzy match algorithm
def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:PandasSeries, pred_match_address_series:PandasSeries, fuzzy_method: str = "WRatio", match_score=95):
    '''
    Score each row's original address against its predicted address with a
    rapidfuzz scorer and flag rows that meet the match threshold.

    Adds two columns to `df` in place (and also returns it):
    `<orig col>_fuzz_score` (the fuzzy score, NaN when both strings are empty)
    and `<orig col>_fuzz_match` (True when score >= match_score, NaN when the
    score is NaN).

    Parameters:
        df: Dataframe holding both address columns.
        orig_match_address_series: Name of the original-address column.
        pred_match_address_series: Name of the predicted-address column.
        fuzzy_method: Name of a `rapidfuzz.fuzz` scorer (e.g. "WRatio").
            NOTE: the original signature annotated this as the literal
            "WRatio" with no default; it is now a proper default value, which
            is backward-compatible for all existing callers.
        match_score: Minimum score (0-100) counted as a match.
    '''
    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():
        predict_string = df[pred_match_address_series][orig_index]
        # Nothing to compare on either side -> no score
        if (orig_string == '') and (predict_string == ''):
            results.append(np.nan)
        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer=getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    # Rows with no score cannot be classified as a match either way
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df
def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
    search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
    '''
    Fuzzy-match search addresses against reference addresses, blocked by postcode.

    Matches by Series values; for example idx is post code and values address.
    The search field is reduced by comparing only reference addresses that share
    the search address's postcode. Default scorer is fuzz.token_set_ratio
    (any `rapidfuzz.fuzz` scorer name can be passed as `scorer_name`).
    Choice of ratio type seems to make a big difference. Looking at this link:
    https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    and this one:
    https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings

    Returns a dataframe with one row per scored (search, reference) pair:
    postcode_search, fuzzy_match_search_address, fuzzy_match_reference_address,
    fuzzy_score.
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
        # Score every search address against every reference address for one postcode.
        # NOTE(review): search_limit is currently unused here — the head() call below is commented out.

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
            # Create a list to store the results
            results = []
            # Iterate through the matched score matrix and store one row per pair in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))
            # Create a dataframe from the results list
            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])
            return matched_out

        try:
            if isinstance(reference_addresses, str): # reference_addresses can be a str-> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)
            else: # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)
            # Keep only the top search_limit number of results - doesn't work anymore when working with multiple results
            #matched_out = matched_out.head(search_limit)
        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out

    def apply_fuzzy_matching(postcode_match:str, search_addresses:PandasSeries, reference_addresses:PandasSeries, scorer:callable, search_limit:int)-> tuple:
        # Thin wrapper around do_one_match that degrades to an empty frame on KeyError
        try:
            matched = do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
            return matched
        except KeyError:
            matched = pd.DataFrame() #[("NA", 0)] # for _ in range(1, search_limit + 1)]
            return matched

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    # Reshape both Series into dataframes with a 'postcode_search' column and a positional index
    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0,len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0,len(reference_address_df)))

    # Apply the match functions to each address
    scorer = getattr(fuzz, scorer_name)
    results = {}
    #counter = 0
    index_list = []
    match_list = []
    search_addresses_list = []
    reference_addresses_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    # One fuzzy-match pass per unique postcode (progress reported through gradio)
    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
        postcode_match_list = [postcode_match]
        search_indexes = pd.Series()
        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            # Subset search and reference rows to this postcode only
            search_indexes = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "index"]
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]

            if isinstance(reference_addresses, str): # reference_addresses can be a str-> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Write to output lists
        match_list.extend([matched])
        index_list.extend(search_indexes.tolist())
        search_addresses_list.extend(search_addresses.tolist())
        reference_addresses_list.extend(reference_addresses.tolist())

    # Stack the per-postcode result frames into one output frame
    out_frame = pd.concat(match_list)

    return out_frame
def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_stand:PandasDataFrame, ref_df_cleaned:PandasDataFrame, ref_df_after_stand:PandasDataFrame, fuzzy_match_limit:int, search_df_cleaned:PandasDataFrame, search_df_key_field:str, new_join_col:str, standardise:bool, blocker_col:str):
    '''
    Take fuzzy match outputs, create shortlist dataframes, rearrange, return
    diagnostics and shortlist dataframes for export.

    Parameters:
        results: Raw fuzzy-match result rows.
        search_df_after_stand: Standardised search dataframe.
        ref_df_cleaned: Cleaned reference dataframe (source of join-back columns).
        ref_df_after_stand: Standardised reference dataframe.
        fuzzy_match_limit: Minimum fuzzy score counted as a match.
        search_df_cleaned: Cleaned search dataframe (source of key/full_address).
        search_df_key_field: Key column of the search dataframe.
        new_join_col: Reference-id column(s) (e.g. UPRN) to carry across.
        standardise: Whether addresses were standardised (recorded in output).
        blocker_col: Blocking column used ("Postcode" or "Street").

    Returns:
        (match_results_output, diag_shortlist, diag_best_match)
    '''
    ## Diagnostics: refine raw results into an ordered shortlist plus the best match per search address
    diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
        matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
        fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)

    ## Fuzzy search results: columns kept (in order) from the best-match diagnostics
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'property_number_match',
        'close_postcode_match',
        'house_court_name_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',
        'flat_number_search', 'flat_number_reference',
        'room_number_search', 'room_number_reference',
        'unit_number_search', 'unit_number_reference',
        'block_number_search', 'block_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address','Postcode']

    # Join results data onto the original housing list to create the full output
    search_df_cleaned_join_cols = [search_df_key_field, "full_address","postcode"]

    match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
        diag_best_match[match_results_cols], how = "left", left_on = "full_address", right_on = "search_orig_address")

    # Drop the duplicated join columns; the cleaned full_address becomes the canonical search_orig_address
    match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis = 1).rename(columns={"full_address":"search_orig_address"})

    # Join UPRN back onto the data from reference data
    joined_ref_cols = ["fulladdress", "Reference file"]
    joined_ref_cols.extend(new_join_col)

    print("joined_ref_cols: ", joined_ref_cols)

    # Keep only columns that exist in reference dataset
    joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]

    # Deduplicate reference rows on full address before merging so the left join cannot fan out
    match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)

    # Convert long keys to string to avoid data loss
    match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
    match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
    match_results_output["standardised_address"] = standardise

    match_results_output = match_results_output.sort_values(search_df_key_field, ascending = True)

    return match_results_output, diag_shortlist, diag_best_match
def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_match_limit:int, blocker_col:str, fuzzy_col:str="fuzzy_score", search_mod_address:str = "search_mod_address", resolve_tie_breaks:bool=True, no_number_fuzzy_match_limit:int=no_number_fuzzy_match_limit) -> PandasDataFrame:
    '''
    Create a shortlist of the best matches from a list of suggested matches.

    Keeps, for each search address, every candidate that attains the maximum
    fuzzy score, derives the per-component match flags (property/flat/room/
    block/unit/house-court-name), a close-postcode flag, and the overall
    full_match flag, then orders the shortlist best-first.

    Parameters:
        results_df: Candidate match rows (one per search/reference pair).
        matched_col: Column identifying the search address.
        fuzzy_match_limit: Minimum fuzzy score counted as a match.
        blocker_col: Blocking column used; "Street" disables no-number matches.
        fuzzy_col: Column holding the fuzzy score.
        search_mod_address: Column holding the standardised search address.
        resolve_tie_breaks: Break identical-score ties with a secondary scorer.
        no_number_fuzzy_match_limit: Stricter score limit for addresses that
            contain no digits.
    '''
    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)
    results_df = pd.merge(results_df, results_max_fuzzy_score, how = "left", on = matched_col)

    # BUG FIX: take an explicit copy — the .loc assignments below previously
    # targeted a boolean-filtered view of results_df, triggering
    # SettingWithCopyWarning and becoming silent no-ops under pandas
    # copy-on-write semantics.
    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])].copy()

    # Scores at or above the user limit count as fuzzy-score matches
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)

    # Addresses with no numbers in them use no_number_fuzzy_match_limit instead of fuzzy_match_limit
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False

    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis = 1)

    # Per-component equality flags between search and reference address parts
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match requires every numeric/name component to agree
    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    ### Postcodes need to be close together, so all the characters should match apart from the last two
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ","").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ","").str[:-2]

    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)

    diag_shortlist = diag_shortlist.rename(columns = {"reference_list_address":"reference_mod_address"})

    ### Dealing with tie breaks ###
    # Do a backup simple ratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer = fuzz.ratio, fuzzy_col = fuzzy_col):
            # NOTE(review): despite the "wratio" name, the scorer used here is
            # fuzz.ratio, not fuzz.WRatio — confirm which was intended.
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        # Only re-score rows that are genuinely tied on the primary criteria
        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset= [search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")

    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = ''

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([
        search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"],
        ascending = [True, False, False, False, False])

    return diag_shortlist
def refine_export_results(results_df:PandasDataFrame,
    matched_df:PandasDataFrame,
    ref_list_df:PandasDataFrame,
    matched_col="fuzzy_match_search_address",
    ref_list_col="fuzzy_match_reference_address",
    final_matched_address_col="search_address_stand",
    final_ref_address_col="ref_address_stand",
    orig_matched_address_col = "full_address",
    orig_ref_address_col = "fulladdress",
    fuzzy_match_limit=fuzzy_match_limit,
    blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the
    'matched results' according to the score limit specified by the user and
    exports results list, matched and unmatched files.

    Returns:
        (diag_shortlist, diag_best_match) — the ordered shortlist of candidate
        matches, and the single best candidate per search address.
    '''
    # Rename score column
    results_df = results_df.rename(columns = {"score":"fuzzy_score"})

    # Remove empty addresses
    # NOTE(review): this compares the address column against 0, not "" — confirm
    # that empty addresses really arrive as 0 from the upstream match step.
    results_df = results_df[results_df[matched_col] !=0 ]

    ### Join property number and flat/room number etc. onto results_df
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})

    results_df = results_df.merge(ref_list_df, how = "left", left_on = ref_list_col, right_on = "reference_list_address")

    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})

    # Shared component columns get _reference / _search suffixes here
    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))

    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'house_court_name_match',
        'property_number_match',
        'close_postcode_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',
        'flat_number_search', 'flat_number_reference',
        'room_number_search', 'room_number_reference',
        'block_number_search', 'block_number_reference',
        'unit_number_search', 'unit_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address', 'postcode','Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")

    return diag_shortlist, diag_best_match
def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
    '''
    Following the fuzzy match, join the match results back to the original
    search dataframe to create a results dataframe.

    Parameters:
        match_results_output: Match results including the 'full_match' flag;
            only fully-matched rows are joined back.
        search_df: The original search dataframe.
        search_df_key_field: Column (or "index") keying both frames.
        new_join_col: Reference-id columns (e.g. UPRN) to carry across.

    Returns:
        search_df with matched reference details joined on.
    '''
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again
    match_results_output_success = match_results_output_success.rename(columns={
        "reference_orig_address":"Reference matched address",
        "full_match":"Matched with reference address",
        'uprn':'UPRN'
    }, errors="ignore")

    ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)

    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on = search_df_key_field, how = "left", suffixes = ('', '_y'))
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols],how = "left", on = search_df_key_field, suffixes = ('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column
    if "Reference matched address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)
    if "Matched with reference address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
    if "Reference file_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
    if "UPRN_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful.
    # BUG FIX: the original list was missing commas between several names, so
    # Python's implicit string concatenation fused them into one bogus column
    # name ("prop_numberflat_numberapart_numberfirst_sec_numberroom_number")
    # and those columns were never dropped (masked by errors="ignore").
    results_for_orig_df_join = results_for_orig_df_join.drop([
        'Reference matched address_y', 'Matched with reference address_y', 'Reference file_y',
        'search_df_key_field_y', 'UPRN_y', 'index_y',
        "full_address_search", "postcode_search", "full_address_1", "full_address_2", "full_address",
        "address_stand", "property_number", "prop_number", "flat_number", "apart_number",
        "first_sec_number", "room_number"], axis = 1, errors = "ignore")

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)

    # NOTE(review): with regex=False, replace() only clears cells that are
    # exactly ".0" or "nan"; a trailing ".0" inside a longer id is untouched —
    # confirm whether stripping trailing ".0" was the intent.
    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(".0","", regex=False).replace("nan","", regex=False)

    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)

    return results_for_orig_df_join