File size: 24,757 Bytes
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4300019
 
 
 
 
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36bca81
 
 
 
dd1cbb4
 
 
 
 
 
 
 
 
 
 
4300019
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36bca81
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit

def string_match_array(to_match:array, choices:array,
                      index_name:str, matched_name:str) -> PandasDataFrame:
    """Fuzzy-match each string in to_match against choices and frame the results.

    For every name in to_match the single best candidate from choices is found
    with rapidfuzz's extractOne; the resulting mapping is handed to
    _create_frame to build the output dataframe.
    """
    best_matches = {}
    for name in to_match:
        best_matches[name] = process.extractOne(name, choices)

    return _create_frame(matched_results=best_matches, index_name=index_name,
                        matched_name=matched_name)

# Fuzzy match algorithm
def create_fuzzy_matched_col(df: pd.DataFrame, orig_match_address_series: str, pred_match_address_series: str, fuzzy_method: str = "WRatio", match_score: int = 95) -> pd.DataFrame:
    '''
    Score the similarity of two address columns row by row and add the results to df.

    Parameters:
        df: dataframe containing both address columns.
        orig_match_address_series: name of the column holding the original address.
        pred_match_address_series: name of the column holding the predicted address.
        fuzzy_method: name of a rapidfuzz.fuzz scorer (e.g. "WRatio", "ratio").
            NOTE: this was previously a bare string annotation, not a default,
            forcing callers to always pass it - it is now a real default.
        match_score: minimum score for the match flag to be True.

    Returns df with two new columns added in place:
        <orig col>_fuzz_score - the fuzzy score, NaN when both strings are empty.
        <orig col>_fuzz_match - True/False against match_score, NaN when no score.
    '''
    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():

        predict_string = df[pred_match_address_series][orig_index]

        if (orig_string == '') and (predict_string == ''):
            # Nothing to compare - leave the score blank rather than scoring 100
            results.append(np.nan)

        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer= getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    # Where no score could be computed the match flag is also unknown, not False
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df

def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
                              search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
    '''
    Fuzzy match search addresses against reference addresses, blocking on postcode.

    Both input Series are expected to be indexed by postcode with standardised
    address strings as values. For each unique postcode in the search series the
    candidate pool is restricted to reference addresses sharing that postcode,
    and every search/reference pair is scored with the rapidfuzz scorer named by
    scorer_name (default "token_set_ratio").

    Choice of ratio type seems to make a big difference. Looking at this link:
    https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    and this one: 
    https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings    

    Returns a dataframe with one row per scored pair and columns
    postcode_search, fuzzy_match_search_address, fuzzy_match_reference_address
    and fuzzy_score.

    NOTE(review): search_limit is accepted but the head(search_limit) cap is
    commented out below, so it currently has no effect - confirm intended.
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
        # Score every search address against every reference address for one
        # postcode and return the scores as a long-format dataframe.

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
            # Flatten the score matrix from process.cdist into one row per
            # (search address, reference address) pair.

            # Create a list to store the results
            results = []

            # Iterate through the matched dataframe and store results in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))

            # Create a dataframe from the results list
            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])

            return matched_out

        try:
            if isinstance(reference_addresses, str):  # reference_addresses can be a str-> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)

                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            else:  # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1) 

                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)

            # Keep only the top search_limit number of results - doesn't work anymore when working with multiple results
            #matched_out = matched_out.head(search_limit)

        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out
 
    def apply_fuzzy_matching(postcode_match:str, search_addresses:PandasSeries, reference_addresses:PandasSeries, scorer:callable, search_limit:int)-> tuple:
        # Thin wrapper around do_one_match that returns an empty dataframe on
        # KeyError instead of raising.
        try:
            matched = do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
            return matched
        except KeyError:
            matched = pd.DataFrame() #[("NA", 0)] # for _ in range(1, search_limit + 1)]
            return matched

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    # Move the postcode index into a regular column and add a positional index
    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0,len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0,len(reference_address_df)))

    
    # Apply the match functions to each address
    scorer = getattr(fuzz, scorer_name)                  
    results = {}
    #counter = 0

    index_list = []
    match_list = []
    search_addresses_list = []
    reference_addresses_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):

        postcode_match_list = [postcode_match]
        # Defaults in case the lookups below fail
        search_indexes = pd.Series()
        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            # Restrict both sides to addresses sharing the current postcode
            search_indexes = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "index"]
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]

            if isinstance(reference_addresses, str):  # reference_addresses can be a str-> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Write to output lists
        match_list.extend([matched])
        index_list.extend(search_indexes.tolist())
        search_addresses_list.extend(search_addresses.tolist())
        reference_addresses_list.extend(reference_addresses.tolist())

    # Combine the per-postcode score frames into a single output dataframe
    out_frame = pd.concat(match_list)

    return out_frame

def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_stand:PandasDataFrame, ref_df_cleaned:PandasDataFrame, ref_df_after_stand:PandasDataFrame, fuzzy_match_limit:int, search_df_cleaned:PandasDataFrame, search_df_key_field:str, new_join_col:str, standardise:bool, blocker_col:str):

        '''
        Take fuzzy match outputs, create shortlist dataframes, rearrange, return diagnostics and shortlist dataframes for export.

        Returns a tuple (match_results_output, diag_shortlist, diag_best_match):
        the per-search-record results table joined back onto the cleaned search
        data and reference join columns, plus the two diagnostic frames from
        refine_export_results.
        '''

        ## Diagnostics

        diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
                                      matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
                                      fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
        
        ## Fuzzy search results

        # Columns carried through from the best-match diagnostics into the output
        match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'property_number_match',
        'close_postcode_match',
        'house_court_name_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',  
        'flat_number_search', 'flat_number_reference', 
        'room_number_search', 'room_number_reference',
        'unit_number_search', 'unit_number_reference',
        'block_number_search', 'block_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address','Postcode']

        # Join results data onto the original housing list to create the full output
        search_df_cleaned_join_cols = [search_df_key_field, "full_address","postcode"]

        match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
            diag_best_match[match_results_cols], how = "left", left_on = "full_address", right_on = "search_orig_address")
        
        # Keep the search side's full_address as the canonical search_orig_address
        match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis = 1).rename(columns={"full_address":"search_orig_address"})
        
        # Join UPRN back onto the data from reference data
        joined_ref_cols = ["fulladdress", "Reference file"]
        joined_ref_cols.extend(new_join_col)

        print("joined_ref_cols: ", joined_ref_cols)
        # Keep only columns that exist in reference dataset
        joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]

        match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)

        # Convert long keys to string to avoid data loss
        match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
        match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
        # Record whether the standardised pipeline was used for this run
        match_results_output["standardised_address"] = standardise
    
        match_results_output = match_results_output.sort_values(search_df_key_field, ascending = True)
                
        return match_results_output, diag_shortlist, diag_best_match

def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_match_limit:int, blocker_col:str, fuzzy_col:str="fuzzy_score", search_mod_address:str = "search_mod_address", resolve_tie_breaks:bool=True, no_number_fuzzy_match_limit:int=no_number_fuzzy_match_limit) -> PandasDataFrame:
    '''
    Create a shortlist of the best matches from a list of suggested matches.

    For each search address, keeps every candidate whose fuzzy score equals the
    highest score seen for that address, derives component match flags
    (property/flat/room/block/unit number, house/court name, postcode) and an
    overall full_match flag, then sorts best-first. When resolve_tie_breaks is
    True, candidates with identical fuzzy scores are re-scored with fuzz.ratio
    ("wratio_score") to break ties.
    '''

    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)

    results_df = pd.merge(results_df, results_max_fuzzy_score, how = "left", on = matched_col)

    # Take an explicit copy: the original sliced a view of results_df and then
    # wrote to it with .loc, which raises SettingWithCopyWarning and can
    # silently lose the writes.
    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])].copy()

    # Fuzzy match limit for records with no numbers in it is 0.95 or the provided fuzzy_match_limit, whichever is higher
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)


    # Replace fuzzy_score_match values for addresses with no numbers in them
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False
                            
    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis = 1)

    # Following considers full matches to be those that match on property number and flat number, and the postcode is relatively close.
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match requires every numeric/name component to agree
    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    
    ### Postcodes need to be close together, so all the characters should match apart from the last two 
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ","").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ","").str[:-2]
        
    
    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)
    
    diag_shortlist = diag_shortlist.rename(columns = {"reference_list_address":"reference_mod_address"})

    ### Dealing with tie breaks ##
    # Do a backup simple ratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer = fuzz.ratio, fuzzy_col = fuzzy_col):
            # Re-score one candidate pair with the simple ratio scorer
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset= [search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
                                
            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")

    # Guarantee the column exists even when no tie-breaking was needed
    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = '' 

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([
        search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"],
        ascending = [True, False, False, False, False])          

    return diag_shortlist

def refine_export_results(results_df:PandasDataFrame, 
                           matched_df:PandasDataFrame,
                           ref_list_df:PandasDataFrame,
                           matched_col="fuzzy_match_search_address",
                           ref_list_col="fuzzy_match_reference_address",
                           final_matched_address_col="search_address_stand",
                           final_ref_address_col="ref_address_stand",
                           orig_matched_address_col = "full_address",
                           orig_ref_address_col = "fulladdress",
                           fuzzy_match_limit=fuzzy_match_limit,
                           blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the 'matched results' according
    to the score limit specified by the user and exports results list, matched and unmatched files.

    Returns a tuple (diag_shortlist, diag_best_match): the full shortlist of
    best-scoring candidates per search address, and the single best candidate
    per search address (first row after score-descending ordering).
    '''
       
    # Rename score column produced by the fuzzy matcher
    results_df = results_df.rename(columns = {"score":"fuzzy_score"})
          
    # Remove empty addresses
    results_df = results_df[results_df[matched_col] !=0 ]

    ### Join property number and flat/room number etc. onto results_df
    # Work on a copy: the original assigned "ref_index" directly into the
    # caller's dataframe, mutating it as a side effect.
    ref_list_df = ref_list_df.copy()
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})

    results_df = results_df.merge(ref_list_df, how = "left", left_on = ref_list_col, right_on = "reference_list_address")


    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
    
    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
    
    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
        'full_match',
        'full_number_match',
        'flat_number_match',
        'room_number_match',
        'block_number_match',
        'unit_number_match',
        'house_court_name_match',
        'property_number_match',
        'close_postcode_match',
        'fuzzy_score_match',
        "fuzzy_score",
        "wratio_score",
        'property_number_search', 'property_number_reference',  
        'flat_number_search', 'flat_number_reference', 
        'room_number_search', 'room_number_reference',
        'block_number_search', 'block_number_reference',
        'unit_number_search', 'unit_number_reference',
        'house_court_name_search', 'house_court_name_reference',
        "search_mod_address", 'reference_mod_address', 'postcode','Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
   
    return diag_shortlist, diag_best_match

def join_to_orig_df(match_results_output: pd.DataFrame, search_df: pd.DataFrame, search_df_key_field: str, new_join_col: List[str]) -> pd.DataFrame:
    ''' 
    Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.

    Only successful matches (full_match == True) are joined on. Where the join
    columns already exist in search_df, the original values are kept and gaps
    are filled from the newly-joined ('_y' suffixed) columns.
    '''
    # Keep only rows judged a full match
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again 

    match_results_output_success = match_results_output_success.rename(columns={
                                        "reference_orig_address":"Reference matched address",
                                        "full_match":"Matched with reference address",
                                        'uprn':'UPRN'                                                                             
                                     }, errors="ignore")
    
    ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)
 
    
    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on = search_df_key_field, how = "left", suffixes = ('', '_y'))  
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols],how = "left", on = search_df_key_field, suffixes = ('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column

    if "Reference matched address_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)

    if "Matched with reference address_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))

    if "Reference file_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)

    if "UPRN_y" in results_for_orig_df_join.columns: 
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful. NOTE: the original list was missing
    # commas between several names, which silently concatenated them into one
    # never-matching column name, so those working columns were never dropped -
    # fixed here.
    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2", "full_address",
                                   "address_stand", "property_number", "prop_number", "flat_number", "apart_number", "first_sec_number", "room_number"], axis = 1, errors = "ignore")

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)   

    # Strip the trailing ".0" that float-cast keys pick up, and blank out "nan".
    # (The previous non-regex replace only matched cells that were exactly ".0".)
    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(r"\.0$", "", regex=True).replace("nan","", regex=False)
    
    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
  
    
    return results_for_orig_df_join