File size: 37,473 Bytes
dd1cbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
import pandas as pd
import numpy as np
import re
from typing import Type, Dict, List, Tuple
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

# # Standardisation functions

def standardise_wrapper_func(search_df_cleaned:PandasDataFrame, ref_df_cleaned:PandasDataFrame,\
                               standardise = False, filter_to_lambeth_pcodes = True, match_task = "fuzzy"):
    '''
    Initial standardisation of search and reference dataframes before passing addresses and postcodes to the main standardisation function
    '''
    
    ## Search df - lower case addresses, replace spaces in postcode
    search_df_cleaned["full_address_search"] = search_df_cleaned["full_address"].str.lower().str.strip()
    search_df_cleaned['postcode_search'] = search_df_cleaned['postcode'].str.lower().str.strip().str.replace("\s+", "", regex=True)

    # Filter out records where 'Excluded from search' is not a postal address by making the postcode blank
    search_df_cleaned.loc[search_df_cleaned['Excluded from search'] == "Excluded - non-postal address", 'postcode_search'] = ""

    # Remove nulls from ref_df postcode
    ref_df_cleaned = ref_df_cleaned[ref_df_cleaned['Postcode'].notna()]
    
    ref_df_cleaned["full_address_search"] = ref_df_cleaned["fulladdress"].str.lower().str.strip()
    ref_df_cleaned['postcode_search'] = ref_df_cleaned['Postcode'].str.lower().str.strip().str.replace("\s+", "", regex=True)
    
    # Block only on first 5 characters of postcode string - Doesn't give more matches and makes everything a bit slower
    #search_df_cleaned['postcode_search'] = search_df_cleaned['postcode_search'].str[:-1]
    #ref_df_cleaned['postcode_search'] = ref_df_cleaned['postcode_search'].str[:-1]


    ### Use standardise function

    ### Remove 'non-housing' places from the list - not included as want to check all
    #search_df_after_stand = remove_non_housing(search_df_cleaned, 'full_address_search')
    search_df_after_stand = standardise_address(search_df_cleaned, "full_address_search", "search_address_stand", standardise = standardise, out_london = True)

    ## Standardise ref_df addresses

    if match_task == "fuzzy":
        ref_df_after_stand = standardise_address(ref_df_cleaned, "full_address_search", "ref_address_stand", standardise = standardise, out_london = True)
    else:
        # For the neural net matching, I didn't find that standardising the reference addresses helped at all, in fact it made things worse. So reference addresses are not standardised at this step.
        ref_df_after_stand = standardise_address(ref_df_cleaned, "full_address_search", "ref_address_stand", standardise = False, out_london = True)


    return search_df_after_stand, ref_df_after_stand#, search_df_after_stand_series, ref_df_after_stand_series

def standardise_address(df:PandasDataFrame, col:str, out_col:str, standardise:bool = True, out_london = True) -> PandasDataFrame:
    
    ''' 
    This function takes a 'full address' column and then standardises so that extraneous
    information is removed (i.e. postcodes & London, as this algorithm is used for London
    addresses only), and so that room/flat/property numbers can be accurately extracted. The
    standardised addresses can then be used for the fuzzy matching functions later in this 
    notebook.
    
    The function does the following:
    
    - Removes the post code and 'london' (if not dealing with addresses outside of london)
      from the address to reduce the text the algorithm has to search.
      Postcode removal uses regex to extract a UK postcode.
      
    - Remove the word 'flat' or 'apartment' from an address that has only one number in it
    
    - Add 'flat' to the start of any address that contains 'house' or 'court' (which are generally housing association buildings)
      This is because in the housing list, these addresses never have the word flat in front of them

    - Replace any addresses that don't have a space between the comma and the next word or double spaces
    
    - Replace 'number / number' and 'number-number' with 'number' (the first number in pair)
    
    - Add 'flat' to the start of addresses that include ground floor/first floor etc. flat 
      in the text. Replace with flat a,b,c etc.
    
    - Pull out property, flat, and room numbers from the address text
    
    - Return the data frame with the new columns included
    
    '''
   
    df_copy = df.copy(deep=True)
    
    # Trim the address to remove leading and tailing spaces
    df_copy[col] = df_copy[col].str.strip()
    
    ''' Remove the post code and 'london' from the address to reduce the text the algorithm has to search
    Using a regex to extract a UK postcode. I got the regex from the following. Need to replace their \b in the solution with \\b
    https://stackoverflow.com/questions/51828712/r-regular-expression-for-extracting-uk-postcode-from-an-address-is-not-ordered
        
    The following will pick up whole postcodes, postcodes with just the first part, and postcodes with the first
    part and the first number of the second half
    '''
    
    
    df_copy['add_no_pcode'] = remove_postcode(df_copy, col)
    
    if out_london == False:
        df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.replace("london","").str.replace(r",,|, ,","", regex=True)
    
    # If the user wants to standardise the address
    if standardise:
        
        df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.lower()

        # If there are dates at the start of the address, change this
        df_copy['add_no_pcode'] = replace_mistaken_dates(df_copy, 'add_no_pcode')

        # Replace flat name variations with flat, abbreviations with full name of item (e.g. rd to road)
        df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.replace(r"\brd\b","road", regex=True).\
                                                str.replace(r"\bst\b","street", regex=True).\
                                                str.replace(r"\bave\b","avenue", regex=True).\
                                                str.replace("'", "", regex=False).\
                                                str.replace(r"\bat\b ", " ",regex=True).\
                                                str.replace("apartment", "flat",regex=False).\
                                                str.replace("studio flat", "flat",regex=False).\
                                                str.replace("cluster flat", "flats",regex=False).\
                                                str.replace(r"\bflr\b", "floor", regex=True).\
                                                str.replace(r"\bflrs\b", "floors", regex=True).\
                                                str.replace(r"\blwr\b", "lower", regex=True).\
                                                str.replace(r"\bgnd\b", "ground", regex=True).\
                                                str.replace(r"\blgnd\b", "lower ground", regex=True).\
                                                str.replace(r"\bgrd\b", "ground", regex=True).\
                                                str.replace(r"\bmais\b", "flat", regex=True).\
                                                str.replace(r"\bmaisonette\b", "flat", regex=True).\
                                                str.replace(r"\bpt\b", "penthouse", regex=True).\
                                                str.replace(r"\bbst\b","basement", regex=True).\
                                                str.replace(r"\bbsmt\b","basement", regex=True)
        
        df_copy["add_no_pcode_house"] = move_flat_house_court(df_copy)
    
        # Replace any addresses that don't have a space between the comma and the next word. and double spaces # df_copy['add_no_pcode_house']
        df_copy['add_no_pcode_house_comma'] = df_copy['add_no_pcode_house'].str.replace(r',(\w)', r', \1', regex=True).str.replace('  ', ' ', regex=False)

        # Replace number / number and number-number with number
        df_copy['add_no_pcode_house_comma_no'] = df_copy['add_no_pcode_house_comma'].str.replace(r'(\d+)\/(\d+)', r'\1', regex=True\
                        ).str.replace(r'(\d+)-(\d+)', r'\1', regex=True\
                        ).str.replace(r'(\d+) - (\d+)', r'\1', regex=True)

        # Add 'flat' to the start of addresses that include ground/first/second etc. floor flat in the text
        df_copy['floor_replacement'] = replace_floor_flat(df_copy, 'add_no_pcode_house_comma_no')
        df_copy['flat_added_to_start_addresses_begin_letter'] = add_flat_addresses_start_with_letter(df_copy, 'floor_replacement')

        df_copy[out_col] = merge_series(df_copy['add_no_pcode_house_comma_no'], df_copy['flat_added_to_start_addresses_begin_letter'])

        # Write stuff back to the original df
        df[out_col] = df_copy[out_col]
    
    else:
        df_copy[out_col] = df_copy['add_no_pcode']
        df[out_col] = df_copy['add_no_pcode']

    ## POST STANDARDISATION CLEANING AND INFORMATION EXTRACTION  
    # Remove trailing spaces
    df[out_col] = df[out_col].str.strip()
    
    # Pull out property, flat, and room numbers from the address text
    df['property_number'] = extract_prop_no(df_copy, out_col)

    # Extract flat, apartment numbers
    df = extract_flat_and_other_no(df, out_col)

    df['flat_number'] = merge_series(df['flat_number'], df['apart_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['prop_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['first_sec_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['first_letter_flat_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['first_letter_no_more_numbers'])
    
    # Extract room numbers
    df['room_number'] = extract_room_no(df, out_col)

    # Extract block and unit names
    df = extract_block_and_unit_name(df, out_col)

    # Extract house or court name
    df['house_court_name'] = extract_house_or_court_name(df, out_col)

    return df

def move_flat_house_court(df:PandasDataFrame):
            ''' Remove 'flat' from any address that contains 'house' or 'court'
            From the df address, remove the word 'flat' from any address that contains the word 'house' or 'court'
            This is because in the housing list, these addresses never have the word flat in front of them
            '''
            
            # Remove the word flat or apartment from addresses that have only one number in it. 'Flat' will be re-added later to relevant addresses 
            # that need it (replace_floor_flat)
            df['flat_removed'] = remove_flat_one_number_address(df, 'add_no_pcode')
        
        
            
            remove_flat_house = df['flat_removed'].str.lower().str.contains(r"\bhouse\b")#(?=\bhouse\b)(?!.*house road)")
            remove_flat_court = df['flat_removed'].str.lower().str.contains(r"\bcourt\b")#(?=\bcourt\b)(?!.*court road)")
            remove_flat_terrace = df['flat_removed'].str.lower().str.contains(r"\bterrace\b")#(?=\bterrace\b)(?!.*terrace road)")
            remove_flat_house_or_court = (remove_flat_house | remove_flat_court | remove_flat_terrace == 1)

            df['remove_flat_house_or_court'] = remove_flat_house_or_court

            # Assuming 'df' is your DataFrame
            df = df[~df.index.duplicated(keep='first')]

            df['house_court_replacement'] = "flat " + df.loc[df['remove_flat_house_or_court'] == True, 'flat_removed'].str.replace(r"\bflat\b","", regex=True).str.strip().map(str)

            #df["add_no_pcode_house"] = merge_columns(df, "add_no_pcode_house", 'flat_removed', "house_court_replacement")

            #merge_columns(df, "new_col", col1, 'letter_after_number')
            df["add_no_pcode_house"] = merge_series(df['flat_removed'], df["house_court_replacement"])

            return df["add_no_pcode_house"]

def extract_street_name(address:str) -> str:
    """
    Extracts the street name from the given address.

    Args:
        address (str): The input address string.

    Returns:
        str: The extracted street name, or an empty string if no match is found.

    Examples:
        >>> address1 = "1 Ash Park Road SE54 3HB"
        >>> extract_street_name(address1)
        'Ash Park Road'

        >>> address2 = "Flat 14 1 Ash Park Road SE54 3HB"
        >>> extract_street_name(address2)
        'Ash Park Road'

        >>> address3 = "123 Main Blvd"
        >>> extract_street_name(address3)
        'Main Blvd'

        >>> address4 = "456 Maple AvEnUe"
        >>> extract_street_name(address4)
        'Maple AvEnUe'

        >>> address5 = "789 Oak Street"
        >>> extract_street_name(address5)
        'Oak Street'
    """
    
  
    
    street_types = [
        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
        'Alley', 'Arcade','Avenue', 'Ave','Bay','Bend','Brae','Byway','Close','Corner','Cove',
        'Crescent', 'Cres','Cul-de-sac','Dell','Drive', 'Dr','Esplanade','Glen','Green','Grove','Heights', 'Hts',
        'Mews','Parade','Path','Piazza','Promenade','Quay','Ridge','Row','Terrace', 'Ter','Track','Trail','View','Villas',
        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
    ]

    # Dynamically construct the regex pattern with all possible street types
    street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)

    # The overall regex pattern to capture the street name
    pattern = rf'(?:\d+\s+|\w+\s+\d+\s+|.*\d+[a-z]+\s+|.*\d+\s+)*(?P<street_name>[\w\s]+(?:{street_types_pattern}))'

    def replace_postcode(address):
        pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$'
        return re.sub(pattern, "", address)

    
    modified_address = replace_postcode(address.upper())
    #print(modified_address)
    #print(address)
       
    # Perform a case-insensitive search
    match = re.search(pattern, modified_address, re.IGNORECASE)

    if match:
        street_name = match.group('street_name')
        return street_name.strip()
    else:
        return ""

def remove_flat_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:

    '''
    If there is only one number in the address, and there is no letter after the number,
    remove the word flat from the address
    '''
    
    df['contains_letter_after_number'] = df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)", regex = True)
    df['contains_single_letter_before_number'] = df[col1].str.lower().str.contains(r'\b[A-Za-z]\b[^\d]* \d', regex = True)
    df['two_numbers_in_address'] =  df[col1].str.lower().str.contains(r"(?:\d+.*?)[^a-zA-Z0-9_].*?\d+", regex = True)
    df['contains_apartment'] = df[col1].str.lower().str.contains(r"\bapartment\b \w+|\bapartments\b \w+", "", regex = True)
    df['contains_flat'] = df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+", "", regex = True)
    df['contains_room'] = df[col1].str.lower().str.contains(r"\broom\b \w+|\brooms\b \w+", "", regex = True)
      
    
    df['selected_rows'] = (df['contains_letter_after_number'] == False) &\
                          (df['two_numbers_in_address'] == False) &\
                            (df['contains_single_letter_before_number'] == False) &\
                             ((df['contains_flat'] == True) |\
                             (df['contains_apartment'] == True) |\
                             (df['contains_room'] == True))
        
    df['one_number_no_flat'] =  df[df['selected_rows'] == True][col1]
    df['one_number_no_flat'] =  df['one_number_no_flat'].str.replace(r"(\bapartment\b)|(\bapartments\b)", "", regex=True).str.replace(r"(\bflat\b)|(\bflats\b)", "", regex=True).str.replace(r"(\broom\b)|(\brooms\b)", "", regex=True)

    df["new_col"] = merge_series(df[col1], df["one_number_no_flat"])
    
    return df['new_col']

def add_flat_addresses_start_with_letter(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Add the word flat to addresses that start with a letter.
    '''
    
    df['contains_single_letter_at_start_before_number'] = df[col1].str.lower().str.contains(r'^\b[A-Za-z]\b[^\d]* \d', regex = True)

    df['selected_rows'] = (df['contains_single_letter_at_start_before_number'] == True)
    df['flat_added_to_string_start'] =  "flat " + df[df['selected_rows'] == True][col1]
    
    #merge_columns(df, "new_col", col1, 'flat_added_to_string_start')
    df["new_col"] = merge_series(df[col1], df['flat_added_to_string_start'])
    
    
    return df['new_col']

def extract_letter_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    This function looks for addresses that have a letter after a number, but ONLY one number
    in the string, and doesn't already have a flat, apartment, or room number. 
        
    It then extracts this letter and returns this.
    
    This is for addresses such as '2b sycamore road', changes it to
    flat b 2 sycamore road so that 'b' is selected as the flat number

    
    '''
    
    df['contains_no_numbers_without_letter'] = df[col1].str.lower().str.contains(r"^(?:(?!\d+ ).)*$")
    df['contains_letter_after_number'] = df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)")      
    df['contains_apartment'] = df[col1].str.lower().str.contains(r"\bapartment\b \w+|\bapartments\b \w+", "")
    df['contains_flat'] = df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+", "")
    df['contains_room'] = df[col1].str.lower().str.contains(r"\broom\b \w+|\brooms\b \w+", "")
        
    df['selected_rows'] = (df['contains_no_numbers_without_letter'] == True) &\
                             (df['contains_letter_after_number'] == True) &\
                             (df['contains_flat'] == False) &\
                             (df['contains_apartment'] == False) &\
                             (df['contains_room'] == False)
            
    df['extract_letter'] =  df[(df['selected_rows'] == True)\
                                  ][col1].str.extract(r"\d+([a-z]|[A-Z])")
    
    df['extract_number'] =  df[(df['selected_rows'] == True)\
                                  ][col1].str.extract(r"(\d+)[a-z]|[A-Z]")
    

    df['letter_after_number'] = "flat " +\
                                df[(df['selected_rows'] == True)\
                                  ]['extract_letter'] +\
                                " " +\
                                df[(df['selected_rows'] == True)\
                                  ]['extract_number'] +\
                                " " +\
                                df[(df['selected_rows'])\
                                  ][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\d+([a-z]|[A-Z])","", regex=True).map(str)

    #merge_columns(df, "new_col", col1, 'letter_after_number')
    df["new_col"] = merge_series(df[col1], df['letter_after_number'])
    
    return df['new_col']

# def extract_letter_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
#     '''
#     This function extracts a letter after a single number in an address, excluding cases with existing flat, apartment, or room numbers.
#     It transforms addresses like '2b sycamore road' to 'flat b 2 sycamore road' to designate 'b' as the flat number.
#     '''
    
#     df['selected_rows'] = (df[col1].str.lower().str.contains(r"^(?:(?!\d+ ).)*$") & \
#                            df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)") & \
#                            ~df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+|\bapartment\b \w+|\bapartments\b \w+|\broom\b \w+|\brooms\b \w+"))
    
#     df['extract_letter'] = df.loc[df['selected_rows'], col1].str.extract(r"\d+([a-z]|[A-Z])")
#     df['extract_number'] = df.loc[df['selected_rows'], col1].str.extract(r"(\d+)[a-z]|[A-Z]")
    
#     df['letter_after_number'] = "flat " + df['extract_letter'] + " " + df['extract_number'] + " " + \
#                                 df.loc[df['selected_rows'], col1].str.replace(r"\bflat\b", "", regex=True).str.replace(r"\d+([a-z]|[A-Z])", "", regex=True).map(str)
    
#     df["new_col"] = df[col1].copy()
#     df.loc[df['selected_rows'], "new_col"] = df['letter_after_number']
    
#     return df['new_col']

def replace_floor_flat(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    ''' In references to basement, ground floor, first floor, second floor, and top floor
    flats, this function moves the word 'flat' to the front of the address. This is so that the
    following word (e.g. basement, ground floor) is recognised as the flat number in the 'extract_flat_and_other_no' function
    '''
    
    df['letter_after_number'] = extract_letter_one_number_address(df, col1)
       
   
    df['basement'] = "flat basement" + df[df[col1].str.lower().str.contains(r"basement"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement\b","", regex=True).map(str)
    

    df['ground_floor'] = "flat a " + df[df[col1].str.lower().str.contains(r"\bground floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground floor\b","", regex=True).map(str)

    df['first_floor'] = "flat b " + df[df[col1].str.lower().str.contains(r"\bfirst floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bfirst floor\b","", regex=True).map(str)

    df['ground_and_first_floor'] = "flat ab " + df[df[col1].str.lower().str.contains(r"\bground and first floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground and first floor\b","", regex=True).map(str)

    df['basement_ground_and_first_floor'] = "flat basementab " + df[df[col1].str.lower().str.contains(r"\bbasement ground and first floors\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement and ground and first floors\b","", regex=True).map(str)

    df['basement_ground_and_first_floor2'] = "flat basementab " + df[df[col1].str.lower().str.contains(r"\bbasement ground and first floors\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement ground and first floors\b","", regex=True).map(str)

    df['second_floor'] = "flat c " + df[df[col1].str.lower().str.contains(r"\bsecond floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bsecond floor\b","", regex=True).map(str)

    df['first_and_second_floor'] = "flat bc " + df[df[col1].str.lower().str.contains(r"\bfirst and second floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bfirst and second floor\b","", regex=True).map(str)

    df['first1_floor'] = "flat b " + df[df[col1].str.lower().str.contains(r"\b1st floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b1st floor\b","", regex=True).map(str)

    df['second2_floor'] = "flat c " + df[df[col1].str.lower().str.contains(r"\b2nd floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b2nd floor\b","", regex=True).map(str)

    df['ground_first_second_floor'] = "flat abc " + df[df[col1].str.lower().str.contains(r"\bground and first and second floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground and first and second floor\b","", regex=True).map(str)

    df['third_floor'] = "flat d " + df[df[col1].str.lower().str.contains(r"\bthird floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bthird floor\b","", regex=True).map(str)

    df['third3_floor'] = "flat d " + df[df[col1].str.lower().str.contains(r"\b3rd floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b3rd floor\b","", regex=True).map(str)   

    df['top_floor'] = "flat top " + df[df[col1].str.lower().str.contains(r"\btop floor\b"\
                                )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\btop floor\b","", regex=True).map(str)
    
    #merge_columns(df, "new_col", col1, 'letter_after_number')
    df["new_col"] = merge_series(df[col1], df['letter_after_number'])
    df["new_col"] = merge_series(df["new_col"], df['basement'])
    df["new_col"] = merge_series(df["new_col"], df['ground_floor'])
    df["new_col"] = merge_series(df["new_col"], df['first_floor'])
    df["new_col"] = merge_series(df["new_col"], df['first1_floor'])
    df["new_col"] = merge_series(df["new_col"], df['ground_and_first_floor'])
    df["new_col"] = merge_series(df["new_col"], df['basement_ground_and_first_floor'])
    df["new_col"] = merge_series(df["new_col"], df['basement_ground_and_first_floor2'])
    df["new_col"] = merge_series(df["new_col"], df['second_floor'])
    df["new_col"] = merge_series(df["new_col"], df['second2_floor'])
    df["new_col"] = merge_series(df["new_col"], df['first_and_second_floor'])
    df["new_col"] = merge_series(df["new_col"], df['ground_first_second_floor'])
    df["new_col"] = merge_series(df["new_col"], df['third_floor'])
    df["new_col"] = merge_series(df["new_col"], df['third3_floor'])
    df["new_col"] = merge_series(df["new_col"], df['top_floor'])
       
    return df['new_col']

# def replace_floor_flat(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
#     '''Moves the word 'flat' to the front of addresses with floor references like basement, ground floor, etc.'''
    
#     floor_mapping = {
#         'basement': 'basement',
#         'ground floor': 'a',
#         'first floor': 'b',
#         'ground and first floor': 'ab',
#         'basement ground and first floors': 'basementab',
#         'second floor': 'c',
#         'first and second floor': 'bc',
#         '1st floor': 'b',
#         '2nd floor': 'c',
#         'ground and first and second floor': 'abc',
#         'third floor': 'd',
#         '3rd floor': 'd',
#         'top floor': 'top'
#     }
    
#     for key, value in floor_mapping.items():
#         df[key] = f"flat {value} " + df[df[col1].str.lower().str.contains(fr"\b{key}\b")][col1].str.replace(r"\bflat\b", "", regex=True).str.replace(fr"\b{key}\b", "", regex=True).map(str)
    
#     df["new_col"] = df[col1].copy()
    
#     for key in floor_mapping.keys():
#         df["new_col"] = merge_series(df["new_col"], df[key])
    
#     return df["new_col"]


def remove_non_housing(df:PandasDataFrame, col1:PandasSeries) -> PandasDataFrame:
    '''
    Remove items from the housing list that are not housing. Includes addresses including
    the text 'parking', 'garage', 'store', 'visitor bay', 'visitors room', and 'bike rack',
    'yard', 'workshop'
    '''
    df_copy = df.copy()[~df[col1].str.lower().str.contains(\
    r"parking|garage|\bstore\b|\bstores\b|\bvisitor bay\b|visitors room|\bbike rack\b|\byard\b|\bworkshop\b")]
                                                                 
    return df_copy

def extract_prop_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract property number from an address. Remove flat/apartment/room numbers, 
    then extract the last number/number + letter in the string.
    '''
    try:
        prop_no = df[col1].str.replace(r"(^\bapartment\b \w+)|(^\bapartments\b \w+)", "", regex=True\
                                  ).str.replace(r"(^\bflat\b \w+)|(^\bflats\b \w+)", "", regex=True\
                                               ).str.replace(r"(^\broom\b \w+)|(^\brooms\b \w+)", "", regex=True\
                                                            ).str.replace(",", "", regex=True\
                                                              ).str.extract(r"(\d+\w+|\d+)(?!.*\d+)") #"(\d+\w+|\d+)(?!.*\d+)" 
    except:
        prop_no = np.nan
        
    return prop_no

def extract_room_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract room number from an address. Find rows where the address contains 'room', then extract
    the next word after 'room' in the string.
    '''
    try:
        room_no = df[df[col1].str.lower().str.contains(r"\broom\b|\brooms\b",regex=True\
                                )][col1].str.replace("no.","").str.extract(r'room. (\w+)',regex=True\
                                                        ).rename(columns = {0:"room_number"})
    except:
        room_no = np.nan
        
    return room_no

def extract_flat_and_other_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract flat number from an address. 
    It looks for letters after a property number IF THERE ARE NO MORE NUMBERS IN THE STRING,
    the words following the words 'flat' or 'apartment', or
    the last regex selects all characters in a word containing a digit if there are two numbers in the address
    # ^\d+([a-z]|[A-Z])
    '''  
    
    #  the regex  essentially matches strings that satisfy any of the following conditions:

    # Start with a number followed by a single letter (either lowercase or uppercase) and not followed by any other number.
    # Contain the word "flat" or "apartment".
    # Start with a number, followed by any characters that are not alphanumeric (denoted by [^a-zA-Z0-9_]), and then another number.

    replaced_series = df[df[col1].str.lower().str.replace(r"^\bflats\b","flat", regex=True).\
                         str.contains(\
            r"^\d+([a-z]|[A-Z])(?!.*\d+)|^([a-z] |[A-Z] )(?!.*\d+)|\bflat\b|\bapartment\b|(\d+.*?)[^a-zA-Z0-9_].*?\d+")][col1].str.replace("no.","", regex=True)
    
    extracted_series = replaced_series.str.extract(r'^\d+([a-z]|[A-Z])(?!.*\d+)')[0]

    extracted_series = extracted_series[~extracted_series.index.duplicated()]
    df = df[~df.index.duplicated(keep='first')]

    df["prop_number"] = extracted_series

    extracted_series = replaced_series.str.extract(r'(?i)(?:flat|flats) (\w+)')
    if 1 in  extracted_series.columns:  
        df["flat_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else: 
        df["flat_number"] = extracted_series[0]

    extracted_series = replaced_series.str.extract(r'(?i)(?:apartment|apartments) (\w+)')
    if 1 in  extracted_series.columns:  
        df["apart_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else: 
        df["apart_number"] = extracted_series[0]
    
    df["first_sec_number"] = replaced_series.str.extract(r'(\d+.*?)[^a-zA-Z0-9_].*?\d+')
    df["first_letter_flat_number"] = replaced_series.str.extract(r'\b([A-Za-z])\b[^\d]* \d')
    df["first_letter_no_more_numbers"] = replaced_series.str.extract(r'^([a-z] |[A-Z] )(?!.*\d+)')
        
    return df

def extract_house_or_court_name(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract house or court name. Extended to include estate, buildings, and mansions
    '''
    extracted_series = df[col1].str.extract(r"(\w+)\s+(house|court|estate|buildings|mansions)")
    if 1 in  extracted_series.columns:  
        df["house_court_name"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else: 
        df["house_court_name"] = extracted_series[0]

    return df["house_court_name"]

def extract_block_and_unit_name(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract house or court name. Extended to include estate, buildings, and mansions
    '''

    extracted_series = df[col1].str.extract(r'(?i)(?:block|blocks) (\w+)')
    if 1 in  extracted_series.columns:  
        df["block_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else: 
        df["block_number"] = extracted_series[0]

    extracted_series = df[col1].str.extract(r'(?i)(?:unit|units) (\w+)')
    if 1 in  extracted_series.columns:  
        df["unit_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else: 
        df["unit_number"] = extracted_series[0]

    return df

def extract_postcode(df:PandasDataFrame, col:str) -> PandasSeries:
    '''
    Extract a postcode from a string column in a dataframe
    '''
    postcode_series = df[col].str.upper().str.extract(pat = \
    "(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$)")
    
    return postcode_series

def remove_postcode(df:PandasDataFrame, col:str) -> PandasSeries:
    '''
    Remove a postcode from a string column in a dataframe
    '''

    
    address_series_no_pcode = df[col].str.upper().str.replace(\
    "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","",\
                                                              regex=True
                                                               ).str.lower()
    
    return address_series_no_pcode

# Remove addresses with no numbers in at all - too high a risk of badly assigning an address
def check_no_number_addresses(df:PandasDataFrame, in_address_series:PandasSeries) -> PandasSeries:
    '''
    Highlight addresses from a pandas df where there are no numbers in the address.
    '''
    df["in_address_series_temp"] = df[in_address_series].str.lower()

    no_numbers_series = df["in_address_series_temp"].str.contains("^(?!.*\d+).*$", regex=True)

    df.loc[no_numbers_series == True, 'Excluded from search'] = "Excluded - no numbers in address"

    df = df.drop("in_address_series_temp", axis = 1)

    print(df[["full_address", "Excluded from search"]])

    return df

# Exclude non-postal addresses
def remove_non_postal(df, in_address_series):
    '''
    Highlight non-postal addresses from a pandas df where a string series that contain specific substrings
    indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
    '''
    df["in_address_series_temp"] = df[in_address_series].str.lower()

    garage_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bgarage\\b|\\bgarages\\b)", regex=True)
    parking_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bparking\\b)", regex=True)
    shed_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bshed\\b|\\bsheds\\b)", regex=True)
    bike_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbike\\b|\\bbikes\\b)", regex=True)
    bicycle_store_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbicycle store\\b|\\bbicycle store\\b)", regex=True)

    non_postal_series = (garage_address_series | parking_address_series | shed_address_series | bike_address_series | bicycle_store_address_series)
    
    df.loc[non_postal_series == True, 'Excluded from search'] = "Excluded - non-postal address"

    df = df.drop("in_address_series_temp", axis = 1)

    return df

def replace_mistaken_dates(df:PandasDataFrame, col:str) -> PandasSeries:
    '''
    Identify addresses that mistakenly have dates in them and replace these dates with number values
    '''
    # Regex pattern to identify the date-month format
    pattern = r'(\d{2})-(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
    
    # Dictionary to map month abbreviations to numbers
    month_map = {
        'jan': '1', 'feb': '2', 'mar': '3', 'apr': '4', 'may': '5', 'jun': '6',
        'jul': '7', 'aug': '8', 'sep': '9', 'oct': '10', 'nov': '11', 'dec': '12'
    }
    
    # Custom replacement function
    def replace_month(match):
        day = match.group(1).lstrip('0')  # Get the day and remove leading zeros
        month = month_map[match.group(2)]  # Convert month abbreviation to number
        return f"{day}-{month}"
    
    # Apply the regex replacement
    corrected_addresses = df[col].str.replace(pattern, replace_month, regex = True)

    return corrected_addresses

def merge_series(full_series: pd.Series, partially_filled_series: pd.Series) -> pd.Series:
    '''
    Merge two series. The 'full_series' is the series you want to replace values in
    'partially_filled_series' is the replacer series.
    '''
    replacer_series_is_null = partially_filled_series.isnull()

    # Start with full_series values
    merged_series = full_series.copy()

    # Replace values in merged_series where partially_filled_series is not null
    merged_series[~replacer_series_is_null] = partially_filled_series.dropna()

    return merged_series

def clean_cols(col:str) -> str:
    return col.lower().strip().replace(r" ", "_").strip()