Spaces:

seanpedrickcase
/

address_matcher

Running

File size: 17,406 Bytes

dd1cbb4

import pandas as pd
from typing import Type, Dict, List, Tuple
from datetime import datetime
#import polars as pl
import re

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")


def prepare_search_address_string(
    search_str: str
) -> Tuple[pd.DataFrame, str, List[str], List[str]]:
    """Extracts address and postcode from search_str into new DataFrame"""

    # Validate input
    if not isinstance(search_str, str):
        raise TypeError("search_str must be a string")
        
    search_df = pd.DataFrame(data={"full_address":[search_str]})

    #print(search_df)
    
    # Extract postcode 
    postcode_series = extract_postcode(search_df, "full_address").dropna(axis=1)[0]

    # Remove postcode from address
    address_series = remove_postcode(search_df, "full_address")

    # Construct output DataFrame
    search_df_out = pd.DataFrame()
    search_df_out["full_address"] = address_series
    search_df_out["postcode"] = postcode_series

    # Set key field for joining
    key_field = "index"

    # Reset index to use as key field
    search_df_out = search_df_out.reset_index()

    # Define column names to return
    address_cols = ["full_address"]
    postcode_col = ["postcode"]

    return search_df_out, key_field, address_cols, postcode_col

# def prepare_search_address(
#     search_df: pd.DataFrame, 
#     address_cols: list,
#     postcode_col: list,
#     key_col: str
# ) -> Tuple[pd.DataFrame, str]:
    
#     # Validate inputs
#     if not isinstance(search_df, pd.DataFrame):
#         raise TypeError("search_df must be a Pandas DataFrame")
        
#     if not isinstance(address_cols, list):
#         raise TypeError("address_cols must be a list")
        
#     if not isinstance(postcode_col, list):
#         raise TypeError("postcode_col must be a list")
        
#     if not isinstance(key_col, str):
#         raise TypeError("key_col must be a string")
        
#     # Clean address columns
#     clean_addresses = _clean_columns(search_df, address_cols)
    
#     # Join address columns into one
#     full_addresses = _join_address(clean_addresses, address_cols)
    
#     # Add postcode column 
#     full_df = _add_postcode_column(full_addresses, postcode_col)
    
#     # Remove postcode from main address if there was only one column in the input
#     if postcode_col == "full_address_postcode":
#         # Remove postcode from address
#         address_series = remove_postcode(search_df, "full_address")
#         search_df["full_address"] == address_series
    
#     # Ensure index column
#     final_df = _ensure_index(full_df, key_col)

#     #print(final_df)

    
#     return final_df, key_col

def prepare_search_address(
    search_df: pd.DataFrame, 
    address_cols: list,
    postcode_col: list,
    key_col: str
) -> Tuple[pd.DataFrame, str]:
    
    # Validate inputs
    if not isinstance(search_df, pd.DataFrame):
        raise TypeError("search_df must be a Pandas DataFrame")
        
    if not isinstance(address_cols, list):
        raise TypeError("address_cols must be a list")
        
    if not isinstance(postcode_col, list):
        raise TypeError("postcode_col must be a list")
        
    if not isinstance(key_col, str):
        raise TypeError("key_col must be a string")
        
    # Clean address columns
    #search_df_polars = pl.from_dataframe(search_df)
    clean_addresses = _clean_columns(search_df, address_cols)
    
    # Join address columns into one
    full_addresses = _join_address(clean_addresses, address_cols)
    
    # Add postcode column 
    full_df = _add_postcode_column(full_addresses, postcode_col)
    
    # Remove postcode from main address if there was only one column in the input
    if postcode_col == "full_address_postcode":
        # Remove postcode from address
        address_series = remove_postcode(search_df, "full_address")
        search_df["full_address"] == address_series
    
    # Ensure index column
    final_df = _ensure_index(full_df, key_col)

    #print(final_df)

    
    return final_df

# Helper functions
def _clean_columns(df, cols):
   # Cleaning logic
   def clean_col(col):
       return col.astype(str).fillna("").infer_objects(copy=False).str.replace("nan","").str.replace("\s{2,}", " ", regex=True).str.replace(","," ").str.strip()

   df[cols] = df[cols].apply(clean_col)
    
   return df

# def _clean_columns(df, cols):
#     # Cleaning logic
#     #print(df)

#     #if isinstance(df, pl.DataFrame):
#     #    print("It's a Polars DataFrame")

#     def clean_col(col):
#         col = col.str.replace("nan", "")
#         col = col.apply(lambda x: re.sub(r'\s{2,}', ' ', str(x)), skip_nulls=False, return_dtype=str)  # replace any spaces greater than one with one
#         return col.str.replace(",", " ").str.strip()  # replace commas with a space

#     for col in cols:
#         df = df.with_columns(clean_col(df[col]).alias(col))

#     return df

   
def _join_address(df, cols):
   # Joining logic
   full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
   df["full_address"] = full_address.str.replace("\s{2,}", " ", regex=True).str.strip()
   
   return df
   
def _add_postcode_column(df, postcodes):
   # Add postcode column
   if isinstance(postcodes, list):
        postcodes = postcodes[0]
    
   if postcodes != "full_address_postcode":
        df = df.rename(columns={postcodes:"postcode"})
   else:
        #print(df["full_address_postcode"])
        #print(extract_postcode(df,"full_address_postcode"))
        df["full_address_postcode"] = extract_postcode(df,"full_address_postcode")[0] # 
        df = df.rename(columns={postcodes:"postcode"})
        #print(df)
   
   return df
   
def _ensure_index(df, index_col):
   # Ensure index column exists
   if ((index_col == "index") & ~("index" in df.columns)):
        print("Resetting index in _ensure_index function")   
        df = df.reset_index()

   df[index_col] = df[index_col].astype(str)

   return df

def create_full_address(df):

    df = df.fillna("").infer_objects(copy=False)

    if "Organisation" not in df.columns:
        df["Organisation"] = ""

    df["full_address"] = df['Organisation'] + " " + df['SaoText'].str.replace(" - ", " REPL ").str.replace("- ", " REPLEFT ").str.replace(" -", " REPLRIGHT ") + " " + df["SaoStartNumber"].astype(str) + df["SaoStartSuffix"] + "-" + df["SaoEndNumber"].astype(str) + df["SaoEndSuffix"] + " " + df["PaoText"].str.replace(" - ", " REPL ").str.replace("- ", " REPLEFT ").str.replace(" -", " REPLRIGHT ") + " " + df["PaoStartNumber"].astype(str) + df["PaoStartSuffix"] + "-" + df["PaoEndNumber"].astype(str) + df["PaoEndSuffix"] + " " + df["Street"] + " " + df["PostTown"] + " " + df["Postcode"]

    #.str.replace(r'(?<=[a-zA-Z])-(?![a-zA-Z])|(?<![a-zA-Z])-(?=[a-zA-Z])', ' ', regex=True)\
    
    #.str.replace(".0","", regex=False)\
    
    df["full_address"] = df["full_address"]\
    .str.replace("-999","")\
    .str.replace(" -"," ")\
    .str.replace("- "," ")\
    .str.replace(" REPL "," - ")\
    .str.replace(" REPLEFT ","- ")\
    .str.replace(" REPLRIGHT "," -")\
    .str.replace("\s+"," ", regex=True)\
    .str.strip()
    #.str.replace("  "," ")\
    
    return df["full_address"]

def prepare_ref_address(ref_df, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
    
    if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): standard_cols = True
    else: standard_cols = False
    
    ref_address_cols_uprn = ref_address_cols.copy()

    ref_address_cols_uprn.extend(new_join_col)
    ref_address_cols_uprn_w_ref = ref_address_cols_uprn.copy()
    ref_address_cols_uprn_w_ref.extend(["Reference file"])

    ref_df_cleaned = ref_df.copy()
      
    # In on-prem LPI db street has been excluded, so put this back in
    if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns): 
            ref_df_cleaned['Street'] = ref_df_cleaned['Address_LPI'].str.replace("\\n", " ", regex = True).apply(extract_street_name)#
        
    if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns):
        ref_df_cleaned['Organisation'] = ""
     
    ref_df_cleaned = ref_df_cleaned[ref_address_cols_uprn_w_ref]

    ref_df_cleaned = ref_df_cleaned.fillna("").infer_objects(copy=False)

    all_columns = list(ref_df_cleaned) # Creates list of all column headers
    ref_df_cleaned[all_columns] = ref_df_cleaned[all_columns].astype(str).fillna('').infer_objects(copy=False).replace('nan','')

    ref_df_cleaned = ref_df_cleaned.replace("\.0","",regex=True)

    # Create full address

    all_columns = list(ref_df_cleaned) # Creates list of all column headers
    ref_df_cleaned[all_columns] = ref_df_cleaned[all_columns].astype(str)

    ref_df_cleaned = ref_df_cleaned.replace("nan","")
    ref_df_cleaned = ref_df_cleaned.replace("\.0","",regex=True)
      
    if standard_cols == True:
        ref_df_cleaned= ref_df_cleaned[ref_address_cols_uprn_w_ref].fillna('').infer_objects(copy=False)

        ref_df_cleaned["fulladdress"] = create_full_address(ref_df_cleaned[ref_address_cols_uprn_w_ref])
    
    else: 
        ref_df_cleaned= ref_df_cleaned[ref_address_cols_uprn_w_ref].fillna('').infer_objects(copy=False)
        
        full_address  = ref_df_cleaned[ref_address_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1) 
        ref_df_cleaned["fulladdress"] = full_address

    ref_df_cleaned["fulladdress"] = ref_df_cleaned["fulladdress"]\
    .str.replace("-999","")\
    .str.replace(" -"," ")\
    .str.replace("- "," ")\
    .str.replace(".0","", regex=False)\
    .str.replace("\s{2,}", " ", regex=True)\
    .str.strip()
    
    # Create a street column if it doesn't exist by extracting street from the full address
    
    if 'Street' not in ref_df_cleaned.columns:        
        ref_df_cleaned['Street'] = ref_df_cleaned["fulladdress"].apply(extract_street_name)

    # Add index column
        ref_df_cleaned['ref_index'] = ref_df_cleaned.index
        
    return ref_df_cleaned

# def prepare_ref_address(ref_df:pl.DataFrame, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
    
#     if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): 
#         standard_cols = True
#     else: 
#         standard_cols = False

#     ref_address_cols_uprn = list(ref_address_cols) + new_join_col
#     ref_df_cleaned = ref_df[ref_address_cols_uprn].fill_null("")
    
#     # In on-prem LPI db street has been excluded, so put this back in
#     if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns): 
#         ref_df_cleaned = ref_df_cleaned.with_column(pl.col('Address_LPI').apply(lambda x: extract_street_name(x)).alias('Street'))  
        
#     if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns): 
#         ref_df_cleaned = ref_df_cleaned.with_column(pl.lit("").alias('Organisation'))
     
#     #ref_df_cleaned['fulladdress'] = 
        
#     if standard_cols:
#         pass
#         # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations. 
#         # However, you might need to convert string types to object type for full address creation which may require more than just a few lines of codes. 
#     else:
#         pass    
        
#         # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
      
#     if 'Street' not in ref_df_cleaned.columns:  
#         ref_df_cleaned = ref_df_cleaned.with_column(pl.col('fulladdress').apply(extract_street_name).alias("Street"))
    
#     # Add index column
#     ref_df_cleaned = ref_df_cleaned.with_column(pl.lit('').alias('ref_index'))
        
#     return ref_df_cleaned


def extract_postcode(df, col:str) -> PandasSeries:
    '''
    Extract a postcode from a string column in a dataframe
    '''
    postcode_series = df[col].str.upper().str.extract(pat = \
    "(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$)")
    
    return postcode_series


# Remove addresses with no numbers in at all - too high a risk of badly assigning an address
def check_no_number_addresses(df, in_address_series) -> PandasSeries:
    '''
    Highlight addresses from a pandas df where there are no numbers in the address.
    '''
    df["in_address_series_temp"] = df[in_address_series].str.lower()

    no_numbers_series = df["in_address_series_temp"].str.contains("^(?!.*\d+).*$", regex=True)

    df.loc[no_numbers_series == True, 'Excluded from search'] = "Excluded - no numbers in address"

    df = df.drop("in_address_series_temp", axis = 1)

    #print(df[["full_address", "Excluded from search"]])

    return df


def remove_postcode(df, col:str) -> PandasSeries:
    '''
    Remove a postcode from a string column in a dataframe
    '''
    address_series_no_pcode = df[col].str.upper().str.replace(\
    "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
    
    return address_series_no_pcode

def extract_street_name(address:str) -> str:
    """
    Extracts the street name from the given address.

    Args:
        address (str): The input address string.

    Returns:
        str: The extracted street name, or an empty string if no match is found.

    Examples:
        >>> address1 = "1 Ash Park Road SE54 3HB"
        >>> extract_street_name(address1)
        'Ash Park Road'

        >>> address2 = "Flat 14 1 Ash Park Road SE54 3HB"
        >>> extract_street_name(address2)
        'Ash Park Road'

        >>> address3 = "123 Main Blvd"
        >>> extract_street_name(address3)
        'Main Blvd'

        >>> address4 = "456 Maple AvEnUe"
        >>> extract_street_name(address4)
        'Maple AvEnUe'

        >>> address5 = "789 Oak Street"
        >>> extract_street_name(address5)
        'Oak Street'
    """
    
   
    street_types = [
        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
        'Alley', 'Arcade','Avenue', 'Ave','Bay','Bend','Brae','Byway','Close','Corner','Cove',
        'Crescent', 'Cres','Cul-de-sac','Dell','Drive', 'Dr','Esplanade','Glen','Green','Grove','Heights', 'Hts',
        'Mews','Parade','Path','Piazza','Promenade','Quay','Ridge','Row','Terrace', 'Ter','Track','Trail','View','Villas',
        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
    ]

    # Dynamically construct the regex pattern with all possible street types
    street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)

    # The overall regex pattern to capture the street name
    pattern = rf'(?:\d+\s+|\w+\s+\d+\s+|.*\d+[a-z]+\s+|.*\d+\s+)*(?P<street_name>[\w\s]+(?:{street_types_pattern}))'

    def replace_postcode(address):
        pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$'
        return re.sub(pattern, "", address)

    
    modified_address = replace_postcode(address.upper())
    #print(modified_address)
    #print(address)
       
    # Perform a case-insensitive search
    match = re.search(pattern, modified_address, re.IGNORECASE)

    if match:
        street_name = match.group('street_name')
        return street_name.strip()
    else:
        return ""
    
    
    # Exclude non-postal addresses

def remove_non_postal(df, in_address_series):
    '''
    Highlight non-postal addresses from a polars df where a string series that contain specific substrings
    indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
    '''
    df["in_address_series_temp"] = df[in_address_series].str.lower()

    garage_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bgarage\\b|\\bgarages\\b)", regex=True)
    parking_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bparking\\b)", regex=True)
    shed_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bshed\\b|\\bsheds\\b)", regex=True)
    bike_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbike\\b|\\bbikes\\b)", regex=True)
    bicycle_store_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbicycle store\\b|\\bbicycle store\\b)", regex=True)

    non_postal_series = (garage_address_series | parking_address_series | shed_address_series | bike_address_series | bicycle_store_address_series)
    
    df.loc[non_postal_series == True, 'Excluded from search'] = "Excluded - non-postal address"

    df = df.drop("in_address_series_temp", axis = 1)

    return df