import os
import ast
import pandas as pd
import pickle
import torch
import zipfile
from typing import List, Union, Type, Dict
from pydantic import BaseModel
from .pytorch_models import *

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]

def get_or_create_env_var(var_name, default_value):
    # Get the environment variable if it exists
    value = os.environ.get(var_name)

    # If it doesn't exist, set it to the default value
    if value is None:
        os.environ[var_name] = default_value
        value = default_value

    return value

# Retrieve or set the output folder
env_var_name = 'GRADIO_OUTPUT_FOLDER'
default_value = 'output/'

output_folder = get_or_create_env_var(env_var_name, default_value)
print(f'The value of {env_var_name} is {output_folder}')

# +
'''
Fuzzywuzzy/Rapidfuzz scorer to use. Options are: ratio, partial_ratio, token_sort_ratio,
partial_token_sort_ratio, token_set_ratio, partial_token_set_ratio, QRatio, UQRatio,
WRatio (default), UWRatio.
Details here: https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
'''
fuzzy_scorer_used = "token_set_ratio"

fuzzy_match_limit = 85
fuzzy_search_addr_limit = 20
filter_to_lambeth_pcodes = True
standardise = False

std = "_std" if standardise else "_not_std"

dataset_name = "data" + std
suffix_used = dataset_name + "_" + fuzzy_scorer_used
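# Illustrative sketch (not executed) of how the scorer and limit above are typically applied with
# rapidfuzz; the address strings here are made-up examples.
#
#   from rapidfuzz import fuzz
#   score = fuzz.token_set_ratio("FLAT 2 10 HIGH STREET", "10 HIGH STREET FLAT 2")  # token-order insensitive, gives 100.0
#   is_match = score >= fuzzy_match_limit  # 100.0 >= 85, so True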
# https://stackoverflow.com/questions/59221557/tensorflow-v2-replacement-for-tf-contrib-predictor-from-saved-model

# Uncomment these lines for the tensorflow model
#model_type = "tf"
#model_stub = "addr_model_out_lon"
#model_version = "00000001"
#file_step_suffix = "550" # A suffix added to output files to separate comparisons of test data from the same model at different training steps, e.g. '350' indicates a model that has been through 350,000 steps of training

# Uncomment these lines for the pytorch model
model_type = "lstm"
model_stub = "pytorch/lstm"
model_version = ""
file_step_suffix = ""

data_sample_size = 476887
N_EPOCHS = 10
max_predict_len = 12000

word_to_index = {}
cat_to_idx = {}
vocab = []
device = "cpu"

labels_list = []

ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))

# If in a non-standard location (e.g. on an AWS Lambda Function URL), save the model to the tmp drive
if output_folder == "output/":
    out_model_dir = ROOT_DIR
else:
    out_model_dir = output_folder[:-1]  # Drop the trailing slash
print(out_model_dir)

model_dir_name = os.path.join(ROOT_DIR, "nnet_model", model_stub, model_version)
model_path = os.path.join(model_dir_name, "saved_model.zip")
print("Model zip path: ", model_path)

if os.path.exists(model_path):

    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Better to go without the GPU to avoid 'out of memory' issues
    device = "cpu"

    # The labels_list object defines the structure of the prediction outputs.
    # It must be the same as what the model was originally trained on.

    # Load the pre-trained model
    with zipfile.ZipFile(model_path, "r") as zip_ref:
        zip_ref.extractall(out_model_dir)

    # if model_stub == "addr_model_out_lon":
    #     import tensorflow as tf
    #     tf.config.list_physical_devices('GPU')

    #     # Number of labels in total (+1 for the blank category)
    #     n_labels = len(labels_list) + 1

    #     # Allowable characters for the encoded representation
    #     vocab = list(string.digits + string.ascii_lowercase + string.punctuation + string.whitespace)

    #     #print("Loading TF model")
    #     exported_model = tf.saved_model.load(model_dir_name)

    #     labels_list = [
    #         'SaoText',            # 1
    #         'SaoStartNumber',     # 2
    #         'SaoStartSuffix',     # 3
    #         'SaoEndNumber',       # 4
    #         'SaoEndSuffix',       # 5
    #         'PaoText',            # 6
    #         'PaoStartNumber',     # 7
    #         'PaoStartSuffix',     # 8
    #         'PaoEndNumber',       # 9
    #         'PaoEndSuffix',       # 10
    #         'Street',             # 11
    #         'PostTown',           # 12
    #         'AdministrativeArea', # 13
    #         'Postcode'            # 14
    #     ]

    if "pytorch" in model_stub:
        labels_list = [
            'SaoText',            # 1
            'SaoStartNumber',     # 2
            'SaoStartSuffix',     # 3
            'SaoEndNumber',       # 4
            'SaoEndSuffix',       # 5
            'PaoText',            # 6
            'PaoStartNumber',     # 7
            'PaoStartSuffix',     # 8
            'PaoEndNumber',       # 9
            'PaoEndSuffix',       # 10
            'Street',             # 11
            'PostTown',           # 12
            'AdministrativeArea', # 13
            'Postcode',           # 14
            'IGNORE'
        ]

    if model_type in ("transformer", "gru", "lstm"):
        # Load the vocab and the word/category index mappings. ast.literal_eval is used rather than
        # eval, as it only parses Python literals and so is safe on untrusted file contents.
        with open(os.path.join(out_model_dir, "vocab.txt"), "r") as f:
            vocab = ast.literal_eval(f.read())
        with open(os.path.join(out_model_dir, "word_to_index.txt"), "r") as f:
            word_to_index = ast.literal_eval(f.read())
        with open(os.path.join(out_model_dir, "cat_to_idx.txt"), "r") as f:
            cat_to_idx = ast.literal_eval(f.read())

        VOCAB_SIZE = len(word_to_index)
        OUTPUT_DIM = len(cat_to_idx) + 1  # Number of classes/categories
        EMBEDDING_DIM = 48
        DROPOUT = 0.1
        PAD_TOKEN = 0

        if model_type == "transformer":
            NHEAD = 4
            NUM_ENCODER_LAYERS = 1
            exported_model = TransformerClassifier(VOCAB_SIZE, EMBEDDING_DIM, NHEAD, NUM_ENCODER_LAYERS, OUTPUT_DIM, DROPOUT, PAD_TOKEN)
        elif model_type == "gru":
            N_LAYERS = 3
            HIDDEN_DIM = 128
            exported_model = TextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)
        elif model_type == "lstm":
            N_LAYERS = 3
            HIDDEN_DIM = 128
            exported_model = LSTMTextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)

        out_model_file_name = "output_model_" + str(data_sample_size) + "_" + str(N_EPOCHS) + "_" + model_type + ".pth"
        out_model_path = os.path.join(out_model_dir, out_model_file_name)
        print("Model location: ", out_model_path)

        exported_model.load_state_dict(torch.load(out_model_path, map_location=torch.device('cpu'), weights_only=False))
        exported_model.eval()

        device = 'cpu'
        #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        exported_model.to(device)

    else:
        exported_model = []
        #tf.keras.models.load_model(model_dir_name, compile=False)
        # Compile the model with a loss function and an optimizer
        #exported_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])

else:
    exported_model = []

### ADDRESS MATCHING FUNCTIONS

# Number of records the address matcher will try to match in one go, to avoid exceeding memory limits.
batch_size = 10000
ref_batch_size = 150000
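# Minimal sketch (not executed; df is a hypothetical search DataFrame) of how batch_size can be used
# to process records in chunks so a single pass never exceeds memory limits.
#
#   for start in range(0, len(df), batch_size):
#       batch = df.iloc[start:start + batch_size]
#       ...  # match this batch against the reference data, ref_batch_size rows at a time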
### Fuzzy match method

'''
https://recordlinkage.readthedocs.io/en/latest/ref_df-compare.html#recordlinkage.compare.String
The Python Record Linkage Toolkit uses the jellyfish package for the Jaro, Jaro-Winkler, Levenshtein
and Damerau-Levenshtein algorithms. Options are ['jaro', 'jarowinkler', 'levenshtein',
'damerau_levenshtein', 'qgram', 'cosine', 'smith_waterman', 'lcs'].

Comparison of some of the Jellyfish string comparison methods:
https://manpages.debian.org/testing/python-jellyfish-doc/jellyfish.3.en.html
'''
fuzzy_method = "jarowinkler"

# Required overall match score for all columns to count as a match
score_cut_off = 98.7  # 97.5

# A higher score cut-off is set for nnet street blocking based on empirical data: below this match
# value I was seeing errors. The empirical value was 99.238, but it is set to 99.5 here to be
# maximally stringent. It is used in the score_based_match function in 'recordlinkage_funcs.py'.
score_cut_off_nnet_street = 99.5  # 99.238

# If there are no numbers in the address, the matcher needs to get a perfect score (otherwise too many issues).
no_number_fuzzy_match_limit = 100

# Reference data 'official' column names
ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix",
                    "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber", "PaoEndSuffix",
                    "PaoText", "Street", "PostTown", "Postcode"]

# Create a list of matching variables. Text columns will be fuzzy matched.
matching_variables = ref_address_cols
text_columns = ["Organisation", "PaoText", "Street", "PostTown", "Postcode"]

# Modify the relative importance of columns (weights) for the recordlinkage part of the match.
# PostTown and AdministrativeArea are not very important as we have the postcode; street number and
# name are important.
Organisation_weight = 0.1  # Organisation weight is very low, just to resolve tie-breakers for very similar addresses
PaoStartNumber_weight = 2
SaoStartNumber_weight = 2
Street_weight = 2
PostTown_weight = 0
Postcode_weight = 0.5
AdministrativeArea_weight = 0
# -

weight_vals = [1] * len(ref_address_cols)
weight_keys = ref_address_cols
weights = {weight_keys[i]: weight_vals[i] for i in range(len(weight_keys))}
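# Illustrative sketch (not executed; the similarity values and the combination step are hypothetical)
# of how per-column similarities and the column weights can combine into the overall score that is
# compared against score_cut_off. The real logic lives in 'recordlinkage_funcs.py'.
#
#   sims = {"PaoStartNumber": 1.0, "Street": 0.96, "Postcode": 1.0}  # per-column similarities (0-1)
#   total_weight = sum(weights[col] for col in sims)
#   overall = 100 * sum(sims[col] * weights[col] for col in sims) / total_weight
#   is_match = overall >= score_cut_off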
# +
# Apply the modified column weights
weights["Organisation"] = Organisation_weight
weights["SaoStartNumber"] = SaoStartNumber_weight
weights["PaoStartNumber"] = PaoStartNumber_weight
weights["Street"] = Street_weight
weights["PostTown"] = PostTown_weight
weights["Postcode"] = Postcode_weight

# Create the Pydantic BaseModel class
class MatcherClass(BaseModel):
    # Fuzzy/general attributes
    fuzzy_scorer_used: str
    fuzzy_match_limit: int
    fuzzy_search_addr_limit: int
    filter_to_lambeth_pcodes: bool
    standardise: bool
    suffix_used: str

    # Neural net attributes
    matching_variables: List[str]
    model_dir_name: str
    file_step_suffix: str
    exported_model: List

    fuzzy_method: str
    score_cut_off: float
    text_columns: List[str]
    weights: dict
    model_type: str
    labels_list: List[str]

    # These are variables that are added on later
    # Pytorch optional variables
    word_to_index: dict
    cat_to_idx: dict
    device: str
    vocab: List[str]

    # Join data
    file_name: str
    ref_name: str
    search_df: pd.DataFrame
    excluded_df: pd.DataFrame
    pre_filter_search_df: pd.DataFrame
    search_address_cols: List[str]
    search_postcode_col: List[str]
    search_df_key_field: str

    ref_df: pd.DataFrame
    ref_pre_filter: pd.DataFrame
    ref_address_cols: List[str]
    new_join_col: List[str]
    #in_joincol_list: List[str]
    existing_match_cols: List[str]
    standard_llpg_format: List[str]

    # Results attributes
    match_results_output: pd.DataFrame
    predict_df_nnet: pd.DataFrame

    # Other attributes generated during matching
    compare_all_candidates: List[str]
    diag_shortlist: List[str]
    diag_best_match: List[str]
    results_on_orig_df: pd.DataFrame

    summary: str
    output_summary: str

    match_outputs_name: str
    results_orig_df_name: str

    search_df_after_stand: pd.DataFrame
    ref_df_after_stand: pd.DataFrame
    search_df_after_full_stand: pd.DataFrame
    ref_df_after_full_stand: pd.DataFrame

    search_df_after_stand_series: pd.Series
    ref_df_after_stand_series: pd.Series
    search_df_after_stand_series_full_stand: pd.Series
    ref_df_after_stand_series_full_stand: pd.Series

    # Abort flag if the matcher couldn't even get the results of the first match
    abort_flag: bool

    # This is to allow for Pandas DataFrame types as arguments
    class Config:
        # Allow for custom types such as Pandas DataFrames in the class
        arbitrary_types_allowed = True
        extra = 'allow'
        # Disable protected namespaces to avoid conflicts
        protected_namespaces = ()
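# Minimal sketch (not executed) of why the Config options above are needed: pydantic rejects
# pd.DataFrame-typed fields unless arbitrary_types_allowed is set, and extra = 'allow' keeps any
# undeclared fields passed in (as done with the *_cleaned frames below). Demo is a hypothetical model.
#
#   class Demo(BaseModel):
#       df: pd.DataFrame
#       class Config:
#           arbitrary_types_allowed = True
#           extra = 'allow'
#
#   d = Demo(df=pd.DataFrame(), note="kept")  # 'note' is undeclared but retained: d.note == "kept"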
# Create an instance of MatcherClass
InitMatch = MatcherClass(
    # Fuzzy/general attributes
    fuzzy_scorer_used = fuzzy_scorer_used,
    fuzzy_match_limit = fuzzy_match_limit,
    fuzzy_search_addr_limit = fuzzy_search_addr_limit,
    filter_to_lambeth_pcodes = filter_to_lambeth_pcodes,
    standardise = standardise,
    suffix_used = suffix_used,

    # Neural net attributes
    matching_variables = matching_variables,
    model_dir_name = model_dir_name,
    file_step_suffix = file_step_suffix,
    exported_model = [exported_model],

    fuzzy_method = fuzzy_method,
    score_cut_off = score_cut_off,
    text_columns = text_columns,
    weights = weights,
    model_type = model_type,
    labels_list = labels_list,

    # These are variables that are added on later
    # Pytorch optional variables
    word_to_index = word_to_index,
    cat_to_idx = cat_to_idx,
    device = device,
    vocab = vocab,

    # Join data
    file_name = '',
    ref_name = '',
    df_name = '',  # Not declared on the model; kept via extra = 'allow'
    search_df = pd.DataFrame(),
    excluded_df = pd.DataFrame(),
    pre_filter_search_df = pd.DataFrame(),
    search_df_not_matched = pd.DataFrame(),
    search_df_cleaned = pd.DataFrame(),
    search_address_cols = [],
    search_postcode_col = [],
    search_df_key_field = 'index',

    ref_df = pd.DataFrame(),
    ref_df_cleaned = pd.DataFrame(),
    ref_pre_filter = pd.DataFrame(),
    ref_address_cols = [],
    new_join_col = [],
    #in_joincol_list = [],
    existing_match_cols = [],
    standard_llpg_format = [],

    # Results attributes
    match_results_output = pd.DataFrame(),
    predict_df_nnet = pd.DataFrame(),

    # Other attributes generated during matching
    compare_all_candidates = [],
    diag_shortlist = [],
    diag_best_match = [],
    results_on_orig_df = pd.DataFrame(),

    summary = "",
    output_summary = "",

    match_outputs_name = "",
    results_orig_df_name = "",

    # Post dataset preparation variables
    search_df_after_stand = pd.DataFrame(),
    ref_df_after_stand = pd.DataFrame(),
    search_df_after_stand_series = pd.Series(),
    ref_df_after_stand_series = pd.Series(),

    search_df_after_full_stand = pd.DataFrame(),
    ref_df_after_full_stand = pd.DataFrame(),
    search_df_after_stand_series_full_stand = pd.Series(),
    ref_df_after_stand_series_full_stand = pd.Series(),

    # Abort flag if the matcher couldn't even get the results of the first match
    abort_flag = False
)
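# Quick smoke test when this module is run directly as part of the package (the relative import at
# the top means it cannot be run as a standalone script); a minimal sketch of reading the config back.
if __name__ == "__main__":
    print("Fuzzy scorer:", InitMatch.fuzzy_scorer_used, "| match limit:", InitMatch.fuzzy_match_limit)
    print("Model type:", InitMatch.model_type, "| labels:", len(InitMatch.labels_list))
    print("Column weights:", InitMatch.weights)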