Spaces:
Running
Running
Commit
·
36bca81
1
Parent(s):
b977e79
Removed direct AWS bucket reference. A few minor bug fixes.
Browse files
- tools/aws_functions.py +1 -1
- tools/fuzzy_match.py +5 -1
- tools/matcher_funcs.py +149 -143
tools/aws_functions.py
CHANGED
@@ -6,7 +6,7 @@ import os
|
|
6 |
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
|
9 |
-
bucket_name = '
|
10 |
|
11 |
try:
|
12 |
session = boto3.Session(profile_name="default")
|
|
|
6 |
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
|
9 |
+
bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
|
10 |
|
11 |
try:
|
12 |
session = boto3.Session(profile_name="default")
|
tools/fuzzy_match.py
CHANGED
@@ -212,6 +212,10 @@ def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cl
|
|
212 |
joined_ref_cols = ["fulladdress", "Reference file"]
|
213 |
joined_ref_cols.extend(new_join_col)
|
214 |
|
|
|
|
|
|
|
|
|
215 |
match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)
|
216 |
|
217 |
# Convert long keys to string to avoid data loss
|
@@ -391,7 +395,7 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
|
|
391 |
"reference_orig_address":"Reference matched address",
|
392 |
"full_match":"Matched with reference address",
|
393 |
'uprn':'UPRN'
|
394 |
-
})
|
395 |
|
396 |
ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
|
397 |
ref_df_after_stand_cols.extend(new_join_col)
|
|
|
212 |
joined_ref_cols = ["fulladdress", "Reference file"]
|
213 |
joined_ref_cols.extend(new_join_col)
|
214 |
|
215 |
+
print("joined_ref_cols: ", joined_ref_cols)
|
216 |
+
# Keep only columns that exist in reference dataset
|
217 |
+
joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
|
218 |
+
|
219 |
match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)
|
220 |
|
221 |
# Convert long keys to string to avoid data loss
|
|
|
395 |
"reference_orig_address":"Reference matched address",
|
396 |
"full_match":"Matched with reference address",
|
397 |
'uprn':'UPRN'
|
398 |
+
}, errors="ignore")
|
399 |
|
400 |
ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
|
401 |
ref_df_after_stand_cols.extend(new_join_col)
|
tools/matcher_funcs.py
CHANGED
@@ -474,185 +474,189 @@ def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame,
|
|
474 |
|
475 |
return Matcher
|
476 |
|
477 |
-
def check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api):
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
Matcher.search_df = data_state
|
490 |
|
491 |
-
|
|
|
|
|
|
|
492 |
|
493 |
-
|
494 |
|
495 |
-
|
496 |
-
|
497 |
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
|
515 |
|
516 |
-
|
517 |
-
|
518 |
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
|
523 |
-
|
524 |
-
|
525 |
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
|
|
|
|
|
|
533 |
|
534 |
-
|
535 |
-
print("In colnames: ", in_colnames)
|
536 |
|
537 |
-
|
538 |
-
|
539 |
|
540 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
541 |
|
542 |
-
|
543 |
-
Matcher.search_df['full_address_postcode'] = Matcher.search_df[in_colnames[0]]
|
544 |
-
Matcher.search_postcode_col = ['full_address_postcode']
|
545 |
-
Matcher.search_address_cols.append('full_address_postcode')
|
546 |
|
547 |
-
|
548 |
-
|
|
|
|
|
549 |
|
550 |
-
|
551 |
-
if "Matched with reference address" in Matcher.search_df.columns:
|
552 |
-
Matcher.search_df.loc[~Matcher.search_df[in_existing].isna(), "Matched with reference address"] = True
|
553 |
-
else: Matcher.search_df["Matched with reference address"] = ~Matcher.search_df[in_existing].isna()
|
554 |
-
|
555 |
-
print("Shape of search_df before filtering is: ", Matcher.search_df.shape)
|
556 |
|
557 |
-
### Filter addresses to those with length > 0
|
558 |
-
zero_length_search_df = Matcher.search_df.copy()[Matcher.search_address_cols]
|
559 |
-
zero_length_search_df = zero_length_search_df.fillna('').infer_objects(copy=False)
|
560 |
-
Matcher.search_df["address_cols_joined"] = zero_length_search_df.astype(str).sum(axis=1).str.strip()
|
561 |
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
unique_ref_pcode_area = (Matcher.ref_df["postcode_search_area"][Matcher.ref_df["postcode_search_area"].str.len() > 3]).unique()
|
572 |
-
postcode_found_in_search = Matcher.search_df["postcode_search_area"].isin(unique_ref_pcode_area)
|
573 |
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
|
580 |
-
|
581 |
-
|
582 |
|
|
|
|
|
|
|
|
|
|
|
583 |
|
584 |
-
|
585 |
-
|
586 |
-
previously_matched = Matcher.pre_filter_search_df["Matched with reference address"] == True
|
587 |
-
Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
|
588 |
-
|
589 |
-
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0) | (previously_matched)]
|
590 |
-
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0) & ~(previously_matched)]
|
591 |
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
|
596 |
-
|
597 |
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
|
614 |
-
Matcher.search_df = Matcher.search_df.drop("address_cols_joined", axis = 1, errors="ignore")
|
615 |
-
Matcher.excluded_df = Matcher.excluded_df.drop("address_cols_joined", axis = 1, errors="ignore")
|
616 |
|
617 |
-
|
|
|
618 |
|
|
|
619 |
|
620 |
-
# If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
|
621 |
-
if in_api:
|
622 |
|
623 |
-
|
624 |
-
|
625 |
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
|
|
|
|
|
|
630 |
|
|
|
|
|
|
|
631 |
else:
|
632 |
-
|
633 |
-
Matcher.file_name = in_text
|
634 |
-
else:
|
635 |
-
Matcher.file_name = "API call"
|
636 |
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
|
647 |
-
|
648 |
-
|
649 |
|
650 |
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
|
655 |
-
|
656 |
|
657 |
def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
|
658 |
'''
|
@@ -687,8 +691,8 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
|
|
687 |
Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
|
688 |
Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
|
689 |
|
690 |
-
|
691 |
-
|
692 |
|
693 |
return Matcher
|
694 |
|
@@ -809,7 +813,8 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
|
|
809 |
Matcher.ref_df_after_stand,
|
810 |
Matcher.ref_df_after_full_stand,
|
811 |
Matcher.fuzzy_match_limit,
|
812 |
-
Matcher.fuzzy_scorer_used
|
|
|
813 |
if match_results_output.empty:
|
814 |
print("Match results empty")
|
815 |
Matcher.abort_flag = True
|
@@ -848,7 +853,8 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
|
|
848 |
Matcher.search_df_cleaned,
|
849 |
Matcher.ref_df_after_stand,
|
850 |
Matcher.search_df_after_stand,
|
851 |
-
Matcher.search_df_after_full_stand
|
|
|
852 |
|
853 |
if match_results_output.empty:
|
854 |
print("Match results empty")
|
@@ -886,7 +892,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
|
|
886 |
ref_df_after_full_stand:PandasDataFrame,
|
887 |
fuzzy_match_limit:float,
|
888 |
fuzzy_scorer_used:str,
|
889 |
-
new_join_col:List[str]
|
890 |
fuzzy_search_addr_limit:float = 100,
|
891 |
filter_to_lambeth_pcodes:bool=False):
|
892 |
|
@@ -1048,7 +1054,7 @@ def full_nn_match(ref_address_cols:List[str],
|
|
1048 |
ref_df_after_stand:PandasDataFrame,
|
1049 |
search_df_after_stand:PandasDataFrame,
|
1050 |
search_df_after_full_stand:PandasDataFrame,
|
1051 |
-
new_join_col:List
|
1052 |
'''
|
1053 |
Use a neural network model to partition 'search addresses' into constituent parts in the format of UK Ordnance Survey Land Property Identifier (LPI) addresses. These address components are compared individually against reference addresses in the same format to give an overall match score using the recordlinkage package.
|
1054 |
'''
|
|
|
474 |
|
475 |
return Matcher
|
476 |
|
477 |
+
def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, results_data_state:PandasDataFrame, in_file:List[str], in_text:str, in_colnames:List[str], in_joincol:List[str], in_existing:List[str], in_api:List[str]):
|
478 |
+
'''
|
479 |
+
Check if data to be matched exists. Filter it according to which records are relevant in the reference dataset
|
480 |
+
'''
|
481 |
|
482 |
+
# Assign join field if not known
|
483 |
+
if not Matcher.search_df_key_field:
|
484 |
+
Matcher.search_df_key_field = "index"
|
485 |
|
486 |
+
# Set search address cols as entered column names
|
487 |
+
#print("In colnames in check match data: ", in_colnames)
|
488 |
+
Matcher.search_address_cols = in_colnames
|
|
|
489 |
|
490 |
+
# Check if data loaded already and bring it in
|
491 |
+
if not data_state.empty:
|
492 |
+
|
493 |
+
Matcher.search_df = data_state
|
494 |
|
495 |
+
Matcher.search_df['index'] = Matcher.search_df.reset_index().index
|
496 |
|
497 |
+
else:
|
498 |
+
Matcher.search_df = pd.DataFrame()
|
499 |
|
500 |
+
# If someone has just entered open text, just load this instead
|
501 |
+
if in_text:
|
502 |
+
Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
|
503 |
|
504 |
+
# If two matcher files are loaded in, the algorithm will combine them together
|
505 |
+
if Matcher.search_df.empty and in_file:
|
506 |
+
output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
|
507 |
|
508 |
+
file_list = [string.name for string in in_file]
|
509 |
+
data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
|
510 |
+
|
511 |
+
#print("Data file names: ", data_file_names)
|
512 |
+
Matcher.file_name = get_file_name(data_file_names[0])
|
513 |
+
|
514 |
+
# Add a column to search_df to use as the index
|
515 |
+
Matcher.search_df['index'] = Matcher.search_df.index
|
516 |
|
517 |
|
518 |
+
# Join previously created results file onto search_df if previous results file exists
|
519 |
+
if not results_data_state.empty:
|
520 |
|
521 |
+
print("Joining on previous results file")
|
522 |
+
Matcher.results_on_orig_df = results_data_state.copy()
|
523 |
+
Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
|
524 |
|
525 |
+
# If no join on column suggested, assume the user wants the UPRN
|
526 |
+
print("in_joincol: ", in_joincol)
|
527 |
|
528 |
+
if not in_joincol:
|
529 |
+
Matcher.new_join_col = ['UPRN']
|
530 |
+
#Matcher.new_join_col = Matcher.new_join_col#[0]
|
531 |
+
|
532 |
+
else:
|
533 |
+
Matcher.new_join_col = in_joincol
|
534 |
+
#Matcher.new_join_col = Matcher.new_join_col
|
535 |
+
|
536 |
+
# Extract the column names from the input data
|
537 |
+
#print("In colnames: ", in_colnames)
|
538 |
|
539 |
+
print("Matcher.in_joincol: ", Matcher.new_join_col)
|
|
|
540 |
|
541 |
+
if len(in_colnames) > 1:
|
542 |
+
Matcher.search_postcode_col = [in_colnames[-1]]
|
543 |
|
544 |
+
#print("Postcode col: ", Matcher.search_postcode_col)
|
545 |
+
|
546 |
+
elif len(in_colnames) == 1:
|
547 |
+
Matcher.search_df['full_address_postcode'] = Matcher.search_df[in_colnames[0]]
|
548 |
+
Matcher.search_postcode_col = ['full_address_postcode']
|
549 |
+
Matcher.search_address_cols.append('full_address_postcode')
|
550 |
+
|
551 |
+
# Check for column that indicates there are existing matches. The code will then search this column for entries, and will remove them from the data to be searched
|
552 |
+
Matcher.existing_match_cols = in_existing
|
553 |
+
|
554 |
+
if in_existing:
|
555 |
+
if "Matched with reference address" in Matcher.search_df.columns:
|
556 |
+
Matcher.search_df.loc[~Matcher.search_df[in_existing].isna(), "Matched with reference address"] = True
|
557 |
+
else: Matcher.search_df["Matched with reference address"] = ~Matcher.search_df[in_existing].isna()
|
558 |
|
559 |
+
print("Shape of search_df before filtering is: ", Matcher.search_df.shape)
|
|
|
|
|
|
|
560 |
|
561 |
+
### Filter addresses to those with length > 0
|
562 |
+
zero_length_search_df = Matcher.search_df.copy()[Matcher.search_address_cols]
|
563 |
+
zero_length_search_df = zero_length_search_df.fillna('').infer_objects(copy=False)
|
564 |
+
Matcher.search_df["address_cols_joined"] = zero_length_search_df.astype(str).sum(axis=1).str.strip()
|
565 |
|
566 |
+
length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
|
|
|
|
|
|
|
|
|
|
|
567 |
|
|
|
|
|
|
|
|
|
568 |
|
569 |
+
### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last three characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
|
570 |
+
if not in_api:
|
571 |
+
if Matcher.filter_to_lambeth_pcodes == True:
|
572 |
+
Matcher.search_df["postcode_search_area"] = Matcher.search_df[Matcher.search_postcode_col[0]].str.strip().str.upper().str.replace(" ", "").str[:-2]
|
573 |
+
Matcher.ref_df["postcode_search_area"] = Matcher.ref_df["Postcode"].str.strip().str.upper().str.replace(" ", "").str[:-2]
|
574 |
+
|
575 |
+
unique_ref_pcode_area = (Matcher.ref_df["postcode_search_area"][Matcher.ref_df["postcode_search_area"].str.len() > 3]).unique()
|
576 |
+
postcode_found_in_search = Matcher.search_df["postcode_search_area"].isin(unique_ref_pcode_area)
|
|
|
|
|
|
|
577 |
|
578 |
+
Matcher.search_df["Excluded from search"] = "Included in search"
|
579 |
+
Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
|
580 |
+
Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
|
581 |
+
Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
|
582 |
+
Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
|
583 |
|
584 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
|
585 |
+
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
|
586 |
|
587 |
+
|
588 |
+
# Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
|
589 |
+
if "Matched with reference address" in Matcher.search_df.columns:
|
590 |
+
previously_matched = Matcher.pre_filter_search_df["Matched with reference address"] == True
|
591 |
+
Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
|
592 |
|
593 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0) | (previously_matched)]
|
594 |
+
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0) & ~(previously_matched)]
|
|
|
|
|
|
|
|
|
|
|
595 |
|
596 |
+
else:
|
597 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
|
598 |
+
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
|
599 |
|
600 |
+
print("Shape of ref_df before filtering is: ", Matcher.ref_df.shape)
|
601 |
|
602 |
+
unique_search_pcode_area = (Matcher.search_df["postcode_search_area"]).unique()
|
603 |
+
postcode_found_in_ref = Matcher.ref_df["postcode_search_area"].isin(unique_search_pcode_area)
|
604 |
+
Matcher.ref_df = Matcher.ref_df[postcode_found_in_ref]
|
605 |
|
606 |
+
Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("postcode_search_area", axis = 1)
|
607 |
+
Matcher.search_df = Matcher.search_df.drop("postcode_search_area", axis = 1)
|
608 |
+
Matcher.ref_df = Matcher.ref_df.drop("postcode_search_area", axis = 1)
|
609 |
+
Matcher.excluded_df = Matcher.excluded_df.drop("postcode_search_area", axis = 1)
|
610 |
+
else:
|
611 |
+
Matcher.pre_filter_search_df = Matcher.search_df.copy()
|
612 |
+
Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
|
613 |
+
|
614 |
+
Matcher.excluded_df = Matcher.search_df[~(length_more_than_0)]
|
615 |
+
Matcher.search_df = Matcher.search_df[length_more_than_0]
|
|
|
616 |
|
|
|
|
|
617 |
|
618 |
+
Matcher.search_df = Matcher.search_df.drop("address_cols_joined", axis = 1, errors="ignore")
|
619 |
+
Matcher.excluded_df = Matcher.excluded_df.drop("address_cols_joined", axis = 1, errors="ignore")
|
620 |
|
621 |
+
Matcher.search_df_not_matched = Matcher.search_df
|
622 |
|
|
|
|
|
623 |
|
624 |
+
# If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
|
625 |
+
if in_api:
|
626 |
|
627 |
+
if in_file:
|
628 |
+
output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
|
629 |
+
|
630 |
+
file_list = [string.name for string in in_file]
|
631 |
+
data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
|
632 |
+
|
633 |
+
Matcher.file_name = get_file_name(data_file_names[0])
|
634 |
|
635 |
+
else:
|
636 |
+
if in_text:
|
637 |
+
Matcher.file_name = in_text
|
638 |
else:
|
639 |
+
Matcher.file_name = "API call"
|
|
|
|
|
|
|
640 |
|
641 |
+
# Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
|
642 |
+
if in_existing:
|
643 |
+
print("Checking for previously matched records")
|
644 |
+
Matcher.pre_filter_search_df = Matcher.search_df.copy()
|
645 |
+
previously_matched = ~Matcher.pre_filter_search_df[in_existing].isnull()
|
646 |
+
Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
|
647 |
+
|
648 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(length_more_than_0) | (previously_matched)]
|
649 |
+
Matcher.search_df = Matcher.search_df[(length_more_than_0) & ~(previously_matched)]
|
650 |
|
651 |
+
if type(Matcher.search_df) == str: search_df_cleaned, search_df_key_field, search_address_cols = prepare_search_address_string(Matcher.search_df)
|
652 |
+
else: search_df_cleaned = prepare_search_address(Matcher.search_df, Matcher.search_address_cols, Matcher.search_postcode_col, Matcher.search_df_key_field)
|
653 |
|
654 |
|
655 |
+
Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
|
656 |
+
#Matcher.search_df = Matcher.search_df.reset_index(drop=True)
|
657 |
+
#Matcher.search_df.index.name = 'index'
|
658 |
|
659 |
+
return Matcher
|
660 |
|
661 |
def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
|
662 |
'''
|
|
|
691 |
Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
|
692 |
Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
|
693 |
|
694 |
+
Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
|
695 |
+
Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
|
696 |
|
697 |
return Matcher
|
698 |
|
|
|
813 |
Matcher.ref_df_after_stand,
|
814 |
Matcher.ref_df_after_full_stand,
|
815 |
Matcher.fuzzy_match_limit,
|
816 |
+
Matcher.fuzzy_scorer_used,
|
817 |
+
Matcher.new_join_col)
|
818 |
if match_results_output.empty:
|
819 |
print("Match results empty")
|
820 |
Matcher.abort_flag = True
|
|
|
853 |
Matcher.search_df_cleaned,
|
854 |
Matcher.ref_df_after_stand,
|
855 |
Matcher.search_df_after_stand,
|
856 |
+
Matcher.search_df_after_full_stand,
|
857 |
+
Matcher.new_join_col)
|
858 |
|
859 |
if match_results_output.empty:
|
860 |
print("Match results empty")
|
|
|
892 |
ref_df_after_full_stand:PandasDataFrame,
|
893 |
fuzzy_match_limit:float,
|
894 |
fuzzy_scorer_used:str,
|
895 |
+
new_join_col:List[str],
|
896 |
fuzzy_search_addr_limit:float = 100,
|
897 |
filter_to_lambeth_pcodes:bool=False):
|
898 |
|
|
|
1054 |
ref_df_after_stand:PandasDataFrame,
|
1055 |
search_df_after_stand:PandasDataFrame,
|
1056 |
search_df_after_full_stand:PandasDataFrame,
|
1057 |
+
new_join_col:List[str]):
|
1058 |
'''
|
1059 |
Use a neural network model to partition 'search addresses' into constituent parts in the format of UK Ordnance Survey Land Property Identifier (LPI) addresses. These address components are compared individually against reference addresses in the same format to give an overall match score using the recordlinkage package.
|
1060 |
'''
|