seanpedrickcase committed on
Commit
36bca81
·
1 Parent(s): b977e79

Removed direct AWS bucket reference. A few minor bug fixes.

Browse files
tools/aws_functions.py CHANGED
@@ -6,7 +6,7 @@ import os
6
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
9
- bucket_name = 'address-matcher-data'
10
 
11
  try:
12
  session = boto3.Session(profile_name="default")
 
6
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
9
+ bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
10
 
11
  try:
12
  session = boto3.Session(profile_name="default")
tools/fuzzy_match.py CHANGED
@@ -212,6 +212,10 @@ def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cl
212
  joined_ref_cols = ["fulladdress", "Reference file"]
213
  joined_ref_cols.extend(new_join_col)
214
 
 
 
 
 
215
  match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)
216
 
217
  # Convert long keys to string to avoid data loss
@@ -391,7 +395,7 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
391
  "reference_orig_address":"Reference matched address",
392
  "full_match":"Matched with reference address",
393
  'uprn':'UPRN'
394
- })
395
 
396
  ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
397
  ref_df_after_stand_cols.extend(new_join_col)
 
212
  joined_ref_cols = ["fulladdress", "Reference file"]
213
  joined_ref_cols.extend(new_join_col)
214
 
215
+ print("joined_ref_cols: ", joined_ref_cols)
216
+ # Keep only columns that exist in reference dataset
217
+ joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
218
+
219
  match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)
220
 
221
  # Convert long keys to string to avoid data loss
 
395
  "reference_orig_address":"Reference matched address",
396
  "full_match":"Matched with reference address",
397
  'uprn':'UPRN'
398
+ }, errors="ignore")
399
 
400
  ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
401
  ref_df_after_stand_cols.extend(new_join_col)
tools/matcher_funcs.py CHANGED
@@ -474,185 +474,189 @@ def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame,
474
 
475
  return Matcher
476
 
477
- def check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api):
478
- # Assign join field if not known
479
- if not Matcher.search_df_key_field:
480
- Matcher.search_df_key_field = "index"
481
 
482
- # Set search address cols as entered column names
483
- #print("In colnames in check match data: ", in_colnames)
484
- Matcher.search_address_cols = in_colnames
485
 
486
- # Check if data loaded already and bring it in
487
- if not data_state.empty:
488
-
489
- Matcher.search_df = data_state
490
 
491
-
 
 
 
492
 
493
- Matcher.search_df['index'] = Matcher.search_df.index
494
 
495
- else:
496
- Matcher.search_df = pd.DataFrame()
497
 
498
- # If someone has just entered open text, just load this instead
499
- if in_text:
500
- Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
501
 
502
- # If two matcher files are loaded in, the algorithm will combine them together
503
- if Matcher.search_df.empty and in_file:
504
- output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
505
 
506
- file_list = [string.name for string in in_file]
507
- data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
508
-
509
- #print("Data file names: ", data_file_names)
510
- Matcher.file_name = get_file_name(data_file_names[0])
511
-
512
- # search_df makes column to use as index
513
- Matcher.search_df['index'] = Matcher.search_df.index
514
 
515
 
516
- # Join previously created results file onto search_df if previous results file exists
517
- if not results_data_state.empty:
518
 
519
- print("Joining on previous results file")
520
- Matcher.results_on_orig_df = results_data_state.copy()
521
- Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
522
 
523
- # If no join on column suggested, assume the user wants the UPRN
524
- # print("in_joincol: ", in_joincol)
525
 
526
- if not in_joincol:
527
- Matcher.new_join_col = ['UPRN']
528
- #Matcher.new_join_col = Matcher.new_join_col#[0]
529
-
530
- else:
531
- Matcher.new_join_col = in_joincol
532
- #Matcher.new_join_col = Matcher.new_join_col
 
 
 
533
 
534
- # Extract the column names from the input data
535
- print("In colnames: ", in_colnames)
536
 
537
- if len(in_colnames) > 1:
538
- Matcher.search_postcode_col = [in_colnames[-1]]
539
 
540
- print("Postcode col: ", Matcher.search_postcode_col)
 
 
 
 
 
 
 
 
 
 
 
 
 
541
 
542
- elif len(in_colnames) == 1:
543
- Matcher.search_df['full_address_postcode'] = Matcher.search_df[in_colnames[0]]
544
- Matcher.search_postcode_col = ['full_address_postcode']
545
- Matcher.search_address_cols.append('full_address_postcode')
546
 
547
- # Check for column that indicates there are existing matches. The code will then search this column for entries, and will remove them from the data to be searched
548
- Matcher.existing_match_cols = in_existing
 
 
549
 
550
- if in_existing:
551
- if "Matched with reference address" in Matcher.search_df.columns:
552
- Matcher.search_df.loc[~Matcher.search_df[in_existing].isna(), "Matched with reference address"] = True
553
- else: Matcher.search_df["Matched with reference address"] = ~Matcher.search_df[in_existing].isna()
554
-
555
- print("Shape of search_df before filtering is: ", Matcher.search_df.shape)
556
 
557
- ### Filter addresses to those with length > 0
558
- zero_length_search_df = Matcher.search_df.copy()[Matcher.search_address_cols]
559
- zero_length_search_df = zero_length_search_df.fillna('').infer_objects(copy=False)
560
- Matcher.search_df["address_cols_joined"] = zero_length_search_df.astype(str).sum(axis=1).str.strip()
561
 
562
- length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
563
-
564
-
565
- ### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last three characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
566
- if not in_api:
567
- if Matcher.filter_to_lambeth_pcodes == True:
568
- Matcher.search_df["postcode_search_area"] = Matcher.search_df[Matcher.search_postcode_col[0]].str.strip().str.upper().str.replace(" ", "").str[:-2]
569
- Matcher.ref_df["postcode_search_area"] = Matcher.ref_df["Postcode"].str.strip().str.upper().str.replace(" ", "").str[:-2]
570
-
571
- unique_ref_pcode_area = (Matcher.ref_df["postcode_search_area"][Matcher.ref_df["postcode_search_area"].str.len() > 3]).unique()
572
- postcode_found_in_search = Matcher.search_df["postcode_search_area"].isin(unique_ref_pcode_area)
573
 
574
- Matcher.search_df["Excluded from search"] = "Included in search"
575
- Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
576
- Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
577
- Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
578
- Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
579
 
580
- Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
581
- Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
582
 
 
 
 
 
 
583
 
584
- # Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
585
- if "Matched with reference address" in Matcher.search_df.columns:
586
- previously_matched = Matcher.pre_filter_search_df["Matched with reference address"] == True
587
- Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
588
-
589
- Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0) | (previously_matched)]
590
- Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0) & ~(previously_matched)]
591
 
592
- else:
593
- Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
594
- Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
595
 
596
- print("Shape of ref_df before filtering is: ", Matcher.ref_df.shape)
597
 
598
- unique_search_pcode_area = (Matcher.search_df["postcode_search_area"]).unique()
599
- postcode_found_in_ref = Matcher.ref_df["postcode_search_area"].isin(unique_search_pcode_area)
600
- Matcher.ref_df = Matcher.ref_df[postcode_found_in_ref]
601
 
602
- Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("postcode_search_area", axis = 1)
603
- Matcher.search_df = Matcher.search_df.drop("postcode_search_area", axis = 1)
604
- Matcher.ref_df = Matcher.ref_df.drop("postcode_search_area", axis = 1)
605
- Matcher.excluded_df = Matcher.excluded_df.drop("postcode_search_area", axis = 1)
606
- else:
607
- Matcher.pre_filter_search_df = Matcher.search_df.copy()
608
- Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
609
-
610
- Matcher.excluded_df = Matcher.search_df[~(length_more_than_0)]
611
- Matcher.search_df = Matcher.search_df[length_more_than_0]
612
-
613
 
614
- Matcher.search_df = Matcher.search_df.drop("address_cols_joined", axis = 1, errors="ignore")
615
- Matcher.excluded_df = Matcher.excluded_df.drop("address_cols_joined", axis = 1, errors="ignore")
616
 
617
- Matcher.search_df_not_matched = Matcher.search_df
 
618
 
 
619
 
620
- # If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
621
- if in_api:
622
 
623
- if in_file:
624
- output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
625
 
626
- file_list = [string.name for string in in_file]
627
- data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
628
-
629
- Matcher.file_name = get_file_name(data_file_names[0])
 
 
 
630
 
 
 
 
631
  else:
632
- if in_text:
633
- Matcher.file_name = in_text
634
- else:
635
- Matcher.file_name = "API call"
636
 
637
- # Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
638
- if in_existing:
639
- print("Checking for previously matched records")
640
- Matcher.pre_filter_search_df = Matcher.search_df.copy()
641
- previously_matched = ~Matcher.pre_filter_search_df[in_existing].isnull()
642
- Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
643
-
644
- Matcher.excluded_df = Matcher.search_df.copy()[~(length_more_than_0) | (previously_matched)]
645
- Matcher.search_df = Matcher.search_df[(length_more_than_0) & ~(previously_matched)]
646
 
647
- if type(Matcher.search_df) == str: search_df_cleaned, search_df_key_field, search_address_cols = prepare_search_address_string(Matcher.search_df)
648
- else: search_df_cleaned = prepare_search_address(Matcher.search_df, Matcher.search_address_cols, Matcher.search_postcode_col, Matcher.search_df_key_field)
649
 
650
 
651
- Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
652
- #Matcher.search_df = Matcher.search_df.reset_index(drop=True)
653
- #Matcher.search_df.index.name = 'index'
654
 
655
- return Matcher
656
 
657
  def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
658
  '''
@@ -687,8 +691,8 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
687
  Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
688
  Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
689
 
690
- #Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
691
- #Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
692
 
693
  return Matcher
694
 
@@ -809,7 +813,8 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
809
  Matcher.ref_df_after_stand,
810
  Matcher.ref_df_after_full_stand,
811
  Matcher.fuzzy_match_limit,
812
- Matcher.fuzzy_scorer_used)
 
813
  if match_results_output.empty:
814
  print("Match results empty")
815
  Matcher.abort_flag = True
@@ -848,7 +853,8 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
848
  Matcher.search_df_cleaned,
849
  Matcher.ref_df_after_stand,
850
  Matcher.search_df_after_stand,
851
- Matcher.search_df_after_full_stand)
 
852
 
853
  if match_results_output.empty:
854
  print("Match results empty")
@@ -886,7 +892,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
886
  ref_df_after_full_stand:PandasDataFrame,
887
  fuzzy_match_limit:float,
888
  fuzzy_scorer_used:str,
889
- new_join_col:List[str]=["UPRN"],
890
  fuzzy_search_addr_limit:float = 100,
891
  filter_to_lambeth_pcodes:bool=False):
892
 
@@ -1048,7 +1054,7 @@ def full_nn_match(ref_address_cols:List[str],
1048
  ref_df_after_stand:PandasDataFrame,
1049
  search_df_after_stand:PandasDataFrame,
1050
  search_df_after_full_stand:PandasDataFrame,
1051
- new_join_col:List=["UPRN"]):
1052
  '''
1053
  Use a neural network model to partition 'search addresses' into constituent parts in the format of UK Ordnance Survey Land Property Identifier (LPI) addresses. These address components are compared individually against reference addresses in the same format to give an overall match score using the recordlinkage package.
1054
  '''
 
474
 
475
  return Matcher
476
 
477
+ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, results_data_state:PandasDataFrame, in_file:List[str], in_text:str, in_colnames:List[str], in_joincol:List[str], in_existing:List[str], in_api:List[str]):
478
+ '''
479
+ Check if data to be matched exists. Filter it according to which records are relevant in the reference dataset
480
+ '''
481
 
482
+ # Assign join field if not known
483
+ if not Matcher.search_df_key_field:
484
+ Matcher.search_df_key_field = "index"
485
 
486
+ # Set search address cols as entered column names
487
+ #print("In colnames in check match data: ", in_colnames)
488
+ Matcher.search_address_cols = in_colnames
 
489
 
490
+ # Check if data loaded already and bring it in
491
+ if not data_state.empty:
492
+
493
+ Matcher.search_df = data_state
494
 
495
+ Matcher.search_df['index'] = Matcher.search_df.reset_index().index
496
 
497
+ else:
498
+ Matcher.search_df = pd.DataFrame()
499
 
500
+ # If someone has just entered open text, just load this instead
501
+ if in_text:
502
+ Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
503
 
504
+ # If two matcher files are loaded in, the algorithm will combine them together
505
+ if Matcher.search_df.empty and in_file:
506
+ output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
507
 
508
+ file_list = [string.name for string in in_file]
509
+ data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
510
+
511
+ #print("Data file names: ", data_file_names)
512
+ Matcher.file_name = get_file_name(data_file_names[0])
513
+
514
+ # search_df makes column to use as index
515
+ Matcher.search_df['index'] = Matcher.search_df.index
516
 
517
 
518
+ # Join previously created results file onto search_df if previous results file exists
519
+ if not results_data_state.empty:
520
 
521
+ print("Joining on previous results file")
522
+ Matcher.results_on_orig_df = results_data_state.copy()
523
+ Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
524
 
525
+ # If no join on column suggested, assume the user wants the UPRN
526
+ print("in_joincol: ", in_joincol)
527
 
528
+ if not in_joincol:
529
+ Matcher.new_join_col = ['UPRN']
530
+ #Matcher.new_join_col = Matcher.new_join_col#[0]
531
+
532
+ else:
533
+ Matcher.new_join_col = in_joincol
534
+ #Matcher.new_join_col = Matcher.new_join_col
535
+
536
+ # Extract the column names from the input data
537
+ #print("In colnames: ", in_colnames)
538
 
539
+ print("Matcher.in_joincol: ", Matcher.new_join_col)
 
540
 
541
+ if len(in_colnames) > 1:
542
+ Matcher.search_postcode_col = [in_colnames[-1]]
543
 
544
+ #print("Postcode col: ", Matcher.search_postcode_col)
545
+
546
+ elif len(in_colnames) == 1:
547
+ Matcher.search_df['full_address_postcode'] = Matcher.search_df[in_colnames[0]]
548
+ Matcher.search_postcode_col = ['full_address_postcode']
549
+ Matcher.search_address_cols.append('full_address_postcode')
550
+
551
+ # Check for column that indicates there are existing matches. The code will then search this column for entries, and will remove them from the data to be searched
552
+ Matcher.existing_match_cols = in_existing
553
+
554
+ if in_existing:
555
+ if "Matched with reference address" in Matcher.search_df.columns:
556
+ Matcher.search_df.loc[~Matcher.search_df[in_existing].isna(), "Matched with reference address"] = True
557
+ else: Matcher.search_df["Matched with reference address"] = ~Matcher.search_df[in_existing].isna()
558
 
559
+ print("Shape of search_df before filtering is: ", Matcher.search_df.shape)
 
 
 
560
 
561
+ ### Filter addresses to those with length > 0
562
+ zero_length_search_df = Matcher.search_df.copy()[Matcher.search_address_cols]
563
+ zero_length_search_df = zero_length_search_df.fillna('').infer_objects(copy=False)
564
+ Matcher.search_df["address_cols_joined"] = zero_length_search_df.astype(str).sum(axis=1).str.strip()
565
 
566
+ length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
 
 
 
 
 
567
 
 
 
 
 
568
 
569
+ ### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last three characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
570
+ if not in_api:
571
+ if Matcher.filter_to_lambeth_pcodes == True:
572
+ Matcher.search_df["postcode_search_area"] = Matcher.search_df[Matcher.search_postcode_col[0]].str.strip().str.upper().str.replace(" ", "").str[:-2]
573
+ Matcher.ref_df["postcode_search_area"] = Matcher.ref_df["Postcode"].str.strip().str.upper().str.replace(" ", "").str[:-2]
574
+
575
+ unique_ref_pcode_area = (Matcher.ref_df["postcode_search_area"][Matcher.ref_df["postcode_search_area"].str.len() > 3]).unique()
576
+ postcode_found_in_search = Matcher.search_df["postcode_search_area"].isin(unique_ref_pcode_area)
 
 
 
577
 
578
+ Matcher.search_df["Excluded from search"] = "Included in search"
579
+ Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
580
+ Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
581
+ Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
582
+ Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
583
 
584
+ Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
585
+ Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
586
 
587
+
588
+ # Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
589
+ if "Matched with reference address" in Matcher.search_df.columns:
590
+ previously_matched = Matcher.pre_filter_search_df["Matched with reference address"] == True
591
+ Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
592
 
593
+ Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0) | (previously_matched)]
594
+ Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0) & ~(previously_matched)]
 
 
 
 
 
595
 
596
+ else:
597
+ Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
598
+ Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
599
 
600
+ print("Shape of ref_df before filtering is: ", Matcher.ref_df.shape)
601
 
602
+ unique_search_pcode_area = (Matcher.search_df["postcode_search_area"]).unique()
603
+ postcode_found_in_ref = Matcher.ref_df["postcode_search_area"].isin(unique_search_pcode_area)
604
+ Matcher.ref_df = Matcher.ref_df[postcode_found_in_ref]
605
 
606
+ Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("postcode_search_area", axis = 1)
607
+ Matcher.search_df = Matcher.search_df.drop("postcode_search_area", axis = 1)
608
+ Matcher.ref_df = Matcher.ref_df.drop("postcode_search_area", axis = 1)
609
+ Matcher.excluded_df = Matcher.excluded_df.drop("postcode_search_area", axis = 1)
610
+ else:
611
+ Matcher.pre_filter_search_df = Matcher.search_df.copy()
612
+ Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
613
+
614
+ Matcher.excluded_df = Matcher.search_df[~(length_more_than_0)]
615
+ Matcher.search_df = Matcher.search_df[length_more_than_0]
 
616
 
 
 
617
 
618
+ Matcher.search_df = Matcher.search_df.drop("address_cols_joined", axis = 1, errors="ignore")
619
+ Matcher.excluded_df = Matcher.excluded_df.drop("address_cols_joined", axis = 1, errors="ignore")
620
 
621
+ Matcher.search_df_not_matched = Matcher.search_df
622
 
 
 
623
 
624
+ # If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
625
+ if in_api:
626
 
627
+ if in_file:
628
+ output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
629
+
630
+ file_list = [string.name for string in in_file]
631
+ data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
632
+
633
+ Matcher.file_name = get_file_name(data_file_names[0])
634
 
635
+ else:
636
+ if in_text:
637
+ Matcher.file_name = in_text
638
  else:
639
+ Matcher.file_name = "API call"
 
 
 
640
 
641
+ # Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
642
+ if in_existing:
643
+ print("Checking for previously matched records")
644
+ Matcher.pre_filter_search_df = Matcher.search_df.copy()
645
+ previously_matched = ~Matcher.pre_filter_search_df[in_existing].isnull()
646
+ Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
647
+
648
+ Matcher.excluded_df = Matcher.search_df.copy()[~(length_more_than_0) | (previously_matched)]
649
+ Matcher.search_df = Matcher.search_df[(length_more_than_0) & ~(previously_matched)]
650
 
651
+ if type(Matcher.search_df) == str: search_df_cleaned, search_df_key_field, search_address_cols = prepare_search_address_string(Matcher.search_df)
652
+ else: search_df_cleaned = prepare_search_address(Matcher.search_df, Matcher.search_address_cols, Matcher.search_postcode_col, Matcher.search_df_key_field)
653
 
654
 
655
+ Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
656
+ #Matcher.search_df = Matcher.search_df.reset_index(drop=True)
657
+ #Matcher.search_df.index.name = 'index'
658
 
659
+ return Matcher
660
 
661
  def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
662
  '''
 
691
  Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
692
  Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
693
 
694
+ Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
695
+ Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
696
 
697
  return Matcher
698
 
 
813
  Matcher.ref_df_after_stand,
814
  Matcher.ref_df_after_full_stand,
815
  Matcher.fuzzy_match_limit,
816
+ Matcher.fuzzy_scorer_used,
817
+ Matcher.new_join_col)
818
  if match_results_output.empty:
819
  print("Match results empty")
820
  Matcher.abort_flag = True
 
853
  Matcher.search_df_cleaned,
854
  Matcher.ref_df_after_stand,
855
  Matcher.search_df_after_stand,
856
+ Matcher.search_df_after_full_stand,
857
+ Matcher.new_join_col)
858
 
859
  if match_results_output.empty:
860
  print("Match results empty")
 
892
  ref_df_after_full_stand:PandasDataFrame,
893
  fuzzy_match_limit:float,
894
  fuzzy_scorer_used:str,
895
+ new_join_col:List[str],
896
  fuzzy_search_addr_limit:float = 100,
897
  filter_to_lambeth_pcodes:bool=False):
898
 
 
1054
  ref_df_after_stand:PandasDataFrame,
1055
  search_df_after_stand:PandasDataFrame,
1056
  search_df_after_full_stand:PandasDataFrame,
1057
+ new_join_col:List[str]):
1058
  '''
1059
  Use a neural network model to partition 'search addresses' into constituent parts in the format of UK Ordnance Survey Land Property Identifier (LPI) addresses. These address components are compared individually against reference addresses in the same format to give an overall match score using the recordlinkage package.
1060
  '''