from typing import List, Tuple

import re

import gradio as gr
import Levenshtein
import spacy
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
from spacy.matcher import Matcher, PhraseMatcher
from spacy.cli.download import download
from spaczz.matcher import FuzzyMatcher

# Prefer GPU execution for spaCy if one is available
spacy.prefer_gpu()
# Model and scoring configuration
model_name = "en_core_web_lg"  # alternatives: "en_core_web_sm", "en_core_web_trf"
score_threshold = 0.001
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
# Load the spaCy model, downloading it first if it is not already installed
try:
    import en_core_web_lg
    nlp = en_core_web_lg.load()
    print("Successfully imported spaCy model")
except Exception:
    download(model_name)
    nlp = spacy.load(model_name)
    print("Successfully downloaded and imported spaCy model", model_name)
# #### Custom recognisers

def custom_word_list_recogniser(custom_list: List[str] = []):
    '''Build a Presidio PatternRecognizer that matches terms from a custom word list.'''
    # Create the regex pattern, handling quotes carefully: straight quotes in
    # search terms should also match curly quotes in the text
    quote_str = '"'
    replace_str = '(?:"|“|”)'

    custom_regex = '|'.join(
        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
        for term in custom_list
    )
    #print(custom_regex)

    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)

    custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns=[custom_pattern],
                                          global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

    return custom_recogniser

# Initialise a custom recogniser that will be overwritten later
custom_recogniser = custom_word_list_recogniser()
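# Illustrative usage (assumed terms; a sketch rather than executed module code -
# matching is case-insensitive because of the IGNORECASE flag above):
#   rec = custom_word_list_recogniser(["John Smith", "Acme Ltd"])
#   rec.analyze("Email from john smith at ACME LTD", entities=["CUSTOM"])
#   -> two CUSTOM RecognizerResult objects, one per matched term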
# Custom title recogniser
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
titles_regex = r'\b' + r'\b|\b'.join(rf"{re.escape(title)}" for title in titles_list) + r'\b'
titles_pattern = Pattern(name="titles_pattern", regex=titles_regex, score=1)
titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES", patterns=[titles_pattern],
                                      global_regex_flags=re.DOTALL | re.MULTILINE)
# Custom UK postcode recogniser
# Define the regex pattern in a Presidio `Pattern` object:
ukpostcode_pattern = Pattern(
    name="ukpostcode_pattern",
    regex=r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b",
    score=1
)

# Define the recognizer with one or more patterns
ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name="UKPOSTCODE", patterns=[ukpostcode_pattern])
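# Illustrative match (assumed sample text; the pattern expects upper-case
# postcodes, with "GIR 0AA" as a special case):
#   ukpostcode_recogniser.analyze("Send it to SW1A 1AA please", entities=["UKPOSTCODE"])
#   -> one UKPOSTCODE result covering "SW1A 1AA"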
### Street name

def extract_street_name(text: str) -> Tuple[List[int], List[int]]:
    """
    Extracts the start and end positions of street names, together with the
    preceding word (which should contain at least one number), from the given text.
    """

    street_types = [
        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
        'Alley', 'Arcade', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner',
        'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
        'Mews', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
    ]

    # Dynamically construct the regex pattern with all possible street types
    street_types_pattern = '|'.join(re.escape(street_type) for street_type in street_types)

    # The overall regex pattern captures a preceding word containing a digit,
    # followed by the street name itself
    pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
    pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'

    # Find all matches in the text
    matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)

    start_positions = []
    end_positions = []

    for match in matches:
        start_positions.append(match.start())
        end_positions.append(match.end())

    return start_positions, end_positions
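# Illustrative check (assumed sample text; offsets depend on the exact input):
#   starts, ends = extract_street_name("He lives at 123 Ocean Drive, Miami.")
#   -> one match spanning "123 Ocean Drive"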
class StreetNameRecognizer(EntityRecognizer):

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """
        Logic for detecting street names as a specific type of PII
        """

        start_pos, end_pos = extract_street_name(text)

        results = []

        for start, end in zip(start_pos, end_pos):
            result = RecognizerResult(
                entity_type="STREETNAME",
                start=start,
                end=end,
                score=1
            )
            results.append(result)

        return results

street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
## Custom fuzzy match recogniser for list of strings

def custom_fuzzy_word_list_regex(text: str, custom_list: List[str] = []):
    '''Return start and end positions of exact (non-fuzzy) matches for a custom word list.'''
    # Create the regex pattern, handling quotes carefully: straight quotes in
    # search terms should also match curly quotes in the text
    quote_str = '"'
    replace_str = '(?:"|“|”)'

    custom_regex_pattern = '|'.join(
        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
        for term in custom_list
    )

    # Find all matches in the text
    matches = re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)

    start_positions = []
    end_positions = []

    for match in matches:
        start_positions.append(match.start())
        end_positions.append(match.end())

    return start_positions, end_positions
def spacy_fuzzy_search(text: str, custom_query_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
    '''Conduct a fuzzy match for each query term within the given text, returning character start and end positions of matches.'''

    all_matches = []
    all_start_positions = []
    all_end_positions = []
    all_ratios = []

    if not text:
        out_message = "No text data found. Skipping page."
        print(out_message)
        return all_start_positions, all_end_positions

    for string_query in custom_query_list:

        query = nlp(string_query)

        if not search_whole_phrase:
            # Keep only tokens that are not spaces, stop words, or punctuation
            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]

            # Skip queries that consist entirely of stop words or punctuation
            if not token_query:
                continue

            # spaCy's Matcher supports fuzzy matching via the FUZZY{n} predicate
            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)

            if len(token_query) > 1:
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
            else:
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

            matcher = Matcher(nlp.vocab)
            matcher.add(string_query, [pattern_fuzz])

        else:
            # If matching a whole phrase, use the spaczz FuzzyMatcher, then filter
            # candidate matches afterwards using Levenshtein distance
            matcher = FuzzyMatcher(nlp.vocab)
            patterns = [nlp.make_doc(string_query)]  # Convert the query into a Doc object
            matcher.add("PHRASE", patterns, [{"ignore_case": True}])

        batch_size = 256
        docs = nlp.pipe([text], batch_size=batch_size)

        # Get the matches for each doc
        for doc in docs:
            matches = matcher(doc)
            match_count = len(matches)

            # If considering each sub-term individually, append every match. If considering
            # the whole phrase, weigh matches by their closeness to the full phrase.
            if not search_whole_phrase:
                for match_id, start, end in matches:
                    # Convert token positions to character positions
                    start_char = doc[start].idx
                    end_char = doc[end - 1].idx + len(doc[end - 1])

                    all_matches.append(match_count)
                    all_start_positions.append(start_char)
                    all_end_positions.append(end_char)

            else:
                for match_id, start, end, ratio, pattern in matches:
                    span = str(doc[start:end]).strip()
                    query_search = str(query).strip()

                    # Calculate the Levenshtein distance, and only keep matches
                    # within the specified number of spelling mistakes
                    distance = Levenshtein.distance(query_search.lower(), span.lower())

                    if distance > spelling_mistakes_max:
                        match_count = match_count - 1
                    else:
                        # Convert token positions to character positions
                        start_char = doc[start].idx
                        end_char = doc[end - 1].idx + len(doc[end - 1])

                        all_matches.append(match_count)
                        all_start_positions.append(start_char)
                        all_end_positions.append(end_char)
                        all_ratios.append(ratio)

    return all_start_positions, all_end_positions
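# Illustrative call (assumed sample text; with the default of one permitted
# spelling mistake, "Jon Smith" is close enough to the query "John Smith"):
#   starts, ends = spacy_fuzzy_search("Contact Jon Smith tomorrow", ["John Smith"])
#   -> character offsets of the fuzzy phrase match "Jon Smith"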
class CustomWordFuzzyRecognizer(EntityRecognizer):
    def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
        super().__init__(supported_entities=supported_entities)
        self.custom_list = custom_list  # Store the custom_list as an instance attribute
        self.spelling_mistakes_max = spelling_mistakes_max  # Store the max spelling mistakes
        self.search_whole_phrase = search_whole_phrase  # Store the search whole phrase flag

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """
        Logic for detecting a specific PII
        """

        start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase)

        results = []

        for start, end in zip(start_pos, end_pos):
            result = RecognizerResult(
                entity_type="CUSTOM_FUZZY",
                start=start,
                end=end,
                score=1
            )
            results.append(result)

        return results

custom_list_default = []
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
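# Illustrative construction with a populated list (assumed terms; in practice the
# default recogniser above is replaced at runtime with the user's own list):
#   fuzzy_rec = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"],
#                                         custom_list=["John Smith"],
#                                         spelling_mistakes_max=2)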
# Create a class inheriting from SpacyNlpEngine so that an already-loaded
# spaCy model can be passed to Presidio
class LoadedSpacyNlpEngine(SpacyNlpEngine):
    def __init__(self, loaded_spacy_model):
        super().__init__()
        self.nlp = {"en": loaded_spacy_model}

# Pass the loaded model to the new LoadedSpacyNlpEngine
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
                              default_score_threshold=score_threshold,
                              supported_languages=["en"],
                              log_decision_process=False,
                              )

# Add the custom recognisers to nlp_analyser
nlp_analyser.registry.add_recognizer(street_recogniser)
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
nlp_analyser.registry.add_recognizer(titles_recogniser)
nlp_analyser.registry.add_recognizer(custom_recogniser)
nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
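
# Minimal usage sketch (the sample text and entity list below are assumptions
# for illustration, not part of the wider application):
if __name__ == "__main__":
    sample = "Dr Jane Doe lives at 12 Mill Lane, London SW1A 1AA."
    results = nlp_analyser.analyze(
        text=sample,
        language="en",
        entities=["PERSON", "TITLES", "STREETNAME", "UKPOSTCODE"],
        score_threshold=score_threshold,
    )
    for res in results:
        print(res.entity_type, repr(sample[res.start:res.end]), res.score)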