Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / tools /load_spacy_model_custom_recognisers.py

Major update. General code revision. Improved config variables. The dataframe-based review frame now includes text, and items can be searched and excluded. Costs are now estimated. Added an option for entering cost codes. Added an option to extract text only.

0ea8b9e 4 months ago

raw

history blame contribute delete

13.7 kB

	from typing import List
	from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
	from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
	import spacy
	from spacy.matcher import Matcher, PhraseMatcher
	from spaczz.matcher import FuzzyMatcher
	spacy.prefer_gpu()
	from spacy.cli.download import download
	import Levenshtein
	import re
	import gradio as gr

	# Name of the spaCy pipeline to load. Alternatives: "en_core_web_sm", "en_core_web_trf"
	model_name = "en_core_web_lg"
	# Passed to AnalyzerEngine as its default score threshold
	score_threshold = 0.001
	custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]

	# Load the spaCy model named by `model_name`, so that changing the variable
	# above is enough to switch models. If it cannot be loaded, download it and retry.
	try:
	    nlp = spacy.load(model_name)
	    print("Successfully imported spaCy model")

	except Exception as load_error:
	    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
	    # SystemExit are not swallowed; any load failure still falls back to a download.
	    print("Could not load spaCy model", model_name, "- attempting download:", load_error)
	    download(model_name)
	    nlp = spacy.load(model_name)
	    print("Successfully downloaded and imported spaCy model", model_name)

	# #### Custom recognisers
	def custom_word_list_recogniser(custom_list:List[str]=[]):
	    """
	    Build a Presidio PatternRecognizer for the "CUSTOM" entity from a list of terms.

	    Each term is stripped, regex-escaped and matched case-insensitively as a
	    whole unit (it must not be directly preceded or followed by a word character).
	    Blank terms are ignored and duplicates are collapsed.

	    Args:
	        custom_list: Terms to recognise. An empty list gives a recogniser with an
	            empty regex.

	    Returns:
	        A PatternRecognizer named "CUSTOM" with a single pattern of score 1.
	    """
	    # A straight double quote in a term also matches typographic double quotes.
	    # NOTE(review): assumes the curly variants were the intended alternatives - confirm.
	    quote_str = '"'
	    replace_str = '["\u201c\u201d]'

	    # Drop blank terms (they would add an empty alternative) and duplicates
	    unique_terms = dict.fromkeys(term.strip() for term in custom_list if term and term.strip())

	    # Regex alternation is ordered: put longer terms first so that e.g. "John Smith"
	    # is not shadowed by a shorter term "John" starting at the same position.
	    ordered_terms = sorted(unique_terms, key=len, reverse=True)

	    custom_regex = '|'.join(
	        rf'(?<!\w){re.escape(term).replace(quote_str, replace_str)}(?!\w)'
	        for term in ordered_terms
	    )

	    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)

	    custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
	        global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

	    return custom_recogniser

	# Placeholder CUSTOM recogniser built from an empty term list; it will be overwritten later
	custom_recogniser = custom_word_list_recogniser(custom_list=[])

	# Custom title recogniser
	titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
	# Alternation is ordered, so longer titles go first ("Mr." before "Mr").
	# The pattern closes with (?!\w) instead of \b: a \b after a trailing "." only
	# matches when a word character follows, so "Mr. Smith" could never match "Mr.".
	titles_regex = (
	    r'\b(?:'
	    + '|'.join(re.escape(title) for title in sorted(titles_list, key=len, reverse=True))
	    + r')(?!\w)'
	)
	titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
	# No IGNORECASE here: titles are matched exactly as capitalised in titles_list
	titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES", patterns = [titles_pattern],
	    global_regex_flags=re.DOTALL | re.MULTILINE)

	# %%
	# Custom postcode recogniser

	# Define the regex pattern in a Presidio `Pattern` object.
	# First alternative: 1-2 letters, a digit, an optional letter/digit, an optional
	# space, then a digit and two letters. Second alternative: the literal "GIR 0AA"
	# (space optional). The alternatives are separated by an unescaped "|".
	ukpostcode_pattern = Pattern(
	    name="ukpostcode_pattern",
	    regex=r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b",
	    score=1
	)

	# Define the recognizer with one or more patterns
	ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])

	### Street name

	def extract_street_name(text:str) -> tuple:
	    """
	    Locate street names that follow a word containing at least one digit
	    (for example a house number) in the given text.

	    A match is: a word containing a digit, optional whitespace, one further
	    word, optional whitespace, then one of the known street types as a whole
	    word. Matching is case-insensitive.

	    Args:
	        text: Text to search. Empty or None text yields no matches.

	    Returns:
	        (start_positions, end_positions): two parallel lists holding the start
	        and end character offsets of each match, covering the preceding word
	        through the street type.
	    """

	    if not text:
	        return [], []

	    street_types = [
	        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
	        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
	        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
	        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
	        'Alley', 'Arcade', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner',
	        'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
	        'Mews', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
	        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
	    ]

	    # Dynamically construct the regex alternation with all possible street types
	    street_types_pattern = '|'.join(re.escape(street_type) for street_type in street_types)

	    # Preceding word: any word with at least one digit in it ("1", "12", "12a", "B4").
	    pattern = r'(?P<preceding_word>\w*\d\w*)\s*'
	    # Street name: one word followed by a street type matched as a whole word.
	    pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'

	    start_positions = []
	    end_positions = []

	    # Record the character span of every match in the text
	    for match in re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE):
	        start_positions.append(match.start())
	        end_positions.append(match.end())

	    return start_positions, end_positions

	class StreetNameRecognizer(EntityRecognizer):
	    """Recogniser that reports the spans found by `extract_street_name` as STREETNAME results."""

	    def load(self) -> None:
	        """No loading is required."""
	        pass

	    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
	        """
	        Return one STREETNAME result (score 1) per street-name span found in `text`.

	        `entities` and `nlp_artifacts` are accepted but not used by this recogniser.
	        """
	        span_starts, span_ends = extract_street_name(text)

	        # The two lists are parallel: pair each start offset with its end offset
	        return [
	            RecognizerResult(
	                entity_type="STREETNAME",
	                start=span_start,
	                end=span_end,
	                score=1,
	            )
	            for span_start, span_end in zip(span_starts, span_ends)
	        ]

	# Module-level street name recogniser instance
	street_recogniser = StreetNameRecognizer(
	    supported_entities=["STREETNAME"],
	)

	## Custom fuzzy match recogniser for list of strings
	def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
	    """
	    Find every occurrence of the given terms in the text using a regex.

	    Each term is stripped, regex-escaped and matched case-insensitively as a
	    whole unit (it must not be directly preceded or followed by a word character).
	    Blank terms are ignored and duplicates are collapsed.

	    Args:
	        text: Text to search.
	        custom_list: Terms to look for.

	    Returns:
	        (start_positions, end_positions): two parallel lists of character
	        offsets, one entry per match. Both are empty when there is no text or
	        no usable term.
	    """
	    # A straight double quote in a term also matches typographic double quotes.
	    # NOTE(review): assumes the curly variants were the intended alternatives - confirm.
	    quote_str = '"'
	    replace_str = '["\u201c\u201d]'

	    # Drop blank terms and duplicates
	    unique_terms = dict.fromkeys(term.strip() for term in custom_list if term and term.strip())

	    # An empty pattern would match the empty string at every position in the text
	    if not text or not unique_terms:
	        return [], []

	    # Regex alternation is ordered: put longer terms first so that e.g. "John Smith"
	    # is not shadowed by a shorter term "John" starting at the same position.
	    ordered_terms = sorted(unique_terms, key=len, reverse=True)

	    custom_regex_pattern = '|'.join(
	        rf'(?<!\w){re.escape(term).replace(quote_str, replace_str)}(?!\w)'
	        for term in ordered_terms
	    )

	    start_positions = []
	    end_positions = []

	    # Record the character span of every match in the text
	    for match in re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE):
	        start_positions.append(match.start())
	        end_positions.append(match.end())

	    return start_positions, end_positions

	def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
	    '''
	    Conduct a fuzzy search for each query string in a piece of text.

	    Args:
	        text: Text to search. If empty, a message is printed and no matches are returned.
	        custom_query_list: Query strings to look for. Blank queries are skipped.
	        spelling_mistakes_max: Maximum number of spelling mistakes allowed. In
	            token mode it selects the spaCy "FUZZY<n>" operator; in whole-phrase
	            mode it is the maximum Levenshtein distance between query and match.
	        search_whole_phrase: If True, match each query as a whole phrase with the
	            spaczz FuzzyMatcher. If False, match the query's individual tokens
	            (ignoring stop words, punctuation and whitespace) with the spaCy Matcher.
	        nlp: Loaded spaCy pipeline used to parse the text and the queries.
	        progress: Progress tracker; accepted but not used inside this function.

	    Returns:
	        (all_start_positions, all_end_positions): two parallel lists of character
	        offsets into `text`, one entry per accepted match.
	    '''

	    all_start_positions = []
	    all_end_positions = []

	    if not text:
	        out_message = "No text data found. Skipping page."
	        print(out_message)
	        return all_start_positions, all_end_positions

	    if not custom_query_list:
	        return all_start_positions, all_end_positions

	    # Keeps the original `== False` comparison, so a value such as None still
	    # selects whole-phrase matching.
	    token_level_search = search_whole_phrase == False  # noqa: E712

	    # The parsed text does not depend on the query, so parse it once up front
	    doc = nlp(text)

	    for string_query in custom_query_list:

	        # Nothing to search for
	        if not string_query or not string_query.strip():
	            continue

	        query = nlp(string_query)

	        if token_level_search:
	            # Keep only words that are not stop words
	            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]

	            # A query made only of stop words/punctuation leaves nothing to match
	            # (indexing token_query[0] below would raise IndexError)
	            if not token_query:
	                continue

	            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)

	            if len(token_query) > 1:
	                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
	            else:
	                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

	            matcher = Matcher(nlp.vocab)
	            matcher.add(string_query, [pattern_fuzz])

	            for match_id, start, end in matcher(doc):
	                # Matches are token indices; convert them to character positions
	                all_start_positions.append(doc[start].idx)
	                all_end_positions.append(doc[end - 1].idx + len(doc[end - 1]))

	        else:
	            # Whole phrase: fuzzy-match with spaczz, then keep only the matches
	            # whose Levenshtein distance to the query is within the allowed maximum.
	            matcher = FuzzyMatcher(nlp.vocab)
	            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
	            matcher.add("PHRASE", patterns, [{"ignore_case": True}])

	            query_search = str(query).strip().lower()

	            for match_id, start, end, ratio, pattern in matcher(doc):
	                span = str(doc[start:end]).strip().lower()

	                if Levenshtein.distance(query_search, span) > spelling_mistakes_max:
	                    continue

	                # Matches are token indices; convert them to character positions
	                all_start_positions.append(doc[start].idx)
	                all_end_positions.append(doc[end - 1].idx + len(doc[end - 1]))

	    return all_start_positions, all_end_positions


	class CustomWordFuzzyRecognizer(EntityRecognizer):
	    """
	    Recogniser that reports fuzzy matches of a custom term list as CUSTOM_FUZZY results.

	    Matching is delegated to `spacy_fuzzy_search`.
	    """

	    def __init__(self, supported_entities: List[str], custom_list: List[str] = None, spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
	        """
	        Args:
	            supported_entities: Entity names passed to the EntityRecognizer base class.
	            custom_list: Terms to search for. Defaults to a new empty list.
	            spelling_mistakes_max: Maximum number of spelling mistakes allowed in a match.
	            search_whole_phrase: Whether each term is matched as a whole phrase
	                rather than token by token.
	        """
	        super().__init__(supported_entities=supported_entities)
	        # Use a fresh list per instance when none is given: a mutable default
	        # argument would be one list object shared by all such instances.
	        self.custom_list = custom_list if custom_list is not None else []
	        self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
	        self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag

	    def load(self) -> None:
	        """No loading is required."""
	        pass

	    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
	        """
	        Return one CUSTOM_FUZZY result (score 1) per fuzzy match found in `text`.

	        `entities` and `nlp_artifacts` are accepted but not used by this recogniser.
	        """
	        start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase)

	        # The two lists are parallel: pair each start offset with its end offset
	        return [
	            RecognizerResult(
	                entity_type="CUSTOM_FUZZY",
	                start=match_start,
	                end=match_end,
	                score=1,
	            )
	            for match_start, match_end in zip(start_pos, end_pos)
	        ]

	# Module-level fuzzy recogniser, created with an empty default term list
	custom_list_default = list()
	custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
	    supported_entities=["CUSTOM_FUZZY"],
	    custom_list=custom_list_default,
	)

	# SpacyNlpEngine subclass that wraps a spaCy model which has already been loaded
	class LoadedSpacyNlpEngine(SpacyNlpEngine):
	    """NLP engine whose English ("en") pipeline is the spaCy model handed to the constructor."""

	    def __init__(self, loaded_spacy_model):
	        super().__init__()
	        # Set after the base initialiser so this assignment is the one that sticks
	        self.nlp = dict(en=loaded_spacy_model)

	# Wrap the loaded spaCy model in the LoadedSpacyNlpEngine defined above
	loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)

	# Analyser that runs on the wrapped engine, English only
	nlp_analyser = AnalyzerEngine(
	    nlp_engine=loaded_nlp_engine,
	    default_score_threshold=score_threshold,
	    supported_languages=["en"],
	    log_decision_process=False,
	)

	# Register the custom recognisers with the analyser's registry, in this order
	for _custom_recogniser in (
	    street_recogniser,
	    ukpostcode_recogniser,
	    titles_recogniser,
	    custom_recogniser,
	    custom_word_fuzzy_recognizer,
	):
	    nlp_analyser.registry.add_recognizer(_custom_recogniser)
	del _custom_recogniser  # do not leave the loop variable in the module namespace