import re
import uuid


def process_tokens(tokens, inverse_uuid_map=None, uuid_map=None, embeddings_model=None, fhe_ner_detection=None, client=None):
    """Process tokens for de-anonymization, anonymization, or pass-through, depending on which arguments are provided."""
    processed_tokens = []

    for token in tokens:
        # Pass whitespace and punctuation-only tokens through unchanged.
        if not token.strip() or not re.match(r"\w+", token):
            processed_tokens.append(token)
            continue

        if inverse_uuid_map is not None:
            # De-anonymization: map UUIDs back to their original tokens.
            processed_tokens.append(inverse_uuid_map.get(token, token))
        elif uuid_map is not None and embeddings_model is not None and fhe_ner_detection is not None and client is not None:
            # Anonymization: embed the token and predict whether it is sensitive.
            x = embeddings_model.wv[token][None]
            prediction_proba = fhe_ner_detection.predict_proba(x)
            probability = prediction_proba[0][1]

            if probability >= 0.5:
                # Sensitive token: replace it with a short UUID, reusing any existing mapping.
                tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
                processed_tokens.append(tmp_uuid)
                uuid_map[token] = tmp_uuid
            else:
                processed_tokens.append(token)
        else:
            # Default: leave the token unchanged.
            processed_tokens.append(token)

    return "".join(processed_tokens)
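

# Minimal usage sketch (illustration only). The tokenizer regex, the sample text,
# and the pre-built `uuid_map` below are assumptions for demonstration; they are
# not part of this module. Only the de-anonymization path is exercised, since it
# needs no embeddings model or FHE classifier.
if __name__ == "__main__":
    text = "Contact Alice at the office."
    # Keep separators as their own tokens so "".join() can rebuild the text.
    tokens = re.findall(r"\w+|\W+", text)

    # Pretend "Alice" was previously anonymized to this short UUID.
    uuid_map = {"Alice": "1a2b3c4d"}
    inverse_uuid_map = {v: k for k, v in uuid_map.items()}

    anonymized = "".join(uuid_map.get(t, t) for t in tokens)
    restored = process_tokens(re.findall(r"\w+|\W+", anonymized), inverse_uuid_map=inverse_uuid_map)

    print(anonymized)  # -> "Contact 1a2b3c4d at the office."
    print(restored)    # -> "Contact Alice at the office."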