Spaces:
Sleeping
Sleeping
Add de-identification options
Browse files
- Final_file.py +124 -16
- app.py +22 -2
- requirements.txt +0 -0
Final_file.py
CHANGED
|
@@ -347,7 +347,7 @@ val_dataset = (
|
|
| 347 |
.padded_batch(batch_size)
|
| 348 |
)
|
| 349 |
|
| 350 |
-
ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
|
| 351 |
|
| 352 |
|
| 353 |
# In[15]:
|
|
@@ -367,14 +367,14 @@ class CustomNonPaddingTokenLoss(keras.losses.Loss):
|
|
| 367 |
return tf.reduce_sum(loss) / tf.reduce_sum(mask)
|
| 368 |
|
| 369 |
|
| 370 |
-
loss = CustomNonPaddingTokenLoss()
|
| 371 |
|
| 372 |
|
| 373 |
# In[16]:
|
| 374 |
|
| 375 |
|
| 376 |
-
ner_model.compile(optimizer="adam", loss=loss)
|
| 377 |
-
ner_model.fit(train_dataset, epochs=10)
|
| 378 |
|
| 379 |
|
| 380 |
def tokenize_and_convert_to_ids(text):
|
|
@@ -383,18 +383,18 @@ def tokenize_and_convert_to_ids(text):
|
|
| 383 |
|
| 384 |
|
| 385 |
# Sample inference using the trained model
|
| 386 |
-
sample_input = tokenize_and_convert_to_ids(
|
| 387 |
-
|
| 388 |
-
)
|
| 389 |
-
sample_input = tf.reshape(sample_input, shape=[1, -1])
|
| 390 |
-
print(sample_input)
|
| 391 |
|
| 392 |
-
output = ner_model.predict(sample_input)
|
| 393 |
-
prediction = np.argmax(output, axis=-1)[0]
|
| 394 |
-
prediction = [mapping[i] for i in prediction]
|
| 395 |
|
| 396 |
# eu -> B-ORG, german -> B-MISC, british -> B-MISC
|
| 397 |
-
print(prediction)
|
| 398 |
|
| 399 |
|
| 400 |
# In[17]:
|
|
@@ -426,7 +426,7 @@ def calculate_metrics(_dataset):
|
|
| 426 |
evaluate(real_tags, predicted_tags)
|
| 427 |
|
| 428 |
|
| 429 |
-
calculate_metrics(val_dataset)
|
| 430 |
|
| 431 |
|
| 432 |
# In[18]:
|
|
@@ -450,7 +450,7 @@ def test_model_with_input(_ner_model, mapping):
|
|
| 450 |
print("Predicted tags:", predicted_tags)
|
| 451 |
|
| 452 |
# Test the model with user input
|
| 453 |
-
test_model_with_input(ner_model, mapping)
|
| 454 |
|
| 455 |
|
| 456 |
# In[20]:
|
|
@@ -693,7 +693,115 @@ class FlairRecognizer(EntityRecognizer):
|
|
| 693 |
anonymized_text = "*" * len(entity_text)
|
| 694 |
anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
|
| 695 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 696 |
# print anonymized sentence
|
| 697 |
print("Anonymized sentence:")
|
| 698 |
print(anonymized_sentence)
|
| 699 |
-
return anonymized_sentence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
.padded_batch(batch_size)
|
| 348 |
)
|
| 349 |
|
| 350 |
+
# ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
|
| 351 |
|
| 352 |
|
| 353 |
# In[15]:
|
|
|
|
| 367 |
return tf.reduce_sum(loss) / tf.reduce_sum(mask)
|
| 368 |
|
| 369 |
|
| 370 |
+
# loss = CustomNonPaddingTokenLoss()
|
| 371 |
|
| 372 |
|
| 373 |
# In[16]:
|
| 374 |
|
| 375 |
|
| 376 |
+
# ner_model.compile(optimizer="adam", loss=loss)
|
| 377 |
+
# ner_model.fit(train_dataset, epochs=10)
|
| 378 |
|
| 379 |
|
| 380 |
def tokenize_and_convert_to_ids(text):
|
|
|
|
| 383 |
|
| 384 |
|
| 385 |
# Sample inference using the trained model
|
| 386 |
+
# sample_input = tokenize_and_convert_to_ids(
|
| 387 |
+
# "eu rejects german call to boycott british lamb"
|
| 388 |
+
# )
|
| 389 |
+
# sample_input = tf.reshape(sample_input, shape=[1, -1])
|
| 390 |
+
# print(sample_input)
|
| 391 |
|
| 392 |
+
# output = ner_model.predict(sample_input)
|
| 393 |
+
# prediction = np.argmax(output, axis=-1)[0]
|
| 394 |
+
# prediction = [mapping[i] for i in prediction]
|
| 395 |
|
| 396 |
# eu -> B-ORG, german -> B-MISC, british -> B-MISC
|
| 397 |
+
# print(prediction)
|
| 398 |
|
| 399 |
|
| 400 |
# In[17]:
|
|
|
|
| 426 |
evaluate(real_tags, predicted_tags)
|
| 427 |
|
| 428 |
|
| 429 |
+
# calculate_metrics(val_dataset)
|
| 430 |
|
| 431 |
|
| 432 |
# In[18]:
|
|
|
|
| 450 |
print("Predicted tags:", predicted_tags)
|
| 451 |
|
| 452 |
# Test the model with user input
|
| 453 |
+
# test_model_with_input(ner_model, mapping)
|
| 454 |
|
| 455 |
|
| 456 |
# In[20]:
|
|
|
|
| 693 |
anonymized_text = "*" * len(entity_text)
|
| 694 |
anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
|
| 695 |
|
| 696 |
+
# remove the part that includes named entity annotations
|
| 697 |
+
anonymized_sentence = anonymized_sentence.split("→")[0].strip()
|
| 698 |
+
anonymized_sentence = anonymized_sentence.split(":")[1].strip()
|
| 699 |
+
|
| 700 |
+
a = anonymize(input_text, "", anonymized_sentence)
|
| 701 |
+
print("a sentence:")
|
| 702 |
+
print(a)
|
| 703 |
+
|
| 704 |
# print anonymized sentence
|
| 705 |
print("Anonymized sentence:")
|
| 706 |
print(anonymized_sentence)
|
| 707 |
+
return anonymized_sentence
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
from presidio_anonymizer import AnonymizerEngine
|
| 719 |
+
from presidio_analyzer import AnalyzerEngine
|
| 720 |
+
from presidio_anonymizer.entities import (
|
| 721 |
+
OperatorConfig,
|
| 722 |
+
RecognizerResult,
|
| 723 |
+
EngineResult,
|
| 724 |
+
ConflictResolutionStrategy,
|
| 725 |
+
)
|
| 726 |
+
from typing import List, Dict, Optional, Type
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
class FlairRecognizer2():
|
| 730 |
+
|
| 731 |
+
|
| 732 |
+
def anonymize(
|
| 733 |
+
text: str,
|
| 734 |
+
operator: str,
|
| 735 |
+
# analyze_results: List[RecognizerResult],
|
| 736 |
+
mask_char: Optional[str] = None,
|
| 737 |
+
number_of_chars: Optional[str] = None,
|
| 738 |
+
encrypt_key: Optional[str] = None,
|
| 739 |
+
):
|
| 740 |
+
"""Anonymize identified input using Presidio Anonymizer.
|
| 741 |
+
:param text: Full text
|
| 742 |
+
:param operator: Operator name
|
| 743 |
+
:param mask_char: Mask char (for mask operator)
|
| 744 |
+
:param number_of_chars: Number of characters to mask (for mask operator)
|
| 745 |
+
:param encrypt_key: Encryption key (for encrypt operator)
|
| 746 |
+
:param analyze_results: not accepted by this signature (commented out above); the analyzer results are computed inside the function
|
| 747 |
+
"""
|
| 748 |
+
|
| 749 |
+
if operator == "mask":
|
| 750 |
+
operator_config = {
|
| 751 |
+
"type": "mask",
|
| 752 |
+
"masking_char": mask_char,
|
| 753 |
+
"chars_to_mask": number_of_chars,
|
| 754 |
+
"from_end": False,
|
| 755 |
+
}
|
| 756 |
+
|
| 757 |
+
# Define operator config
|
| 758 |
+
elif operator == "encrypt":
|
| 759 |
+
operator_config = {"key": encrypt_key}
|
| 760 |
+
elif operator == "highlight":
|
| 761 |
+
operator_config = {"lambda": lambda x: x}
|
| 762 |
+
else:
|
| 763 |
+
operator_config = None
|
| 764 |
+
|
| 765 |
+
# Change operator if needed as intermediate step
|
| 766 |
+
if operator == "highlight":
|
| 767 |
+
operator = "custom"
|
| 768 |
+
elif operator == "synthesize":
|
| 769 |
+
operator = "replace"
|
| 770 |
+
else:
|
| 771 |
+
operator = operator
|
| 772 |
+
|
| 773 |
+
# res = AnonymizerEngine().anonymize(
|
| 774 |
+
# text,
|
| 775 |
+
# analyze_results,
|
| 776 |
+
# operators={"DEFAULT": OperatorConfig("redact", operator_config)},
|
| 777 |
+
# )
|
| 778 |
+
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
analyzer = AnalyzerEngine()
|
| 782 |
+
|
| 783 |
+
results = analyzer.analyze(text=text, entities=['PHONE_NUMBER', 'PERSON', 'ID', 'LOCATION'], language='en') # noqa D501
|
| 784 |
+
print("results:")
|
| 785 |
+
print(results)
|
| 786 |
+
|
| 787 |
+
engine = AnonymizerEngine()
|
| 788 |
+
|
| 789 |
+
# Invoke the anonymize function with the text, analyzer results and
|
| 790 |
+
# Operators to define the anonymization type.
|
| 791 |
+
result = engine.anonymize(
|
| 792 |
+
text=text,
|
| 793 |
+
analyzer_results=results,
|
| 794 |
+
operators={"DEFAULT": OperatorConfig(operator, {"new_value": "BIP"})}
|
| 795 |
+
)
|
| 796 |
+
print("res:")
|
| 797 |
+
print(result)
|
| 798 |
+
print(result.text)
|
| 799 |
+
print(type(result.text))
|
| 800 |
+
|
| 801 |
+
|
| 802 |
+
return result.text
|
| 803 |
+
|
| 804 |
+
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
|
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from Final_file import FlairRecognizer
|
|
|
|
| 3 |
import os
|
| 4 |
import PyPDF2
|
| 5 |
import docx
|
|
@@ -15,9 +16,13 @@ def cached_predict_ner_tags(text):
|
|
| 15 |
|
| 16 |
# Cache the text analysis function
|
| 17 |
@st.cache_resource
|
| 18 |
-
def cached_analyze_text(text):
|
| 19 |
return FlairRecognizer.analyze_text(text)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def download_masked_file(masked_text, file_extension):
|
| 22 |
|
| 23 |
# Create a temporary file to store the masked text
|
|
@@ -66,6 +71,21 @@ def main():
|
|
| 66 |
st.sidebar.header('Upload Options')
|
| 67 |
upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# # Dropdown menu with four choices
|
| 70 |
# st.sidebar.header('Masking Options')
|
| 71 |
# choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
|
|
@@ -75,7 +95,7 @@ def main():
|
|
| 75 |
if st.button('Analyze'):
|
| 76 |
with st.spinner('Wait for it... the model is loading'):
|
| 77 |
cached_predict_ner_tags(input_text)
|
| 78 |
-
masked_text =
|
| 79 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 80 |
elif upload_option == 'File Upload':
|
| 81 |
uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from Final_file import FlairRecognizer
|
| 3 |
+
from Final_file import FlairRecognizer2
|
| 4 |
import os
|
| 5 |
import PyPDF2
|
| 6 |
import docx
|
|
|
|
| 16 |
|
| 17 |
# Cache the text analysis function
|
| 18 |
@st.cache_resource
|
| 19 |
+
def cached_analyze_text(text, operator):
|
| 20 |
return FlairRecognizer.analyze_text(text)
|
| 21 |
|
| 22 |
+
@st.cache_resource
|
| 23 |
+
def cached_anonimize_text(text, operator):
|
| 24 |
+
return FlairRecognizer2.anonymize(text, operator)
|
| 25 |
+
|
| 26 |
def download_masked_file(masked_text, file_extension):
|
| 27 |
|
| 28 |
# Create a temporary file to store the masked text
|
|
|
|
| 71 |
st.sidebar.header('Upload Options')
|
| 72 |
upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
|
| 73 |
|
| 74 |
+
st_operator = st.sidebar.selectbox(
|
| 75 |
+
"De-identification approach",
|
| 76 |
+
["redact", "replace", "hash"],
|
| 77 |
+
index=1,
|
| 78 |
+
help="""
|
| 79 |
+
Select which manipulation to the text is requested after PII has been identified.\n
|
| 80 |
+
- Redact: Completely remove the PII text\n
|
| 81 |
+
- Replace: Replace the PII text with a constant, e.g. <PERSON>\n
|
| 82 |
+
- Synthesize: Replace with fake values (requires an OpenAI key)\n
|
| 83 |
+
- Highlight: Shows the original text with PII highlighted in colors\n
|
| 84 |
+
- Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
|
| 85 |
+
- Hash: Replaces with the hash of the PII string\n
|
| 86 |
+
- Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
|
| 87 |
+
""",
|
| 88 |
+
)
|
| 89 |
# # Dropdown menu with four choices
|
| 90 |
# st.sidebar.header('Masking Options')
|
| 91 |
# choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
|
|
|
|
| 95 |
if st.button('Analyze'):
|
| 96 |
with st.spinner('Wait for it... the model is loading'):
|
| 97 |
cached_predict_ner_tags(input_text)
|
| 98 |
+
masked_text = cached_anonimize_text(input_text, st_operator)
|
| 99 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 100 |
elif upload_option == 'File Upload':
|
| 101 |
uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|