Spaces:

KarishmaShirsath
/

PIIMasking

Sleeping

App Files Files Community

KarishmaShirsath committed on Apr 10, 2024

Commit

da8438c

verified ·

1 Parent(s): ab73abb

Add de-identification options

Browse files

Files changed (3) hide show

Final_file.py +124 -16
app.py +22 -2
requirements.txt +0 -0

Final_file.py CHANGED Viewed

@@ -347,7 +347,7 @@ val_dataset = (
     .padded_batch(batch_size)
 )
-ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
 # In[15]:
@@ -367,14 +367,14 @@ class CustomNonPaddingTokenLoss(keras.losses.Loss):
         return tf.reduce_sum(loss) / tf.reduce_sum(mask)
-loss = CustomNonPaddingTokenLoss()
 # In[16]:
-ner_model.compile(optimizer="adam", loss=loss)
-ner_model.fit(train_dataset, epochs=10)
 def tokenize_and_convert_to_ids(text):
@@ -383,18 +383,18 @@ def tokenize_and_convert_to_ids(text):
 # Sample inference using the trained model
-sample_input = tokenize_and_convert_to_ids(
-    "eu rejects german call to boycott british lamb"
-)
-sample_input = tf.reshape(sample_input, shape=[1, -1])
-print(sample_input)
-output = ner_model.predict(sample_input)
-prediction = np.argmax(output, axis=-1)[0]
-prediction = [mapping[i] for i in prediction]
 # eu -> B-ORG, german -> B-MISC, british -> B-MISC
-print(prediction)
 # In[17]:
@@ -426,7 +426,7 @@ def calculate_metrics(_dataset):
     evaluate(real_tags, predicted_tags)
-calculate_metrics(val_dataset)
 # In[18]:
@@ -450,7 +450,7 @@ def test_model_with_input(_ner_model, mapping):
     print("Predicted tags:", predicted_tags)
 # Test the model with user input
-test_model_with_input(ner_model, mapping)
 # In[20]:
@@ -693,7 +693,115 @@ class FlairRecognizer(EntityRecognizer):
             anonymized_text = "*" * len(entity_text)
             anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
         # print anonymized sentence
         print("Anonymized sentence:")
         print(anonymized_sentence)
-        return anonymized_sentence

     .padded_batch(batch_size)
 )
+# ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
 # In[15]:
         return tf.reduce_sum(loss) / tf.reduce_sum(mask)
+# loss = CustomNonPaddingTokenLoss()
 # In[16]:
+# ner_model.compile(optimizer="adam", loss=loss)
+# ner_model.fit(train_dataset, epochs=10)
 def tokenize_and_convert_to_ids(text):
 # Sample inference using the trained model
+# sample_input = tokenize_and_convert_to_ids(
+#     "eu rejects german call to boycott british lamb"
+# )
+# sample_input = tf.reshape(sample_input, shape=[1, -1])
+# print(sample_input)
+# output = ner_model.predict(sample_input)
+# prediction = np.argmax(output, axis=-1)[0]
+# prediction = [mapping[i] for i in prediction]
 # eu -> B-ORG, german -> B-MISC, british -> B-MISC
+# print(prediction)
 # In[17]:
     evaluate(real_tags, predicted_tags)
+# calculate_metrics(val_dataset)
 # In[18]:
     print("Predicted tags:", predicted_tags)
 # Test the model with user input
+# test_model_with_input(ner_model, mapping)
 # In[20]:
             anonymized_text = "*" * len(entity_text)
             anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
+        # remove the part that includes named entity annotations
+        anonymized_sentence = anonymized_sentence.split("→")[0].strip()
+        anonymized_sentence = anonymized_sentence.split(":")[1].strip()
+        a = anonymize(input_text, "", anonymized_sentence)
+        print("a sentence:")
+        print(a)
         # print anonymized sentence
         print("Anonymized sentence:")
         print(anonymized_sentence)
+        return anonymized_sentence
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer.entities import (
+    OperatorConfig,
+    RecognizerResult,
+    EngineResult,
+    ConflictResolutionStrategy,
+)
+from typing import List, Dict, Optional, Type
+class FlairRecognizer2:
+    """De-identification helper that runs Presidio's analyzer and anonymizer."""
+    @staticmethod
+    def anonymize(
+        text: str,
+        operator: str,
+        mask_char: Optional[str] = None,
+        number_of_chars: Optional[str] = None,
+        encrypt_key: Optional[str] = None,
+    ):
+        """Detect PII in ``text`` with Presidio and anonymize it.
+
+        :param text: Full text to analyze and anonymize
+        :param operator: Operator name; "highlight" is executed as the
+            "custom" operator and "synthesize" as the "replace" operator
+        :param mask_char: Mask char (for mask operator)
+        :param number_of_chars: Number of characters to mask (for mask operator)
+        :param encrypt_key: Encryption key (for encrypt operator)
+        :return: The anonymized text (``result.text`` of the anonymizer engine)
+        """
+        # Build the parameters that belong to the selected operator.
+        if operator == "mask":
+            operator_config = {
+                "type": "mask",
+                "masking_char": mask_char,
+                # The value may arrive as a string (see the annotation); the
+                # mask operator expects an integer count, so convert it here.
+                "chars_to_mask": (
+                    int(number_of_chars) if number_of_chars is not None else None
+                ),
+                "from_end": False,
+            }
+        elif operator == "encrypt":
+            operator_config = {"key": encrypt_key}
+        elif operator == "highlight":
+            # Identity function: the matched text is kept as it is.
+            operator_config = {"lambda": lambda x: x}
+        else:
+            # Every other operator is given a constant replacement value.
+            # A fresh dict is created per call so no state is shared.
+            operator_config = {"new_value": "BIP"}
+        # Translate app-level operator names into the names passed to Presidio.
+        if operator == "highlight":
+            operator = "custom"
+        elif operator == "synthesize":
+            operator = "replace"
+        analyzer = AnalyzerEngine()
+        # NOTE(review): "ID" does not look like a predefined Presidio entity
+        # name -- confirm a recognizer for it is registered.
+        results = analyzer.analyze(
+            text=text,
+            entities=['PHONE_NUMBER', 'PERSON', 'ID', 'LOCATION'],
+            language='en',
+        )
+        print("results:")
+        print(results)
+        engine = AnonymizerEngine()
+        # Invoke the anonymize function with the text, analyzer results and
+        # the operator config that defines the anonymization type.
+        result = engine.anonymize(
+            text=text,
+            analyzer_results=results,
+            operators={"DEFAULT": OperatorConfig(operator, operator_config)},
+        )
+        print("res:")
+        print(result)
+        print(result.text)
+        print(type(result.text))
+        return result.text

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 from Final_file import FlairRecognizer
 import os
 import PyPDF2
 import docx
@@ -15,9 +16,13 @@ def cached_predict_ner_tags(text):
 # Cache the text analysis function
 @st.cache_resource
-def cached_analyze_text(text):
     return FlairRecognizer.analyze_text(text)
 def download_masked_file(masked_text, file_extension):
     # Create a temporary file to store the masked text
@@ -66,6 +71,21 @@ def main():
     st.sidebar.header('Upload Options')
     upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
     # # Dropdown menu with four choices
     # st.sidebar.header('Masking Options')
     # choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
@@ -75,7 +95,7 @@ def main():
         if st.button('Analyze'):
             with st.spinner('Wait for it... the model is loading'):
                 cached_predict_ner_tags(input_text)
-                masked_text = cached_analyze_text(input_text)
             st.text_area("Masked text:", value=masked_text, height=200)
     elif upload_option == 'File Upload':
         uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])

 import streamlit as st
 from Final_file import FlairRecognizer
+from Final_file import FlairRecognizer2
 import os
 import PyPDF2
 import docx
 # Cache the text analysis function
 @st.cache_resource
+def cached_analyze_text(text, operator):
+    """Return the result of ``FlairRecognizer.analyze_text`` for ``text``.
+
+    ``operator`` is accepted but is not forwarded to ``analyze_text``.
+    """
+    analysis = FlairRecognizer.analyze_text(text)
+    return analysis
+@st.cache_resource
+def cached_anonimize_text(text, operator):
+    """Return ``text`` de-identified by ``FlairRecognizer2.anonymize``.
+
+    ``operator`` is passed through unchanged as the operator name.
+    """
+    deidentified = FlairRecognizer2.anonymize(text, operator)
+    return deidentified
 def download_masked_file(masked_text, file_extension):
     # Create a temporary file to store the masked text
     st.sidebar.header('Upload Options')
     upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
+    st_operator = st.sidebar.selectbox(
+        "De-identification approach",
+        ["redact", "replace", "hash"],
+        index=1,
+        help="""
+        Select which manipulation to the text is requested after PII has been identified.\n
+        - Redact: Completely remove the PII text\n
+        - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
+        - Synthesize: Replace with fake values (requires an OpenAI key)\n
+        - Highlight: Shows the original text with PII highlighted in colors\n
+        - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
+        - Hash: Replaces with the hash of the PII string\n
+        - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
+            """,
+    )
     # # Dropdown menu with four choices
     # st.sidebar.header('Masking Options')
     # choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
         if st.button('Analyze'):
             with st.spinner('Wait for it... the model is loading'):
                 cached_predict_ner_tags(input_text)
+                masked_text = cached_anonimize_text(input_text, st_operator)
             st.text_area("Masked text:", value=masked_text, height=200)
     elif upload_option == 'File Upload':
         uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ