Spaces:

langdonholmes
/

piilo

Running

App Files Files Community

langdonholmes committed on Feb 26, 2023

Commit

0c03168

1 Parent(s): 22bf201

Use nameparser

Browse files

Files changed (9) hide show

Pipfile +1 -0
Pipfile.lock +9 -1
anonymizer.py +0 -153
piilo/__init__.py +0 -0
analyzer.py → piilo/analyzer.py +0 -0
main.py → piilo/main.py +0 -0
{models → piilo/models}/anonymize.py +0 -0
names_database.py → piilo/names_database.py +0 -0
test_main.py → piilo/test_main.py +0 -0

Pipfile CHANGED Viewed

@@ -18,6 +18,7 @@ names-dataset = "*"
 fastapi = "*"
 httpx = "*"
 uvicorn = "*"
 [dev-packages]

 fastapi = "*"
 httpx = "*"
 uvicorn = "*"
+nameparser = "*"
 [dev-packages]

Pipfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "9cf8ce38b07b8e9be412869628fa94aa5e8444cfda715ed26c2dc73d547e2d9a"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -503,6 +503,14 @@
             "markers": "python_version >= '3.6'",
             "version": "==1.0.9"
         },
         "names-dataset": {
             "hashes": [
                 "sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"

 {
     "_meta": {
         "hash": {
+            "sha256": "256f4e2bde3a1d4e60d68e505515f71f0c4e11292e4c7d9d826e08aac629579b"
         },
         "pipfile-spec": 6,
         "requires": {
             "markers": "python_version >= '3.6'",
             "version": "==1.0.9"
         },
+        "nameparser": {
+            "hashes": [
+                "sha256:ea2e01d1d9d04c0648be230f161f27316a1b5be431a1cc64e8799fac548fb3bc",
+                "sha256:f4b6c7c1048d528bd6aa2b27cf42a06447d2b31e45a95b20449513078f1d86ef"
+            ],
+            "index": "pypi",
+            "version": "==1.1.2"
+        },
         "names-dataset": {
             "hashes": [
                 "sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"

anonymizer.py DELETED Viewed

@@ -1,153 +0,0 @@
-import logging
-from pathlib import Path
-from typing import List, Optional, Tuple
-import pandas as pd
-from presidio_analyzer import RecognizerResult
-from presidio_anonymizer import AnonymizerEngine
-from presidio_anonymizer.entities import OperatorConfig
-from presidio_anonymizer.operators import OperatorType
-from names_database import NameDatabase
-name_table = Path('data', 'ascii_names.parquet')
-logger = logging.getLogger('anonymizer')
-class surrogate_anonymizer(AnonymizerEngine):
-    def __init__(self):
-        super().__init__()
-        self.names_db = NameDatabase()
-        self.names_df = pd.read_parquet(name_table)
-        # keep track of names we have seen
-        self.seen_names = dict()
-    def get_random_name(
-            self,
-            country: Optional[str] = None,
-            gender: Optional[str] = None
-    ) -> pd.DataFrame:
-        '''Returns two random names from the database as a DataFrame.
-        Both rows match gender and country, if provided.
-        :country: ISO country code e.g. "CO" for Colombia
-        :gender: 'M' or 'F'
-        returns two rows of the names dataframe
-        '''
-        names_view = self.names_df
-        if country:
-            names_view = names_view[names_view['country'] == country]
-        if gender:
-            names_view = names_view[names_view['gender'] == gender]
-        if names_view.size < 25:
-            return self.names_df.sample(n=2, weights=self.names_df['count'])
-        return names_view.sample(n=2, weights=names_view['count'])
-    def split_name(self, original_name: str) -> Tuple[str]:
-        '''Splits name into parts.
-        If one token, assume it is a first name.
-        If two tokens, first and last name.
-        If three tokens, one first name and two last names.
-        If four tokens, two first names and two last names.'''
-        names = original_name.split()
-        if len(names) == 1:
-            logger.info(f'Splitting to 1 first name: {names}')
-            return names[0], None
-        elif len(names) == 2:
-            logger.info(f'Splitting to 1 first name, 1 last name: {names}')
-            return names[0], names[1]
-        elif len(names) == 3:
-            logger.info(f'Splitting to 1 first name, 2 last names: {names}')
-            return names[0], ' '.join(names[1:])
-        elif len(names) == 4:
-            logger.info(f'Splitting to 2 first names and 2 last names: {names}')
-            return ' '.join(names[:2]), ' '.join(names[2:])
-        else:
-            logger.info(f'Splitting failed, do not match gender/country: {names}')
-            return None, None
-    def generate_surrogate(self, original_name: str) -> str:
-        '''Generate a surrogate name.
-        '''
-        if original_name == 'PII':
-            # Every time we call this function, Presidio will validate it
-            # by testing that the function returns a str when the input is
-            # 'PII'. Bypass this test.
-            return 'PII'
-        # If we have seen this name before, return the same surrogate
-        if original_name in self.seen_names:
-            return self.seen_names[original_name]
-        first_names, last_names = self.split_name(original_name)
-        gender = self.names_db.get_gender(first_names) if first_names else None
-        logger.debug(f'Gender set to {gender}')
-        country = self.names_db.get_country(last_names) if last_names else None
-        logger.debug(f'Country set to {country}')
-        surrogate_name = ''
-        name_candidates = self.get_random_name(gender=gender, country=country)
-        surrogate_name += name_candidates.iloc[0]['first']
-        logger.info(f'First name surrogate is {surrogate_name}')
-        if last_names:
-            logger.info(f'Combining with {name_candidates.iloc[1]["last"]}')
-            surrogate_name += ' ' + name_candidates.iloc[1]['last']
-        logger.info(f'Returning surrogate name {surrogate_name}')
-        self.seen_names[original_name] = surrogate_name
-        return surrogate_name
-    def anonymize(
-        self,
-        text: str,
-        analyzer_results: List[RecognizerResult]
-        ):
-        '''Anonymize identified input using Presidio Anonymizer.'''
-        if not text:
-            return
-        analyzer_results = self._remove_conflicts_and_get_text_manipulation_data(
-            analyzer_results
-        )
-        operators = self._AnonymizerEngine__check_or_add_default_operator(
-            {
-            'STUDENT': OperatorConfig('custom',
-                                      {'lambda': self.generate_surrogate}),
-            'EMAIL_ADDRESS': OperatorConfig('replace',
-                                            {'new_value': '[email protected]'}),
-            'PHONE_NUMBER': OperatorConfig('replace',
-                                           {'new_value': '888-888-8888'}),
-            'URL': OperatorConfig('replace',
-                                  {'new_value': 'aol.com'}),
-            }
-        )
-        res = self._operate(text,
-                            analyzer_results,
-                            operators,
-                            OperatorType.Anonymize)
-        return res.text
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG)
-    anonymizer = surrogate_anonymizer()
-    test_names = ['Nora Wang',
-                  'MJ',
-                  '',
-                  '(',
-                  'Mario Escobar Sanchez',
-                  'Jane Fonda Michelle Rousseau',
-                  'Sir Phillipe Ricardo de la Sota Mayor']
-    for name in test_names:
-        anonymizer.generate_surrogate(name)

piilo/__init__.py ADDED Viewed

File without changes

analyzer.py → piilo/analyzer.py RENAMED Viewed

File without changes

main.py → piilo/main.py RENAMED Viewed

File without changes

{models → piilo/models}/anonymize.py RENAMED Viewed

File without changes

names_database.py → piilo/names_database.py RENAMED Viewed

File without changes

test_main.py → piilo/test_main.py RENAMED Viewed

File without changes