Spaces:
Running
Running
langdonholmes
commited on
Commit
·
b0e291c
1
Parent(s):
6449ca4
inherited from names_database
Browse files- .gitignore +1 -3
- Pipfile +1 -0
- Pipfile.lock +53 -39
- data/ascii_fb_names_small.parquet +3 -0
- match_replace.py +26 -65
- names_database.py +28 -0
.gitignore
CHANGED
@@ -1,3 +1 @@
|
|
1 |
-
__pycache__
|
2 |
-
__pycache__/spacy_analyzer.cpython-310.pyc
|
3 |
-
__pycache__/spacy_recognizer.cpython-310.pyc
|
|
|
1 |
+
__pycache__/*
|
|
|
|
Pipfile
CHANGED
@@ -14,6 +14,7 @@ streamlit = "==1.17.0"
|
|
14 |
tokenizers = "==0.12.1"
|
15 |
torch = "==1.12.0"
|
16 |
en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
|
|
|
17 |
|
18 |
[dev-packages]
|
19 |
|
|
|
14 |
tokenizers = "==0.12.1"
|
15 |
torch = "==1.12.0"
|
16 |
en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
|
17 |
+
names-dataset = "*"
|
18 |
|
19 |
[dev-packages]
|
20 |
|
Pipfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"_meta": {
|
3 |
"hash": {
|
4 |
-
"sha256": "
|
5 |
},
|
6 |
"pipfile-spec": 6,
|
7 |
"requires": {
|
@@ -463,39 +463,46 @@
|
|
463 |
"markers": "python_version >= '3.6'",
|
464 |
"version": "==1.0.9"
|
465 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
"numpy": {
|
467 |
"hashes": [
|
468 |
-
"sha256:
|
469 |
-
"sha256:
|
470 |
-
"sha256:
|
471 |
-
"sha256:
|
472 |
-
"sha256:
|
473 |
-
"sha256:
|
474 |
-
"sha256:
|
475 |
-
"sha256:
|
476 |
-
"sha256:
|
477 |
-
"sha256:
|
478 |
-
"sha256:
|
479 |
-
"sha256:
|
480 |
-
"sha256:
|
481 |
-
"sha256:
|
482 |
-
"sha256:
|
483 |
-
"sha256:
|
484 |
-
"sha256:
|
485 |
-
"sha256:
|
486 |
-
"sha256:
|
487 |
-
"sha256:
|
488 |
-
"sha256:
|
489 |
-
"sha256:
|
490 |
-
"sha256:
|
491 |
-
"sha256:
|
492 |
-
"sha256:
|
493 |
-
"sha256:
|
494 |
-
"sha256:
|
495 |
-
"sha256:
|
496 |
],
|
497 |
"markers": "python_version >= '3.10'",
|
498 |
-
"version": "==1.24.
|
499 |
},
|
500 |
"packaging": {
|
501 |
"hashes": [
|
@@ -542,10 +549,10 @@
|
|
542 |
},
|
543 |
"phonenumbers": {
|
544 |
"hashes": [
|
545 |
-
"sha256:
|
546 |
-
"sha256:
|
547 |
],
|
548 |
-
"version": "==8.13.
|
549 |
},
|
550 |
"pillow": {
|
551 |
"hashes": [
|
@@ -737,6 +744,13 @@
|
|
737 |
"markers": "python_version >= '3.7'",
|
738 |
"version": "==11.0.0"
|
739 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
740 |
"pycryptodome": {
|
741 |
"hashes": [
|
742 |
"sha256:04779cc588ad8f13c80a060b0b1c9d1c203d051d8a43879117fe6b8aaf1cd3fa",
|
@@ -1070,11 +1084,11 @@
|
|
1070 |
},
|
1071 |
"setuptools": {
|
1072 |
"hashes": [
|
1073 |
-
"sha256:
|
1074 |
-
"sha256:
|
1075 |
],
|
1076 |
"markers": "python_version >= '3.7'",
|
1077 |
-
"version": "==67.
|
1078 |
},
|
1079 |
"six": {
|
1080 |
"hashes": [
|
@@ -1480,11 +1494,11 @@
|
|
1480 |
},
|
1481 |
"zipp": {
|
1482 |
"hashes": [
|
1483 |
-
"sha256:
|
1484 |
-
"sha256:
|
1485 |
],
|
1486 |
"markers": "python_version >= '3.7'",
|
1487 |
-
"version": "==3.
|
1488 |
}
|
1489 |
},
|
1490 |
"develop": {}
|
|
|
1 |
{
|
2 |
"_meta": {
|
3 |
"hash": {
|
4 |
+
"sha256": "6a4aa8c782c5b5fd8f5f0b3d7ba6cb6541f37295823bdee26d3fd575533c5999"
|
5 |
},
|
6 |
"pipfile-spec": 6,
|
7 |
"requires": {
|
|
|
463 |
"markers": "python_version >= '3.6'",
|
464 |
"version": "==1.0.9"
|
465 |
},
|
466 |
+
"names-dataset": {
|
467 |
+
"hashes": [
|
468 |
+
"sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"
|
469 |
+
],
|
470 |
+
"index": "pypi",
|
471 |
+
"version": "==3.1.0"
|
472 |
+
},
|
473 |
"numpy": {
|
474 |
"hashes": [
|
475 |
+
"sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22",
|
476 |
+
"sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f",
|
477 |
+
"sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9",
|
478 |
+
"sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96",
|
479 |
+
"sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0",
|
480 |
+
"sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a",
|
481 |
+
"sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281",
|
482 |
+
"sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04",
|
483 |
+
"sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468",
|
484 |
+
"sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253",
|
485 |
+
"sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756",
|
486 |
+
"sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a",
|
487 |
+
"sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb",
|
488 |
+
"sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d",
|
489 |
+
"sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0",
|
490 |
+
"sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910",
|
491 |
+
"sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978",
|
492 |
+
"sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5",
|
493 |
+
"sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f",
|
494 |
+
"sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a",
|
495 |
+
"sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5",
|
496 |
+
"sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2",
|
497 |
+
"sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d",
|
498 |
+
"sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95",
|
499 |
+
"sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5",
|
500 |
+
"sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d",
|
501 |
+
"sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780",
|
502 |
+
"sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"
|
503 |
],
|
504 |
"markers": "python_version >= '3.10'",
|
505 |
+
"version": "==1.24.2"
|
506 |
},
|
507 |
"packaging": {
|
508 |
"hashes": [
|
|
|
549 |
},
|
550 |
"phonenumbers": {
|
551 |
"hashes": [
|
552 |
+
"sha256:1531b42c8c49a1f06b08598441bf1f11fe2618f707c6fc96b581b44aa4f2b0e3",
|
553 |
+
"sha256:f8bd92975ba7463b7828ae2f95e1037b7e0ab8f023e9e8ffb7c560fd7f5d66d7"
|
554 |
],
|
555 |
+
"version": "==8.13.6"
|
556 |
},
|
557 |
"pillow": {
|
558 |
"hashes": [
|
|
|
744 |
"markers": "python_version >= '3.7'",
|
745 |
"version": "==11.0.0"
|
746 |
},
|
747 |
+
"pycountry": {
|
748 |
+
"hashes": [
|
749 |
+
"sha256:b2163a246c585894d808f18783e19137cb70a0c18fb36748dc01fc6f109c1646"
|
750 |
+
],
|
751 |
+
"markers": "python_version >= '3.6' and python_version < '4'",
|
752 |
+
"version": "==22.3.5"
|
753 |
+
},
|
754 |
"pycryptodome": {
|
755 |
"hashes": [
|
756 |
"sha256:04779cc588ad8f13c80a060b0b1c9d1c203d051d8a43879117fe6b8aaf1cd3fa",
|
|
|
1084 |
},
|
1085 |
"setuptools": {
|
1086 |
"hashes": [
|
1087 |
+
"sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c",
|
1088 |
+
"sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48"
|
1089 |
],
|
1090 |
"markers": "python_version >= '3.7'",
|
1091 |
+
"version": "==67.2.0"
|
1092 |
},
|
1093 |
"six": {
|
1094 |
"hashes": [
|
|
|
1494 |
},
|
1495 |
"zipp": {
|
1496 |
"hashes": [
|
1497 |
+
"sha256:23f70e964bc11a34cef175bc90ba2914e1e4545ea1e3e2f67c079671883f9cb6",
|
1498 |
+
"sha256:e8b2a36ea17df80ffe9e2c4fda3f693c3dad6df1697d3cd3af232db680950b0b"
|
1499 |
],
|
1500 |
"markers": "python_version >= '3.7'",
|
1501 |
+
"version": "==3.13.0"
|
1502 |
}
|
1503 |
},
|
1504 |
"develop": {}
|
data/ascii_fb_names_small.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:baf5cf2fa43dc172c613f72793641f668e33c30b4e23932616de36cc0ce3447d
|
3 |
+
size 33601747
|
match_replace.py
CHANGED
@@ -1,69 +1,31 @@
|
|
1 |
import pandas as pd
|
2 |
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
while(surrogate_name == original_name):
|
30 |
-
# situation when gender can be matched
|
31 |
-
if not gender:
|
32 |
-
gender_df = fb_df[fb_df["gender"] == gender]
|
33 |
-
gender_c_df = gender_df[gender_df["country"] == country_code]
|
34 |
-
# situations: whether country code can be matched
|
35 |
-
if gender_c_df.shape[0] > 0:
|
36 |
-
surrogate_name = gender_c_df[f_l].sample(n=1).to_string()
|
37 |
-
# if gender match, country not match: randomly return from gender df
|
38 |
-
else:
|
39 |
-
surrogate_name = gender_df[f_l].sample(n=1).to_string()
|
40 |
-
else:
|
41 |
-
# situation when gender cannot be match: gender is None
|
42 |
-
country_df = fb_df[fb_df["country"] == country_code]
|
43 |
-
# situation when country can be matched
|
44 |
-
if country_df.shape[0] > 0:
|
45 |
-
surrogate_name = country_df[f_l].sample(n=1).to_string()
|
46 |
-
# situation when neither gender nor country can be matched
|
47 |
-
# randomly return one name from the whole dataset
|
48 |
-
else:
|
49 |
-
surrogate_name = fb_df[f_l].sample(n=1).to_string()
|
50 |
-
|
51 |
-
return surrogate_name
|
52 |
-
|
53 |
-
def match_entity(original_info, entity):
|
54 |
-
# TODO: need refinement for each kind of entity
|
55 |
-
if entity == 'STUDENT':
|
56 |
-
# TODO: here, change between 1 and 2
|
57 |
-
return match_name_2(original_info)
|
58 |
-
elif entity == 'EMAIL_ADDRESS':
|
59 |
-
return '[email protected]'
|
60 |
-
elif entity == 'PHONE_NUMBER':
|
61 |
-
#TODO: specific form of number will be returned for consistency
|
62 |
-
return '000-000-0000'
|
63 |
-
elif entity == 'URL':
|
64 |
-
return 'google.com'
|
65 |
-
else:
|
66 |
-
pass
|
67 |
|
68 |
def match_name(original_name):
|
69 |
# FIXME: take too LONG time to run (large df used multi-times), how to improve
|
@@ -74,7 +36,6 @@ def match_name(original_name):
|
|
74 |
# FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
|
75 |
first_name = original_name.split()[0]
|
76 |
global fb_df
|
77 |
-
fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
|
78 |
names = fb_df[fb_df['first']==first_name]
|
79 |
if not names.empty:
|
80 |
name_df = names.sample(n=1)
|
|
|
1 |
import pandas as pd
|
2 |
|
3 |
+
from names_database import NameDatabase
|
4 |
+
|
5 |
+
names_db = NameDatabase
|
6 |
+
|
7 |
+
def describe_name(first_names, last_names):
|
8 |
+
gender = names_db.get_gender() if first_names else None
|
9 |
+
country = names_db.get_country() if last_names else None
|
10 |
+
return gender, country
|
11 |
+
|
12 |
+
def split_name(all_names):
|
13 |
+
'''Splits name into parts.
|
14 |
+
If one token, assume it is a first name.
|
15 |
+
If two tokens, first and last name.
|
16 |
+
If three tokens, one first name and two last names.
|
17 |
+
If four tokens, two first names and two last names.'''
|
18 |
+
match all_names.split():
|
19 |
+
case [first]:
|
20 |
+
return first, None
|
21 |
+
case [first, last]:
|
22 |
+
return first, last
|
23 |
+
case [first, last_1, last_2]:
|
24 |
+
return first, ' '.join((last_1, last_2))
|
25 |
+
case [first_1, first_2, last_1, last_2]:
|
26 |
+
return ' '.join((first_1, first_2)), ' '.join((last_1, last_2))
|
27 |
+
case _:
|
28 |
+
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
def match_name(original_name):
|
31 |
# FIXME: take too LONG time to run (large df used multi-times), how to improve
|
|
|
36 |
# FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
|
37 |
first_name = original_name.split()[0]
|
38 |
global fb_df
|
|
|
39 |
names = fb_df[fb_df['first']==first_name]
|
40 |
if not names.empty:
|
41 |
name_df = names.sample(n=1)
|
names_database.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from names_dataset import NameDataset, NameWrapper
|
2 |
+
from typing import Optional
|
3 |
+
|
4 |
+
class NameDatabase(NameDataset):
|
5 |
+
def __init__(self) -> None:
|
6 |
+
super().__init__()
|
7 |
+
self.names = pd.read_parquet('ascii_fb_names_small.parquet')
|
8 |
+
|
9 |
+
def get_random_name(
|
10 |
+
self,
|
11 |
+
country: Optional[str] = None,
|
12 |
+
gender: Optional[str] = None
|
13 |
+
):
|
14 |
+
'''country: ISO country code in 'alpha 2' format
|
15 |
+
gender: "M" or "F"
|
16 |
+
'''
|
17 |
+
names_view = self.names
|
18 |
+
if country:
|
19 |
+
names_view = names_view[names_view['country'] == country]
|
20 |
+
if gender:
|
21 |
+
names_view = names_view[names_view['gender'] == gender]
|
22 |
+
return names_view.sample(weights=names_view.count)
|
23 |
+
|
24 |
+
def get_gender(first_names: str):
|
25 |
+
return NameWrapper(self.search(first_names)).gender
|
26 |
+
|
27 |
+
def get_country(last_names: str):
|
28 |
+
return NameWrapper(self.search(last_names)).country
|