Update README.md
Browse files
README.md
CHANGED
@@ -83,6 +83,7 @@ Evaluation was done on a held-out portion of the same labeled dataset.
|
|
83 |
## How to Get Started with the Model
|
84 |
|
85 |
```python
|
|
|
86 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
87 |
from transformers import pipeline
|
88 |
|
@@ -90,7 +91,57 @@ model_name = "AI-Enthusiast11/pii-entity-extractor"
|
|
90 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
91 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
## How to Get Started with the Model
|
84 |
|
85 |
```python
|
86 |
+
|
87 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
88 |
from transformers import pipeline
|
89 |
|
|
|
91 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
92 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
93 |
|
94 |
+
# Post-processing: stitch subword pieces back into whole entity strings
def merge_tokens(ner_results):
    """Re-assemble subword pieces emitted by the NER pipeline.

    Pieces are grouped by their ``entity_group`` label. A piece that does
    not begin with a space is glued onto the previous value under the same
    label; otherwise it starts a new value. WordPiece ``##`` markers are
    stripped before merging.

    Returns a dict mapping entity label -> list of merged entity strings.
    """
    merged = {}
    for item in ner_results:
        label = item["entity_group"]
        piece = item["word"].replace("##", "")  # drop wordpiece prefixes

        bucket = merged.setdefault(label, [])
        if bucket and not piece.startswith(" "):
            # Continuation of the previous value for this label — glue on.
            bucket[-1] += piece
        else:
            bucket.append(piece)
    return merged
|
111 |
+
|
112 |
+
def redact_text_with_labels(text):
    """Run NER over *text* and mask each detected entity with its label.

    Relies on the module-level ``nlp`` pipeline and the ``merge_tokens``
    helper. Every occurrence of a detected entity string is replaced by
    ``[ENTITY_TYPE]`` in the returned copy; *text* itself is unchanged.
    """
    # Merge subword pieces into whole entity strings, keyed by label.
    detected = merge_tokens(nlp(text))

    result = text
    for label, values in detected.items():
        for value in values:
            # Swap the raw entity text for its bracketed label.
            result = result.replace(value, f"[{label}]")

    return result
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
# Loading the pipeline.
# FIX: the original passed model=model_fine_tuned, but no such name exists in
# this snippet — the loaded model above is bound to `model`.
# aggregation_strategy="simple" groups subword tokens into entity spans.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example input (choose one from your examples)
example = "Hi, I’m Mia Thompson. I recently noticed that my electricity bill hasn’t been updated despite making the payment last week. I used account number 4893172051 linked with routing number 192847561. My service was nearly suspended, and I’d appreciate it if you could verify the payment. You can reach me at 727-814-3902 if more information is needed."

# Run pipeline and merge subword pieces into whole entity strings
ner_results = nlp(example)
cleaned_entities = merge_tokens(ner_results)

# Print the NER results, one line per entity type
print("\n==NER Results:==\n")
for entity_type, values in cleaned_entities.items():
    print(f" {entity_type}: {', '.join(values)}")

# Redact the single example with labels
redacted_example = redact_text_with_labels(example)

# Print the redacted result
print(f"\n==Redacted Example:==\n{redacted_example}")
|