AI-Enthusiast11 committed on
Commit
3ec3718
·
verified ·
1 Parent(s): 3e9b998

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +55 -4
README.md CHANGED
@@ -83,6 +83,7 @@ Evaluation was done on a held-out portion of the same labeled dataset.
83
  ## How to Get Started with the Model
84
 
85
  ```python
 
86
  from transformers import AutoTokenizer, AutoModelForTokenClassification
87
  from transformers import pipeline
88
 
@@ -90,7 +91,57 @@ model_name = "AI-Enthusiast11/pii-entity-extractor"
90
  tokenizer = AutoTokenizer.from_pretrained(model_name)
91
  model = AutoModelForTokenClassification.from_pretrained(model_name)
92
 
93
- nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
94
- text = "My name is John Smith and my SSN is 123-45-6789."
95
- results = nlp(text)
96
- print(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  ## How to Get Started with the Model
84
 
85
  ```python
86
+
87
  from transformers import AutoTokenizer, AutoModelForTokenClassification
88
  from transformers import pipeline
89
 
 
91
  tokenizer = AutoTokenizer.from_pretrained(model_name)
92
  model = AutoModelForTokenClassification.from_pretrained(model_name)
93
 
94
+ # Post processing logic to combine the subword tokens
95
def merge_tokens(ner_results):
    """Group aggregated NER predictions by entity type, rejoining split words.

    An entity is appended to the previous value of the same type only when it
    is a true continuation of it: its ``start`` offset equals the previous
    span's ``end`` offset, or (when offsets are unavailable) its word still
    carries the ``##`` subword prefix. Every other prediction becomes a new
    value, so separate entities of one type are never glued together.

    Args:
        ner_results: Iterable of dicts, each providing ``"entity_group"`` and
            ``"word"``. Optional ``"start"``/``"end"`` character offsets are
            used when present.

    Returns:
        dict mapping each entity type to the list of entity strings, in the
        order they were encountered.
    """
    entities = {}
    last_end = {}  # entity type -> end offset of the most recent span seen
    for entity in ner_results:
        entity_type = entity["entity_group"]
        raw_word = entity["word"]
        entity_value = raw_word.replace("##", "")  # Remove subword prefixes

        values = entities.setdefault(entity_type, [])
        start = entity.get("start")
        end = entity.get("end")
        prev_end = last_end.get(entity_type)

        if start is not None and prev_end is not None:
            # Offsets are authoritative: adjacent spans belong to one word.
            continues_previous = start == prev_end
        else:
            # Without offsets, only a leftover subword marker signals that
            # this piece belongs to the preceding one.
            continues_previous = raw_word.startswith("##")

        if values and continues_previous:
            values[-1] += entity_value
        else:
            values.append(entity_value)
        last_end[entity_type] = end

    return entities
111
+
112
def redact_text_with_labels(text):
    """Return *text* with every detected entity replaced by ``[ENTITY_TYPE]``.

    Runs the module-level ``nlp`` pipeline on *text* and redacts the exact
    character spans it reports, so only the detected occurrences are replaced
    and look-alike substrings elsewhere are left untouched. Directly adjacent
    spans of the same type collapse into a single label.

    Predictions without ``start``/``end`` offsets fall back to replacing the
    predicted word wherever it occurs in the text.

    Args:
        text: The input string to scan and redact.

    Returns:
        The redacted string.
    """
    ner_results = nlp(text)

    spans = []      # (start, end, label) for predictions with offsets
    unlocated = []  # (value, label) for predictions without offsets
    for entity in ner_results:
        label = entity["entity_group"]
        start, end = entity.get("start"), entity.get("end")
        if start is None or end is None:
            value = entity["word"].replace("##", "")
            # An empty value would make str.replace insert the label
            # between every character of the text.
            if value:
                unlocated.append((value, label))
        elif end > start:
            spans.append((start, end, label))

    pieces = []
    cursor = 0
    open_label = None
    for start, end, label in sorted(spans):
        if start < cursor:
            # Overlaps a region that is already redacted; just extend it.
            cursor = max(cursor, end)
            continue
        if not (start == cursor and label == open_label):
            pieces.append(text[cursor:start])
            pieces.append(f"[{label}]")
        # Otherwise this span directly continues the previous one, so the
        # label emitted for that span already covers it.
        cursor = end
        open_label = label
    pieces.append(text[cursor:])
    redacted_text = "".join(pieces)

    # Longest values first so a short value cannot split a longer one.
    for value, label in sorted(unlocated, key=lambda item: len(item[0]), reverse=True):
        redacted_text = redacted_text.replace(value, f"[{label}]")

    return redacted_text
125
+
126
+
127
+
128
# Load the aggregated NER pipeline. It must use the `model` object loaded
# above; the snippet never defines any other model variable.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example input (choose one from your examples)
example = "Hi, I’m Mia Thompson. I recently noticed that my electricity bill hasn’t been updated despite making the payment last week. I used account number 4893172051 linked with routing number 192847561. My service was nearly suspended, and I’d appreciate it if you could verify the payment. You can reach me at 727-814-3902 if more information is needed."

# Run pipeline and process result
ner_results = nlp(example)
cleaned_entities = merge_tokens(ner_results)

# Print the NER results, one entity type per line
print("\n==NER Results:==\n")
for entity_type, values in cleaned_entities.items():
    print(f" {entity_type}: {', '.join(values)}")

# Redact the single example with labels
redacted_example = redact_text_with_labels(example)

# Print the redacted result
print(f"\n==Redacted Example:==\n{redacted_example}")