Spaces:
Running
Running
merge adjacent subtokens during anonymization
Browse files
app.py
CHANGED
@@ -225,6 +225,24 @@ def color_substrings(input_string, model_output):
|
|
225 |
|
226 |
# Selectively anonymize entities
|
227 |
def anonymize_text(input_string, model_output, selected_entities=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
anonymized_text = ""
|
229 |
last_end = 0
|
230 |
colors = generate_colors(len(ner_labels))
|
@@ -232,7 +250,7 @@ def anonymize_text(input_string, model_output, selected_entities=None):
|
|
232 |
label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
|
233 |
}
|
234 |
|
235 |
-
for entity in model_output:
|
236 |
start, end, label = entity["start"], entity["end"], entity["label"]
|
237 |
anonymized_text += input_string[last_end:start]
|
238 |
if selected_entities is None or label in selected_entities:
|
|
|
225 |
|
226 |
# Selectively anonymize entities
|
227 |
def anonymize_text(input_string, model_output, selected_entities=None):
|
228 |
+
merged_model_output = []
|
229 |
+
sorted_entities = sorted(model_output, key=lambda x: x["start"])
|
230 |
+
if sorted_entities:
|
231 |
+
current = sorted_entities[0]
|
232 |
+
for entity in sorted_entities[1:]:
|
233 |
+
if (
|
234 |
+
entity["label"] == current["label"]
|
235 |
+
and input_string[current["end"] : entity["start"]].strip() == ""
|
236 |
+
):
|
237 |
+
current["end"] = entity["end"]
|
238 |
+
current["word"] = input_string[current["start"] : current["end"]]
|
239 |
+
else:
|
240 |
+
merged_model_output.append(current)
|
241 |
+
current = entity
|
242 |
+
merged_model_output.append(current)
|
243 |
+
else:
|
244 |
+
merged_model_output = sorted_entities
|
245 |
+
|
246 |
anonymized_text = ""
|
247 |
last_end = 0
|
248 |
colors = generate_colors(len(ner_labels))
|
|
|
250 |
label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
|
251 |
}
|
252 |
|
253 |
+
for entity in merged_model_output:
|
254 |
start, end, label = entity["start"], entity["end"], entity["label"]
|
255 |
anonymized_text += input_string[last_end:start]
|
256 |
if selected_entities is None or label in selected_entities:
|