harshildarji committed on
Commit
85ed894
·
verified ·
1 Parent(s): 6fa58f0

merge adjacent subtokens during anonymization

Browse files
Files changed (1) hide show
  1. app.py +19 -1
app.py CHANGED
@@ -225,6 +225,24 @@ def color_substrings(input_string, model_output):
225
 
226
  # Selectively anonymize entities
227
  def anonymize_text(input_string, model_output, selected_entities=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  anonymized_text = ""
229
  last_end = 0
230
  colors = generate_colors(len(ner_labels))
@@ -232,7 +250,7 @@ def anonymize_text(input_string, model_output, selected_entities=None):
232
  label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
233
  }
234
 
235
- for entity in sorted(model_output, key=lambda x: x["start"]):
236
  start, end, label = entity["start"], entity["end"], entity["label"]
237
  anonymized_text += input_string[last_end:start]
238
  if selected_entities is None or label in selected_entities:
 
225
 
226
  # Selectively anonymize entities
227
  def anonymize_text(input_string, model_output, selected_entities=None):
228
+ merged_model_output = []
229
+ sorted_entities = sorted(model_output, key=lambda x: x["start"])
230
+ if sorted_entities:
231
+ current = sorted_entities[0]
232
+ for entity in sorted_entities[1:]:
233
+ if (
234
+ entity["label"] == current["label"]
235
+ and input_string[current["end"] : entity["start"]].strip() == ""
236
+ ):
237
+ current["end"] = entity["end"]
238
+ current["word"] = input_string[current["start"] : current["end"]]
239
+ else:
240
+ merged_model_output.append(current)
241
+ current = entity
242
+ merged_model_output.append(current)
243
+ else:
244
+ merged_model_output = sorted_entities
245
+
246
  anonymized_text = ""
247
  last_end = 0
248
  colors = generate_colors(len(ner_labels))
 
250
  label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
251
  }
252
 
253
+ for entity in merged_model_output:
254
  start, end, label = entity["start"], entity["end"], entity["label"]
255
  anonymized_text += input_string[last_end:start]
256
  if selected_entities is None or label in selected_entities: