finiteautomata committed on
Commit
a188b38
·
1 Parent(s): 3b3110b

Reuse NER stuff

Browse files
Files changed (1) hide show
  1. app.py +62 -2
app.py CHANGED
@@ -7,7 +7,66 @@ from annotated_text import annotated_text
7
  # Load data
8
  ds = load_dataset("hs-knowledge/hateval_enriched")
9
 
 
10
  # Show highlighted ner entities in a tweet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
  def display_text(example):
@@ -57,8 +116,9 @@ elements = random.choices(range(len(ds["train"])), k=50)
57
  ds["train"] = ds["train"].select(elements)
58
 
59
  for ex in ds["train"]:
60
- st.write("=" * 80)
61
- display_text(ex)
 
62
  with st.expander("Show entities"):
63
  for ent in ex["entities"]:
64
  entity_name = ent["text"]
 
7
  # Load data
8
  ds = load_dataset("hs-knowledge/hateval_enriched")
9
 
10
+
11
  # Show highlighted ner entities in a tweet
12
def display_ner(example):
    """Render a tweet with its NER entities highlighted.

    Reads the parallel ``"tokens"`` and ``"labels"`` sequences stored under
    ``example["ner_output"]``, merges consecutive tokens into chunks and
    passes the chunks to ``annotated_text``: plain text as ``str`` and
    entities as ``(text, entity_type)`` tuples.

    Args:
        example: Mapping with a ``"ner_output"`` entry that itself holds
            ``"tokens"`` and ``"labels"``.

    Returns:
        None. The function only renders; it must keep returning ``None``
        because it is called as a bare statement in the app script.
    """
    ner_output = example["ner_output"]
    chunks = _group_ner_chunks(ner_output["tokens"], ner_output["labels"])
    annotated_text(*chunks)


def _group_ner_chunks(tokens, labels):
    """Merge tokens into plain-text strings and ``(text, type)`` entity tuples.

    Label handling:
      * ``None``   -> token is skipped (no prediction, e.g. truncated input).
      * ``"O"``    -> plain text.
      * ``"B-X"``  -> always starts a new entity of type ``X``.
      * ``"I-X"``  -> continues the open chunk only if it is already of
                      type ``X``; otherwise a new ``X`` chunk is started so
                      that preceding plain text is not relabelled.
      * anything else is treated as a bare entity type and grouped with
        neighbouring tokens of the same type.

    Tokens and labels are paired with ``zip``, so surplus items in the
    longer sequence are ignored.
    """
    chunks = []
    words = []
    current_type = None

    def flush():
        # Emit the open chunk; untyped chunks are emitted as bare strings.
        text = " ".join(words).strip()
        chunks.append(text if current_type is None else (text, current_type))

    for token, label in zip(tokens, labels):
        if label is None:
            # Perhaps the input was too long and this token got no label.
            continue

        starts_entity = label.startswith("B-")
        if label == "O":
            token_type = None
        elif starts_entity or label.startswith("I-"):
            token_type = label[2:]
        else:
            token_type = label

        # Close the open chunk on an explicit B- tag or on any type change.
        # The ``words`` guard prevents emitting an empty leading chunk.
        if words and (starts_entity or token_type != current_type):
            flush()
            words = []

        words.append(token)
        current_type = token_type

    if words:
        flush()

    return chunks
70
 
71
 
72
  def display_text(example):
 
116
  ds["train"] = ds["train"].select(elements)
117
 
118
  for ex in ds["train"]:
119
+ # display_text(ex)
120
+ st.markdown("---")
121
+ display_ner(ex)
122
  with st.expander("Show entities"):
123
  for ent in ex["entities"]:
124
  entity_name = ent["text"]