File size: 4,143 Bytes
8739181
 
 
 
 
 
a188b38
8739181
a188b38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8739181
 
43e2fd9
8739181
43e2fd9
3f556fb
 
 
 
3b3110b
 
 
 
 
 
 
3f556fb
 
 
 
 
8739181
 
3f556fb
 
 
 
 
 
 
 
 
 
 
 
3b3110b
 
 
3f556fb
8739181
 
 
 
43e2fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
8739181
3f556fb
8739181
3f556fb
 
a188b38
 
 
3b3110b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text


# Show highlighted ner entities in a tweet
def display_ner(example):
    """Render the BIO-tagged tokens of *example* with entities highlighted.

    Reads ``example["ner_output"]`` (parallel ``tokens`` / ``labels`` lists),
    merges consecutive tokens into chunks, and hands them to
    ``annotated_text``: entity chunks as ``(text, type)`` tuples, plain text
    as bare strings. Tokens whose label is ``None`` are skipped.
    """
    ner_output = example["ner_output"]
    chunks = []
    current_chunk = ""
    current_type = None

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # No label for this token (perhaps the input was too long
            # for the tagger) -- skip it.
            continue
        if label == "O":
            if current_type is not None:
                # An entity was open: close it before plain text resumes.
                chunks.append((current_chunk.strip(), current_type))
                current_chunk = ""
            current_chunk += token + " "
            current_type = None
        elif label.startswith("B-"):
            # New entity begins: flush whatever chunk was being built.
            if current_chunk:
                chunks.append((current_chunk.strip(), current_type))
            current_chunk = token + " "
            current_type = label[2:]
        elif label.startswith("I-"):
            # Continuation of the current entity.
            current_chunk += token + " "
            current_type = label[2:]
        else:
            # Bare label without a B-/I- prefix: treat the label itself
            # as the entity type; start a new chunk when the type changes.
            if label != current_type:
                if current_chunk:
                    chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = label

    if current_chunk:
        chunks.append((current_chunk.strip(), current_type))

    # annotated_text expects bare strings for untyped spans.
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)


def display_text(example, text_column):
    """Render ``example[text_column]`` with its entities highlighted.

    Entity character offsets are recomputed in place with ``str.find`` on
    each entity's surface text (note: this locates the FIRST occurrence,
    so repeated surface strings map to the same span). Entities whose text
    is not found, or that overlap an earlier one, are skipped. The
    resulting chunks go to ``annotated_text``: ``(text, type)`` tuples for
    entities, bare strings for plain text.
    """
    text = example[text_column]

    # Recompute offsets from the surface form, dropping entities that do
    # not occur in this text column (find() would otherwise return -1 and
    # corrupt the chunking below).
    entities = []
    for entity in example["entities"]:
        start = text.find(entity["text"])
        if start == -1:
            continue
        entity["start"] = start
        entity["end"] = start + len(entity["text"])
        entities.append(entity)

    # Sort by the RECOMPUTED start so chunks come out in document order.
    entities.sort(key=lambda e: e["start"])

    if not entities:
        annotated_text(text)
        return

    chunks = []
    last_index = 0
    for entity in entities:
        start, end = entity["start"], entity["end"]
        if start < last_index:
            # Overlaps the previous entity (e.g. the same surface text
            # found twice) -- skip to avoid duplicating text.
            continue
        if last_index < start:
            # Plain text between the previous entity and this one.
            chunks.append((text[last_index:start], None))
        chunks.append((entity["text"], entity["type"]))
        last_index = end

    if last_index < len(text):
        chunks.append((text[last_index:], None))

    # annotated_text expects bare strings for untyped spans.
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)


# selectbox to choose dataset

selected_dataset = st.sidebar.selectbox(
    "Select dataset", ["hateval_enriched", "sbf-enriched", "hatecheck-enriched"]
)

# Load data
ds = load_dataset(f"hs-knowledge/{selected_dataset}")

# Maps dataset name -> column holding the raw text. Only needed by
# display_text, whose call below is commented out, so this is currently
# unused -- kept for when display_text is re-enabled.
text_column = {
    "hateval_enriched": "text",
    "sbf-enriched": "post",
    "hatecheck-enriched": "test_case",
}

# Pick 50 random rows to display. NOTE(review): random.choices samples
# WITH replacement, so the same row can appear more than once; confirm
# whether random.sample (distinct rows) was intended.
elements = random.choices(range(len(ds["train"])), k=50)
ds["train"] = ds["train"].select(elements)

# Render each sampled example: a separator, the NER-highlighted text, and
# an expander listing every entity with its Knowledge-Graph description.
for ex in ds["train"]:
    # display_text(ex)
    st.markdown("---")
    display_ner(ex)
    with st.expander("Show entities"):
        for ent in ex["entities"]:
            entity_name = ent["text"]
            entity_type = ent["type"]
            entity_description = ent["kg_result"]["detailedDescription"]["articleBody"]
            st.write(f"{entity_name} ({entity_type}): {entity_description}")