Update README.md
Browse files
README.md
CHANGED
@@ -26,7 +26,92 @@ generator = pipeline(
|
|
26 |
tokenizer="cjber/reddit-ner-place_names",
|
27 |
)
|
28 |
|
29 |
-
generator(
|
30 |
"I live in Gothenburg, and long queues aside I definitely prefer the housing situation here compared to Edinburgh."
|
31 |
)
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
tokenizer="cjber/reddit-ner-place_names",
|
27 |
)
|
28 |
|
29 |
+
out = generator(
|
30 |
"I live in Gothenburg, and long queues aside I definitely prefer the housing situation here compared to Edinburgh."
|
31 |
)
|
32 |
+
|
33 |
+
entities = [item["word"] for item in out]
|
34 |
+
labels = [item["entity"] for item in out]
|
35 |
+
```
|
36 |
+
|
37 |
+
The following stages need a mapping between label names and their integer indices:
|
38 |
+
|
39 |
+
```python
|
40 |
+
class Label:
    """Two-way lookup between BILUO tag names and their integer ids."""

    # Tag name -> integer id, numbered in the order the names are listed.
    labels: dict[str, int] = {
        name: position
        for position, name in enumerate(
            ("O", "B-location", "I-location", "L-location", "U-location")
        )
    }

    # Integer id -> tag name (the inverse of ``labels``).
    idx: dict[int, str] = dict(zip(labels.values(), labels.keys()))
    # Number of distinct tags.
    count: int = len(idx)
|
51 |
+
```
|
52 |
+
|
53 |
+
Combine subwords:
|
54 |
+
|
55 |
+
```python
|
56 |
+
def combine_subwords(tokens: list[str], tags: list[int]) -> tuple[list[str], list[str]]:
    """Merge sub-word tokens back into whole words, one tag per word.

    Special tokens (``<s>``, ``<pad>``, ``</s>``) are discarded. A token that
    begins with "Ġ" starts a new word (the marker is stripped from the
    result); any other token is glued onto the word before it. Each word
    keeps the tag of its first sub-token.

    Args:
        tokens: Sub-word tokens, e.g. the ``word`` fields of the pipeline
            output.
        tags: Integer label ids aligned with ``tokens``.

    Returns:
        ``(words, labels)`` where ``labels`` holds the string names looked up
        in ``Label.idx``.

    Raises:
        KeyError: If a tag id is not present in ``Label.idx``.
    """
    special_tokens = ("<s>", "<pad>", "</s>")
    words: list[str] = []
    word_tags: list[int] = []

    for token, tag in zip(tokens, tags):
        if token in special_tokens:
            continue
        if token.startswith("Ġ"):
            words.append(token[1:])
            word_tags.append(tag)
        elif words:
            # Continuation piece: extend the word currently being built.
            words[-1] += token
        else:
            # A leading token without the "Ġ" marker (presumably the first
            # word of the text) is a word start too. The previous backwards
            # loop wrapped around, appended it to the *last* token and then
            # dropped the word altogether.
            words.append(token)
            word_tags.append(tag)

    return words, [Label.idx[tag] for tag in word_tags]
|
74 |
+
|
75 |
+
|
76 |
+
names, labels = combine_subwords(entities, [Label.labels[lb] for lb in labels])
|
77 |
+
```
|
78 |
+
|
79 |
+
Combine BILUO tags:
|
80 |
+
|
81 |
+
```python
|
82 |
+
def combine_biluo(tokens: list[str], tags: list[str]) -> tuple[list[str], list[str]]:
    """Collapse multi-token BILUO spans into single entries.

    Every ``B-`` token absorbs the text of the ``I-``/``L-`` tokens that
    directly follow it (joined with single spaces). All ``I-``/``L-`` entries
    are then removed, and the two-character prefix (``B-``, ``U-``) is
    stripped from the remaining tags; ``"O"`` is kept as is.

    Args:
        tokens: Whole-word tokens.
        tags: BILUO tag strings aligned with ``tokens``.

    Returns:
        ``(tokens, tags)`` after merging, or ``([], [])`` when nothing is
        left. The input lists are not modified.
    """
    merged = list(tokens)
    limit = min(len(merged), len(tags))

    for start, tag in enumerate(tags):
        if tag[0] != "B":
            continue
        follower = start + 1
        # Only I-/L- tags continue a span. The earlier check stopped on
        # B/O alone, so a following U- entity had its text appended to this
        # one while also being emitted as an entry of its own.
        while follower < limit and tags[follower][0] in ("I", "L"):
            merged[start] = f"{merged[start]} {merged[follower]}"
            follower += 1

    kept = [
        (token, tag)
        for token, tag in zip(merged, tags)
        if tag[0] not in ("I", "L")
    ]
    if not kept:
        return [], []

    kept_tokens = [token for token, _ in kept]
    kept_tags = [tag if tag == "O" else tag[2:] for _, tag in kept]
    return kept_tokens, kept_tags
|
106 |
+
|
107 |
+
names, labels = combine_biluo(names, labels)
|
108 |
+
```
|
109 |
+
|
110 |
+
This gives:
|
111 |
+
|
112 |
+
```python
|
113 |
+
>>> names
|
114 |
+
['Gothenburg', 'Edinburgh']
|
115 |
+
|
116 |
+
>>> labels
|
117 |
+
['location', 'location']
|