File size: 2,884 Bytes
c769b40 dd41067 c769b40 c1fb75b c769b40 6337b65 eef9733 6337b65 df1521a 6337b65 df1521a ed15c9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
---
language: en
datasets:
- wnut_17
license: mit
metrics:
- f1
widget:
- text: "My name is Sylvain and I live in Paris"
  example_title: "Parisian"
- text: "My name is Sarah and I live in London"
  example_title: "Londoner"
---
# Reddit NER for place names
## Use in `transformers`
```python
from transformers import pipeline
# Build a token-classification ("ner") pipeline; the same checkpoint name is
# passed for both the model and the tokenizer.
ner = pipeline(
    task="ner",
    model="cjber/reddit-ner-place_names",
    tokenizer="cjber/reddit-ner-place_names",
)

text = "I live in Gothenburg, and long queues aside I definitely prefer the housing situation here compared to Edinburgh."
predictions = ner(text)

# Keep two parallel lists: the token text and the predicted tag of each item
# returned by the pipeline.
entities = [prediction["word"] for prediction in predictions]
labels = [prediction["entity"] for prediction in predictions]
```
The following steps need a mapping between label names and their integer ids:
```python
class Label:
    """Lookup tables between BILUO tag names and their integer ids."""

    # Tag name -> integer id; ids follow the order in which the tags are listed.
    labels: dict[str, int] = {
        tag: index
        for index, tag in enumerate(
            ("O", "B-location", "I-location", "L-location", "U-location")
        )
    }
    # Integer id -> tag name (the inverse of `labels`).
    idx: dict[int, str] = dict(zip(labels.values(), labels.keys()))
    # Number of distinct tags.
    count: int = len(idx)
```
Combine subwords:
```python
def combine_subwords(tokens: list[str], tags: list[int]) -> tuple[list[str], list[str]]:
    """Merge subword tokens into whole words and return them with tag names.

    A token that starts with "Ġ" begins a new word; any other token is a
    continuation piece and is appended to the word before it. Each word keeps
    the tag of its first piece.

    Args:
        tokens: Subword tokens, optionally including the special tokens
            "<s>", "<pad>" and "</s>", which are discarded with their tags.
        tags: Integer tag ids, parallel to ``tokens``.

    Returns:
        A tuple ``(words, tag_names)``: the merged words with the leading "Ġ"
        removed, and the tag name of each word looked up in ``Label.idx``.

    Raises:
        KeyError: If a kept tag id is not a key of ``Label.idx``.
    """
    special_tokens = ("<s>", "<pad>", "</s>")
    words: list[str] = []
    word_tags: list[int] = []
    # Walk strictly left to right so a piece can only ever be attached to the
    # word immediately before it; a negative-index walk from the end wraps
    # around and glues the first token onto the last one.
    for token, tag in zip(tokens, tags):
        if token in special_tokens:
            continue
        if token.startswith("Ġ"):
            words.append(token[1:])
            word_tags.append(tag)
        elif words:
            words[-1] += token
        # A continuation piece with no word start before it is dropped: there
        # is no word it could belong to.
    return words, [Label.idx[tag] for tag in word_tags]
# Map each predicted label name to its integer id via Label.labels, then merge
# the subword tokens into whole words; `labels` is rebound to the tag names
# returned for those words.
names, labels = combine_subwords(entities, [Label.labels[lb] for lb in labels])
```
Combine BILUO tags:
```python
def combine_biluo(tokens: list[str], tags: list[str]) -> tuple[list[str], list[str]]:
    """Collapse BILUO-tagged tokens into one entry per entity.

    A "B-" token opens an entity; following "I-" tokens and a closing "L-"
    token are joined onto it with single spaces. Every other token ("O",
    "U-...") is kept as its own entry. The inputs are not modified.

    Args:
        tokens: Word-level tokens.
        tags: BILUO tag names, parallel to ``tokens`` (e.g. "B-location", "O").

    Returns:
        A tuple ``(merged_tokens, merged_tags)``. Tags lose their two-character
        BILUO prefix ("B-location" -> "location"); "O" is returned unchanged.
        Both lists are empty when nothing is kept.
    """
    merged_tokens: list[str] = []
    merged_tags: list[str] = []
    # True only while the last kept entry is an entity opened by "B" that has
    # not yet been closed by "L" or interrupted by another tag.
    in_entity = False
    for token, tag in zip(tokens, tags):
        prefix = tag[:1]
        if prefix in ("I", "L"):
            if in_entity:
                merged_tokens[-1] = f"{merged_tokens[-1]} {token}"
                # "L" closes the entity, so later I/L tokens must not be
                # attached to it.
                in_entity = prefix == "I"
            # An I/L token with no open entity has nothing to attach to and
            # is dropped.
            continue
        # Any other tag ends the open entity; in particular a "U" token is a
        # complete entity of its own and is never merged into the previous one.
        in_entity = prefix == "B"
        merged_tokens.append(token)
        merged_tags.append(tag if tag == "O" else tag[2:])
    return merged_tokens, merged_tags
# Collapse the BILUO-tagged words into entities, rebinding `names` and `labels`
# to the merged tokens and their tags.
names, labels = combine_biluo(names, labels)
```
This gives:
```python
>>> names
['Gothenburg', 'Edinburgh']
>>> labels
['location', 'location']
```