MrPotato committed
Commit 8ce7de5 · 1 Parent(s): 8c96438

commit files to HF hub

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "MrPotato/reference-segmentation-xlm-roberta-geocite-v2",
+  "alpha": 0.5,
+  "architectures": [
+    "XLMRobertaForReferenceSegmentation"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_refseg.XLMRobertaRefSegConfig",
+    "AutoModelForTokenClassification": "modeling_refseg.XLMRobertaForReferenceSegmentation"
+  },
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "custom_pipelines": {
+    "ref-seg": {
+      "impl": "ref_seg.RefSegPipeline",
+      "pt": [
+        "AutoModelForTokenClassification"
+      ],
+      "tf": [
+        "TFAutoModelForTokenClassification"
+      ]
+    }
+  },
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "num_labels_first": 27,
+  "num_labels_second": 2,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
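The `auto_map` and `custom_pipelines` entries wire the repository's own Python files into the `transformers` Auto classes, so the custom config and model can be resolved directly from the Hub. A minimal loading sketch, assuming the repo id matches `_name_or_path`; `trust_remote_code=True` is required because the classes live in this repository:

```python
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer

# Assumed hub id, taken from "_name_or_path" above.
repo_id = "MrPotato/reference-segmentation-xlm-roberta-geocite-v2"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)  # -> XLMRobertaRefSegConfig
model = AutoModelForTokenClassification.from_pretrained(repo_id, trust_remote_code=True)  # -> XLMRobertaForReferenceSegmentation
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# The segmentation-specific fields from config.json are available on the config object.
print(config.num_labels_first, config.num_labels_second, config.alpha)  # 27 2 0.5
```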
configuration_refseg.py ADDED
@@ -0,0 +1,110 @@
+from transformers import PretrainedConfig
+
+
+class XLMRobertaRefSegConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`XLMRobertaForReferenceSegmentation`] model. It
+    is used to instantiate an XLM-RoBERTa reference segmentation model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the XLM-RoBERTa
+    [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 250002):
+            Vocabulary size of the XLM-RoBERTa model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`XLMRobertaModel`] or [`TFXLMRobertaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 514):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 1):
+            The vocabulary size of the `token_type_ids` passed when calling [`XLMRobertaModel`] or
+            [`TFXLMRobertaModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+    Examples:
+    ```python
+    >>> from transformers import XLMRobertaConfig, XLMRobertaModel
+    >>> # Initializing a XLM-RoBERTa xlm-roberta-base style configuration
+    >>> configuration = XLMRobertaConfig()
+    >>> # Initializing a model (with random weights) from the xlm-roberta-base style configuration
+    >>> model = XLMRobertaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "xlm-roberta"
+
+    def __init__(
+        self,
+        vocab_size=250002,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        num_labels_first=29,
+        num_labels_second=2,
+        alpha=1.0,
+        **kwargs
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.num_labels_first = num_labels_first
+        self.num_labels_second = num_labels_second
+        self.alpha = alpha
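Beyond the standard XLM-R fields, the config adds `num_labels_first`, `num_labels_second`, and `alpha`, which parameterize the two classification heads and the loss weighting used by the model below. A small usage sketch, assuming the module file is importable from the working directory; the values mirror config.json rather than the signature defaults:

```python
from configuration_refseg import XLMRobertaRefSegConfig  # local module from this repo

config = XLMRobertaRefSegConfig(
    num_labels_first=27,   # IOB labels for the reference-field head (value from config.json)
    num_labels_second=2,   # B-ref / I-ref reference-boundary head
    alpha=0.5,             # weight of the second loss term in the combined loss
)
print(config.model_type)  # "xlm-roberta"
```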
modeling_refseg.py ADDED
@@ -0,0 +1,82 @@
+from transformers.models.xlm_roberta import XLMRobertaPreTrainedModel, XLMRobertaModel
+from transformers.modeling_outputs import TokenClassifierOutput
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from typing import Optional, Tuple, Union
+
+
+class XLMRobertaForReferenceSegmentation(XLMRobertaPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels_first = config.num_labels_first
+        self.num_labels_second = config.num_labels_second
+        self.alpha = config.alpha
+
+        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier_first = nn.Linear(config.hidden_size, self.num_labels_first)
+        self.classifier_second = nn.Linear(config.hidden_size, self.num_labels_second)
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels_first: Optional[torch.LongTensor] = None,
+        labels_second: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels_first, labels_second (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for the segmentation and reference heads, with indices in `[0, ..., config.num_labels_first - 1]` and `[0, ..., config.num_labels_second - 1]` respectively.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output_first = self.dropout(sequence_output)
+        logits_first = self.classifier_first(sequence_output_first)
+
+        sequence_output_second = self.dropout(sequence_output)
+        logits_second = self.classifier_second(sequence_output_second)
+
+        loss = None
+        if labels_first is not None and labels_second is not None:
+            loss_fct_first = CrossEntropyLoss()
+            loss_fct_second = CrossEntropyLoss()
+            loss_first = loss_fct_first(logits_first.view(-1, self.num_labels_first), labels_first.view(-1))
+            loss_second = loss_fct_second(logits_second.view(-1, self.num_labels_second), labels_second.view(-1))
+            loss = loss_first + (self.alpha * loss_second)
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=[logits_first, logits_second],
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
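The model attaches two token-classification heads to the shared encoder output, returns their logits as a list `[logits_first, logits_second]`, and, when both label sets are given, sums the two cross-entropy terms with the second scaled by `config.alpha`. A standalone sketch of that loss combination, with made-up shapes chosen only for illustration:

```python
import torch
from torch.nn import CrossEntropyLoss

# Illustrative shapes: batch of 2 sequences of 8 tokens; label counts from config.json.
batch, seq_len, n_first, n_second, alpha = 2, 8, 27, 2, 0.5
logits_first = torch.randn(batch, seq_len, n_first)     # segmentation head output
logits_second = torch.randn(batch, seq_len, n_second)   # reference-boundary head output
labels_first = torch.randint(0, n_first, (batch, seq_len))
labels_second = torch.randint(0, n_second, (batch, seq_len))

# Mirrors the forward() logic: flatten tokens, compute per-head cross entropy, combine.
loss_first = CrossEntropyLoss()(logits_first.view(-1, n_first), labels_first.view(-1))
loss_second = CrossEntropyLoss()(logits_second.view(-1, n_second), labels_second.view(-1))
loss = loss_first + alpha * loss_second
print(loss.item())
```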
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b084beb63e2011fd34eb19dde325fd75fa8d6141a80f68c6b98782e446526482
+size 1109971957
ref_seg.py ADDED
@@ -0,0 +1,320 @@
+from itertools import chain
+from typing import List, Optional, Tuple
+
+import numpy as np
+from transformers import Pipeline
+
+
+class RefSegPipeline(Pipeline):
+
+    labels = [
+        'publisher', 'source', 'url', 'other', 'author', 'editor', 'lpage',
+        'volume', 'year', 'issue', 'title', 'fpage', 'edition'
+    ]
+    iob_labels = list(chain.from_iterable([['B-' + x, 'I-' + x] for x in labels])) + ['O']
+    id2seg = {k: v for k, v in enumerate(iob_labels)}
+    id2ref = {k: v for k, v in enumerate(['B-ref', 'I-ref', ])}
+    is_split_into_words = False
+
+    def _sanitize_parameters(self, **kwargs):
+        if "id2seg" in kwargs:
+            self.id2seg = kwargs["id2seg"]
+        if "id2ref" in kwargs:
+            self.id2ref = kwargs["id2ref"]
+
+        return {}, {}, {}
+
+    def preprocess(self, sentence, offset_mapping=None, split_into_words=True):
+        tokens = sentence
+        if split_into_words:
+            split_sentence = self.tokenizer.pre_tokenizer.pre_tokenize_str(sentence)
+            tokens, offsets = zip(*split_sentence)
+        model_inputs = self.tokenizer(
+            tokens,
+            return_offsets_mapping=True,
+            padding='max_length',
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
+            return_special_tokens_mask=True,
+            return_overflowing_tokens=True,
+            is_split_into_words=split_into_words,
+            stride=32
+        )
+
+        if offset_mapping:
+            model_inputs["offset_mapping"] = offset_mapping
+
+        model_inputs["sentence"] = sentence
+        model_inputs["token_offsets"] = offsets
+
+        return model_inputs
+
+
+    def _forward(self, model_inputs):
+        special_tokens_mask = model_inputs.pop("special_tokens_mask")
+        offset_mapping = model_inputs.pop("offset_mapping", None)
+        sentence = model_inputs.pop("sentence")
+        token_offsets = model_inputs.pop("token_offsets")
+        overflow_mapping = model_inputs.pop("overflow_to_sample_mapping")
+        if self.framework == "tf":
+            logits = self.model(model_inputs.data)[0]
+        else:
+            logits = self.model(**model_inputs)[0]
+
+        return {
+            "logits": logits,
+            "special_tokens_mask": special_tokens_mask,
+            "offset_mapping": offset_mapping,
+            "overflow_mapping": overflow_mapping,
+            "sentence": sentence,
+            "token_offsets": token_offsets,
+            **model_inputs,
+        }
+
+    def postprocess(self, model_outputs):
+        # if ignore_labels is None:
+        ignore_labels = ["O"]
+        logits_seg = model_outputs["logits"][0].numpy()
+        logits_ref = model_outputs["logits"][1].numpy()
+        sentence = model_outputs["sentence"]
+        token_offsets = model_outputs["token_offsets"]
+        input_ids = model_outputs["input_ids"]
+        special_tokens_mask = model_outputs["special_tokens_mask"]
+
+        offset_mapping = model_outputs["offset_mapping"] if model_outputs["offset_mapping"] is not None else None
+
+        maxes_seg = np.max(logits_seg, axis=-1, keepdims=True)
+        shifted_exp_seg = np.exp(logits_seg - maxes_seg)
+        scores_seg = shifted_exp_seg / shifted_exp_seg.sum(axis=-1, keepdims=True)
+
+        maxes_ref = np.max(logits_ref, axis=-1, keepdims=True)
+        shifted_exp_ref = np.exp(logits_ref - maxes_ref)
+        scores_ref = shifted_exp_ref / shifted_exp_ref.sum(axis=-1, keepdims=True)
+
+        pre_entities = self.gather_pre_entities(
+            input_ids, scores_seg, scores_ref, offset_mapping, special_tokens_mask
+        )
+        grouped_entities = self.aggregate(pre_entities, token_offsets, sentence)
+
+        cleaned_groups = []
+        for group in grouped_entities:
+            start, end = None, None
+            entities = []
+            group_dict = {}
+            for entity in group:
+                if entity.get("entity_group", None) in ignore_labels:
+                    continue
+                if start is None or end is None:
+                    start = entity["start"]
+                    end = entity["end"]
+                else:
+                    start = min(start, entity["start"])
+                    end = max(end, entity["end"])
+                entities.append(entity)
+            if entities:
+                group_dict["reference_raw"] = sentence[start:end]
+                group_dict["entities"] = entities
+                cleaned_groups.append(group_dict)
+
+            # entities = [
+            #     entity
+            #     for entity in group
+            #     if entity.get("entity_group", None) not in ignore_labels
+            # ]
+            # if entities:
+            #     cleaned_groups.append(entities)
+        return {
+            "number_of_references": len(cleaned_groups),
+            "references": cleaned_groups,
+        }
+
+    def gather_pre_entities(
+        self,
+        input_ids: np.ndarray,
+        scores_seg: np.ndarray,
+        scores_ref: np.ndarray,
+        offset_mappings: Optional[List[Tuple[int, int]]],
+        special_tokens_masks: np.ndarray,
+    ) -> List[dict]:
+        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
+        pre_entities = []
+        for idx_list, (input_id, offset_mapping, special_tokens_mask, s_seg, s_ref) in enumerate(
+                zip(input_ids, offset_mappings, special_tokens_masks, scores_seg, scores_ref)):
+            for idx, iid in enumerate(input_id):
+                skip = False
+                if idx_list != 0 and idx <= 32:
+                    skip = True
+
+                if special_tokens_mask[idx]:
+                    continue
+
+                word = self.tokenizer.convert_ids_to_tokens(int(input_id[idx]))
+                if offset_mapping is not None:
+                    start_ind, end_ind = offset_mapping[idx]
+                    if not isinstance(start_ind, int):
+                        if self.framework == "pt":
+                            start_ind = start_ind.item()
+                            end_ind = end_ind.item()
+
+                    is_subword = not word.startswith('\u2581')
+
+                    if int(input_id[idx]) == self.tokenizer.unk_token_id:
+                        is_subword = False
+                else:
+                    start_ind = None
+                    end_ind = None
+                    is_subword = False
+
+                pre_entity = {
+                    "word": word,
+                    "scores_seg": s_seg[idx],
+                    "scores_ref": s_ref[idx],
+                    "start": start_ind,
+                    "end": end_ind,
+                    "index": idx,
+                    "is_subword": is_subword,
+                    "is_stride": skip,
+                }
+                pre_entities.append(pre_entity)
+        return pre_entities
+
+    def aggregate(self, pre_entities: List[dict], token_offsets: List[tuple], sentence: str) -> List[dict]:
+        entities = self.aggregate_words(pre_entities, token_offsets)
+
+        return self.group_entities(entities, sentence)
+
+    def aggregate_word(self, entities: List[dict], token_offset: tuple) -> dict:
+        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
+        scores_seg = entities[0]["scores_seg"]
+        idx_seg = scores_seg.argmax()
+        score_seg = scores_seg[idx_seg]
+        entity_seg = self.id2seg[idx_seg]
+
+        scores_ref = np.stack([entity["scores_ref"] for entity in entities])
+        indices_ref = scores_ref.argmax(axis=1)
+        idx_ref = 1 if all(indices_ref) else 0
+        entity_ref = self.id2ref[idx_ref]
+
+        new_entity = {
+            "entity_seg": entity_seg,
+            "score_seg": score_seg,
+            "entity_ref": entity_ref,
+            "word": word,
+            "start": entities[0]["start"] + token_offset[0],
+            "end": entities[-1]["end"] + token_offset[0],
+        }
+        return new_entity
+
+    def aggregate_words(self, entities: List[dict], token_offsets: List[tuple]) -> List[dict]:
+        """
+        Override tokens from a given word that disagree to force agreement on word boundaries.
+        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
+        company| B-ENT I-ENT
+        """
+        word_entities = []
+        word_group = None
+        idx = 0
+        for entity in entities:
+            if entity["is_stride"]:
+                continue
+            if word_group is None:
+                word_group = [entity]
+            elif entity["is_subword"]:
+                word_group.append(entity)
+            else:
+                word_entities.append(self.aggregate_word(word_group, token_offsets[idx]))
+                word_group = [entity]
+                idx += 1
+        word_entities.append(self.aggregate_word(word_group, token_offsets[idx]))
+        idx += 1
+        return word_entities
+
+    def group_entities(self, entities: List[dict], sentence: str) -> List[dict]:
+        """
+        Find and group together the adjacent tokens with the same entity predicted.
+        Args:
+            entities (`dict`): The entities predicted by the pipeline.
+        """
+        entity_chunk = []
+        entity_chunk_disagg = []
+
+        for entity in entities:
+            if not entity_chunk_disagg:
+                entity_chunk_disagg.append(entity)
+                continue
+
+            bi_ref, tag_ref = self.get_tag(entity["entity_ref"])
+            last_bi_ref, last_tag_ref = self.get_tag(entity_chunk_disagg[-1]["entity_ref"])
+
+            if tag_ref == last_tag_ref and bi_ref != "B":
+                entity_chunk_disagg.append(entity)
+            else:
+                entity_chunk.append(entity_chunk_disagg)
+                entity_chunk_disagg = [entity]
+
+        if entity_chunk_disagg:
+            entity_chunk.append(entity_chunk_disagg)
+
+        entity_chunks_all = []
+
+        for chunk in entity_chunk:
+
+            entity_groups = []
+            entity_group_disagg = []
+
+            for entity in chunk:
+                if not entity_group_disagg:
+                    entity_group_disagg.append(entity)
+                    continue
+
+                bi_seg, tag_seg = self.get_tag(entity["entity_seg"])
+                last_bi_seg, last_tag_seg = self.get_tag(entity_group_disagg[-1]["entity_seg"])
+
+                if tag_seg == last_tag_seg and bi_seg != "B":
+                    entity_group_disagg.append(entity)
+                else:
+                    entity_groups.append(self.group_sub_entities(entity_group_disagg, sentence))
+                    entity_group_disagg = [entity]
+
+            if entity_group_disagg:
+                entity_groups.append(self.group_sub_entities(entity_group_disagg, sentence))
+
+            entity_chunks_all.append(entity_groups)
+
+        return entity_chunks_all
+
+    def group_sub_entities(self, entities: List[dict], sentence: str) -> dict:
+        """
+        Group together the adjacent tokens with the same entity predicted.
+        Args:
+            entities (`dict`): The entities predicted by the pipeline.
+        """
+        entity = entities[0]["entity_seg"].split("-")[-1]
+        scores = np.nanmean([entity["score_seg"] for entity in entities])
+        start = min([entity["start"] for entity in entities])
+        end = max([entity["end"] for entity in entities])
+        word = sentence[start:end]
+
+
+
+        entity_group = {
+            "entity_group": entity,
+            "score": np.mean(scores),
+            "word": word,
+            "start": entities[0]["start"],
+            "end": entities[-1]["end"],
+        }
+        return entity_group
+
+    def get_tag(self, entity_name: str) -> Tuple[str, str]:
+        if entity_name.startswith("B-"):
+            bi = "B"
+            tag = entity_name[2:]
+        elif entity_name.startswith("I-"):
+            bi = "I"
+            tag = entity_name[2:]
+        else:
+            bi = "I"
+            tag = entity_name
+        return bi, tag
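The pipeline's `postprocess` returns a dict with `number_of_references` and a `references` list, each entry carrying the raw reference span and its grouped entities. A hedged end-to-end sketch, assuming the hub id from `_name_or_path` and the `ref-seg` task registered via `custom_pipelines` in config.json; the input string is a made-up reference for illustration:

```python
from transformers import pipeline

# trust_remote_code is needed so that RefSegPipeline is loaded from this repository.
ref_seg = pipeline(
    "ref-seg",
    model="MrPotato/reference-segmentation-xlm-roberta-geocite-v2",
    trust_remote_code=True,
)

result = ref_seg("Smith, J. (2020). An example title. Example Press.")
print(result["number_of_references"])
for ref in result["references"]:
    # Each reference has the raw text span plus its field-level entity groups.
    print(ref["reference_raw"], [e["entity_group"] for e in ref["entities"]])
```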
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62c24cdc13d4c9952d63718d6c9fa4c287974249e16b7ade6d5a85e7bbb75626
+size 17082660
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "xlm-roberta-base",
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}