MrPotato committed
Commit 8ce7de5 · 1 Parent(s): 8c96438

commit files to HF hub

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "MrPotato/reference-segmentation-xlm-roberta-geocite-v2",
+  "alpha": 0.5,
+  "architectures": [
+    "XLMRobertaForReferenceSegmentation"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_refseg.XLMRobertaRefSegConfig",
+    "AutoModelForTokenClassification": "modeling_refseg.XLMRobertaForReferenceSegmentation"
+  },
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "custom_pipelines": {
+    "ref-seg": {
+      "impl": "ref_seg.RefSegPipeline",
+      "pt": [
+        "AutoModelForTokenClassification"
+      ],
+      "tf": [
+        "TFAutoModelForTokenClassification"
+      ]
+    }
+  },
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "num_labels_first": 27,
+  "num_labels_second": 2,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
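The `auto_map` and `custom_pipelines` entries wire the repository's own Python files into the `transformers` Auto classes, so the custom config and model can be resolved directly from the Hub. A minimal loading sketch, assuming the repo id matches `_name_or_path`; `trust_remote_code=True` is required because the classes live in this repository:

```python
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer

# Assumed hub id, taken from "_name_or_path" above.
repo_id = "MrPotato/reference-segmentation-xlm-roberta-geocite-v2"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)  # -> XLMRobertaRefSegConfig
model = AutoModelForTokenClassification.from_pretrained(repo_id, trust_remote_code=True)  # -> XLMRobertaForReferenceSegmentation
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# The segmentation-specific fields from config.json are available on the config object.
print(config.num_labels_first, config.num_labels_second, config.alpha)  # 27 2 0.5
```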
configuration_refseg.py ADDED
@@ -0,0 +1,110 @@
+from transformers import PretrainedConfig
+
+
+class XLMRobertaRefSegConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`XLMRobertaForReferenceSegmentation`] model. It
+    is used to instantiate an XLM-RoBERTa reference segmentation model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the XLM-RoBERTa
+    [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 250002):
+            Vocabulary size of the XLM-RoBERTa model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`XLMRobertaModel`] or [`TFXLMRobertaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 514):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 1):
+            The vocabulary size of the `token_type_ids` passed when calling [`XLMRobertaModel`] or
+            [`TFXLMRobertaModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+    Examples:
+    ```python
+    >>> from transformers import XLMRobertaConfig, XLMRobertaModel
+    >>> # Initializing a XLM-RoBERTa xlm-roberta-base style configuration
+    >>> configuration = XLMRobertaConfig()
+    >>> # Initializing a model (with random weights) from the xlm-roberta-base style configuration
+    >>> model = XLMRobertaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "xlm-roberta"
+
+    def __init__(
+        self,
+        vocab_size=250002,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        num_labels_first=29,
+        num_labels_second=2,
+        alpha=1.0,
+        **kwargs
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.num_labels_first = num_labels_first
+        self.num_labels_second = num_labels_second
+        self.alpha = alpha
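Beyond the standard XLM-R fields, the config adds `num_labels_first`, `num_labels_second`, and `alpha`, which parameterize the two classification heads and the loss weighting used by the model below. A small usage sketch, assuming the module file is importable from the working directory; the values mirror config.json rather than the signature defaults:

```python
from configuration_refseg import XLMRobertaRefSegConfig  # local module from this repo

config = XLMRobertaRefSegConfig(
    num_labels_first=27,   # IOB labels for the reference-field head (value from config.json)
    num_labels_second=2,   # B-ref / I-ref reference-boundary head
    alpha=0.5,             # weight of the second loss term in the combined loss
)
print(config.model_type)  # "xlm-roberta"
```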
modeling_refseg.py ADDED
@@ -0,0 +1,82 @@
+from transformers.models.xlm_roberta import XLMRobertaPreTrainedModel, XLMRobertaModel
+from transformers.modeling_outputs import TokenClassifierOutput
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from typing import Optional, Tuple, Union
+
+
+class XLMRobertaForReferenceSegmentation(XLMRobertaPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels_first = config.num_labels_first
+        self.num_labels_second = config.num_labels_second
+        self.alpha = config.alpha
+
+        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier_first = nn.Linear(config.hidden_size, self.num_labels_first)
+        self.classifier_second = nn.Linear(config.hidden_size, self.num_labels_second)
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels_first: Optional[torch.LongTensor] = None,
+        labels_second: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels_first, labels_second (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for the segmentation and reference heads, with indices in `[0, ..., config.num_labels_first - 1]` and `[0, ..., config.num_labels_second - 1]` respectively.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output_first = self.dropout(sequence_output)
+        logits_first = self.classifier_first(sequence_output_first)
+
+        sequence_output_second = self.dropout(sequence_output)
+        logits_second = self.classifier_second(sequence_output_second)
+
+        loss = None
+        if labels_first is not None and labels_second is not None:
+            loss_fct_first = CrossEntropyLoss()
+            loss_fct_second = CrossEntropyLoss()
+            loss_first = loss_fct_first(logits_first.view(-1, self.num_labels_first), labels_first.view(-1))
+            loss_second = loss_fct_second(logits_second.view(-1, self.num_labels_second), labels_second.view(-1))
+            loss = loss_first + (self.alpha * loss_second)
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=[logits_first, logits_second],
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
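The model attaches two token-classification heads to the shared encoder output, returns their logits as a list `[logits_first, logits_second]`, and, when both label sets are given, sums the two cross-entropy terms with the second scaled by `config.alpha`. A standalone sketch of that loss combination, with made-up shapes chosen only for illustration:

```python
import torch
from torch.nn import CrossEntropyLoss

# Illustrative shapes: batch of 2 sequences of 8 tokens; label counts from config.json.
batch, seq_len, n_first, n_second, alpha = 2, 8, 27, 2, 0.5
logits_first = torch.randn(batch, seq_len, n_first)     # segmentation head output
logits_second = torch.randn(batch, seq_len, n_second)   # reference-boundary head output
labels_first = torch.randint(0, n_first, (batch, seq_len))
labels_second = torch.randint(0, n_second, (batch, seq_len))

# Mirrors the forward() logic: flatten tokens, compute per-head cross entropy, combine.
loss_first = CrossEntropyLoss()(logits_first.view(-1, n_first), labels_first.view(-1))
loss_second = CrossEntropyLoss()(logits_second.view(-1, n_second), labels_second.view(-1))
loss = loss_first + alpha * loss_second
print(loss.item())
```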
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b084beb63e2011fd34eb19dde325fd75fa8d6141a80f68c6b98782e446526482
+size 1109971957
ref_seg.py ADDED
@@ -0,0 +1,320 @@
+from itertools import chain
+from typing import List, Optional, Tuple
+
+import numpy as np
+from transformers import Pipeline
+
+
+class RefSegPipeline(Pipeline):
+
+    labels = [
+        'publisher', 'source', 'url', 'other', 'author', 'editor', 'lpage',
+        'volume', 'year', 'issue', 'title', 'fpage', 'edition'
+    ]
+    iob_labels = list(chain.from_iterable([['B-' + x, 'I-' + x] for x in labels])) + ['O']
+    id2seg = {k: v for k, v in enumerate(iob_labels)}
+    id2ref = {k: v for k, v in enumerate(['B-ref', 'I-ref', ])}
+    is_split_into_words = False
+
+    def _sanitize_parameters(self, **kwargs):
+        if "id2seg" in kwargs:
+            self.id2seg = kwargs["id2seg"]
+        if "id2ref" in kwargs:
+            self.id2ref = kwargs["id2ref"]
+
+        return {}, {}, {}
+
+    def preprocess(self, sentence, offset_mapping=None, split_into_words=True):
+        tokens = sentence
+        if split_into_words:
+            split_sentence = self.tokenizer.pre_tokenizer.pre_tokenize_str(sentence)
+            tokens, offsets = zip(*split_sentence)
+        model_inputs = self.tokenizer(
+            tokens,
+            return_offsets_mapping=True,
+            padding='max_length',
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
+            return_special_tokens_mask=True,
+            return_overflowing_tokens=True,
+            is_split_into_words=split_into_words,
+            stride=32
+        )
+
+        if offset_mapping:
+            model_inputs["offset_mapping"] = offset_mapping
+
+        model_inputs["sentence"] = sentence
+        model_inputs["token_offsets"] = offsets
+
+        return model_inputs
+
+
+    def _forward(self, model_inputs):
+        special_tokens_mask = model_inputs.pop("special_tokens_mask")
+        offset_mapping = model_inputs.pop("offset_mapping", None)
+        sentence = model_inputs.pop("sentence")
+        token_offsets = model_inputs.pop("token_offsets")
+        overflow_mapping = model_inputs.pop("overflow_to_sample_mapping")
+        if self.framework == "tf":
+            logits = self.model(model_inputs.data)[0]
+        else:
+            logits = self.model(**model_inputs)[0]
+
+        return {
+            "logits": logits,
+            "special_tokens_mask": special_tokens_mask,
+            "offset_mapping": offset_mapping,
+            "overflow_mapping": overflow_mapping,
+            "sentence": sentence,
+            "token_offsets": token_offsets,
+            **model_inputs,
+        }
+
+    def postprocess(self, model_outputs):
+        # if ignore_labels is None:
+        ignore_labels = ["O"]
+        logits_seg = model_outputs["logits"][0].numpy()
+        logits_ref = model_outputs["logits"][1].numpy()
+        sentence = model_outputs["sentence"]
+        token_offsets = model_outputs["token_offsets"]
+        input_ids = model_outputs["input_ids"]
+        special_tokens_mask = model_outputs["special_tokens_mask"]
+
+        offset_mapping = model_outputs["offset_mapping"] if model_outputs["offset_mapping"] is not None else None
+
+        maxes_seg = np.max(logits_seg, axis=-1, keepdims=True)
+        shifted_exp_seg = np.exp(logits_seg - maxes_seg)
+        scores_seg = shifted_exp_seg / shifted_exp_seg.sum(axis=-1, keepdims=True)
+
+        maxes_ref = np.max(logits_ref, axis=-1, keepdims=True)
+        shifted_exp_ref = np.exp(logits_ref - maxes_ref)
+        scores_ref = shifted_exp_ref / shifted_exp_ref.sum(axis=-1, keepdims=True)
+
+        pre_entities = self.gather_pre_entities(
+            input_ids, scores_seg, scores_ref, offset_mapping, special_tokens_mask
+        )
+        grouped_entities = self.aggregate(pre_entities, token_offsets, sentence)
+
+        cleaned_groups = []
+        for group in grouped_entities:
+            start, end = None, None
+            entities = []
+            group_dict = {}
+            for entity in group:
+                if entity.get("entity_group", None) in ignore_labels:
+                    continue
+                if start is None or end is None:
+                    start = entity["start"]
+                    end = entity["end"]
+                else:
+                    start = min(start, entity["start"])
+                    end = max(end, entity["end"])
+                entities.append(entity)
+            if entities:
+                group_dict["reference_raw"] = sentence[start:end]
+                group_dict["entities"] = entities
+                cleaned_groups.append(group_dict)
+
+            # entities = [
+            #     entity
+            #     for entity in group
+            #     if entity.get("entity_group", None) not in ignore_labels
+            # ]
+            # if entities:
+            #     cleaned_groups.append(entities)
+        return {
+            "number_of_references": len(cleaned_groups),
+            "references": cleaned_groups,
+        }
+
+    def gather_pre_entities(
+        self,
+        input_ids: np.ndarray,
+        scores_seg: np.ndarray,
+        scores_ref: np.ndarray,
+        offset_mappings: Optional[List[Tuple[int, int]]],
+        special_tokens_masks: np.ndarray,
+    ) -> List[dict]:
+        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
+        pre_entities = []
+        for idx_list, (input_id, offset_mapping, special_tokens_mask, s_seg, s_ref) in enumerate(
+                zip(input_ids, offset_mappings, special_tokens_masks, scores_seg, scores_ref)):
+            for idx, iid in enumerate(input_id):
+                skip = False
+                if idx_list != 0 and idx <= 32:
+                    skip = True
+
+                if special_tokens_mask[idx]:
+                    continue
+
+                word = self.tokenizer.convert_ids_to_tokens(int(input_id[idx]))
+                if offset_mapping is not None:
+                    start_ind, end_ind = offset_mapping[idx]
+                    if not isinstance(start_ind, int):
+                        if self.framework == "pt":
+                            start_ind = start_ind.item()
+                            end_ind = end_ind.item()
+
+                    is_subword = not word.startswith('\u2581')
+
+                    if int(input_id[idx]) == self.tokenizer.unk_token_id:
+                        is_subword = False
+                else:
+                    start_ind = None
+                    end_ind = None
+                    is_subword = False
+
+                pre_entity = {
+                    "word": word,
+                    "scores_seg": s_seg[idx],
+                    "scores_ref": s_ref[idx],
+                    "start": start_ind,
+                    "end": end_ind,
+                    "index": idx,
+                    "is_subword": is_subword,
+                    "is_stride": skip,
+                }
+                pre_entities.append(pre_entity)
+        return pre_entities
+
+    def aggregate(self, pre_entities: List[dict], token_offsets: List[tuple], sentence: str) -> List[dict]:
+        entities = self.aggregate_words(pre_entities, token_offsets)
+
+        return self.group_entities(entities, sentence)
+
+    def aggregate_word(self, entities: List[dict], token_offset: tuple) -> dict:
+        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
+        scores_seg = entities[0]["scores_seg"]
+        idx_seg = scores_seg.argmax()
+        score_seg = scores_seg[idx_seg]
+        entity_seg = self.id2seg[idx_seg]
+
+        scores_ref = np.stack([entity["scores_ref"] for entity in entities])
+        indices_ref = scores_ref.argmax(axis=1)
+        idx_ref = 1 if all(indices_ref) else 0
+        entity_ref = self.id2ref[idx_ref]
+
+        new_entity = {
+            "entity_seg": entity_seg,
+            "score_seg": score_seg,
+            "entity_ref": entity_ref,
+            "word": word,
+            "start": entities[0]["start"] + token_offset[0],
+            "end": entities[-1]["end"] + token_offset[0],
+        }
+        return new_entity
+
+    def aggregate_words(self, entities: List[dict], token_offsets: List[tuple]) -> List[dict]:
+        """
+        Override tokens from a given word that disagree to force agreement on word boundaries.
+        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
+        company| B-ENT I-ENT
+        """
+        word_entities = []
+        word_group = None
+        idx = 0
+        for entity in entities:
+            if entity["is_stride"]:
+                continue
+            if word_group is None:
+                word_group = [entity]
+            elif entity["is_subword"]:
+                word_group.append(entity)
+            else:
+                word_entities.append(self.aggregate_word(word_group, token_offsets[idx]))
+                word_group = [entity]
+                idx += 1
+        word_entities.append(self.aggregate_word(word_group, token_offsets[idx]))
+        idx += 1
+        return word_entities
+
+    def group_entities(self, entities: List[dict], sentence: str) -> List[dict]:
+        """
+        Find and group together the adjacent tokens with the same entity predicted.
+        Args:
+            entities (`dict`): The entities predicted by the pipeline.
+        """
+        entity_chunk = []
+        entity_chunk_disagg = []
+
+        for entity in entities:
+            if not entity_chunk_disagg:
+                entity_chunk_disagg.append(entity)
+                continue
+
+            bi_ref, tag_ref = self.get_tag(entity["entity_ref"])
+            last_bi_ref, last_tag_ref = self.get_tag(entity_chunk_disagg[-1]["entity_ref"])
+
+            if tag_ref == last_tag_ref and bi_ref != "B":
+                entity_chunk_disagg.append(entity)
+            else:
+                entity_chunk.append(entity_chunk_disagg)
+                entity_chunk_disagg = [entity]
+
+        if entity_chunk_disagg:
+            entity_chunk.append(entity_chunk_disagg)
+
+        entity_chunks_all = []
+
+        for chunk in entity_chunk:
+
+            entity_groups = []
+            entity_group_disagg = []
+
+            for entity in chunk:
+                if not entity_group_disagg:
+                    entity_group_disagg.append(entity)
+                    continue
+
+                bi_seg, tag_seg = self.get_tag(entity["entity_seg"])
+                last_bi_seg, last_tag_seg = self.get_tag(entity_group_disagg[-1]["entity_seg"])
+
+                if tag_seg == last_tag_seg and bi_seg != "B":
+                    entity_group_disagg.append(entity)
+                else:
+                    entity_groups.append(self.group_sub_entities(entity_group_disagg, sentence))
+                    entity_group_disagg = [entity]
+
+            if entity_group_disagg:
+                entity_groups.append(self.group_sub_entities(entity_group_disagg, sentence))
+
+            entity_chunks_all.append(entity_groups)
+
+        return entity_chunks_all
+
+    def group_sub_entities(self, entities: List[dict], sentence: str) -> dict:
+        """
+        Group together the adjacent tokens with the same entity predicted.
+        Args:
+            entities (`dict`): The entities predicted by the pipeline.
+        """
+        entity = entities[0]["entity_seg"].split("-")[-1]
+        scores = np.nanmean([entity["score_seg"] for entity in entities])
+        start = min([entity["start"] for entity in entities])
+        end = max([entity["end"] for entity in entities])
+        word = sentence[start:end]
+
+
+
+        entity_group = {
+            "entity_group": entity,
+            "score": np.mean(scores),
+            "word": word,
+            "start": entities[0]["start"],
+            "end": entities[-1]["end"],
+        }
+        return entity_group
+
+    def get_tag(self, entity_name: str) -> Tuple[str, str]:
+        if entity_name.startswith("B-"):
+            bi = "B"
+            tag = entity_name[2:]
+        elif entity_name.startswith("I-"):
+            bi = "I"
+            tag = entity_name[2:]
+        else:
+            bi = "I"
+            tag = entity_name
+        return bi, tag
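The pipeline's `postprocess` returns a dict with `number_of_references` and a `references` list, each entry carrying the raw reference span and its grouped entities. A hedged end-to-end sketch, assuming the hub id from `_name_or_path` and the `ref-seg` task registered via `custom_pipelines` in config.json; the input string is a made-up reference for illustration:

```python
from transformers import pipeline

# trust_remote_code is needed so that RefSegPipeline is loaded from this repository.
ref_seg = pipeline(
    "ref-seg",
    model="MrPotato/reference-segmentation-xlm-roberta-geocite-v2",
    trust_remote_code=True,
)

result = ref_seg("Smith, J. (2020). An example title. Example Press.")
print(result["number_of_references"])
for ref in result["references"]:
    # Each reference has the raw text span plus its field-level entity groups.
    print(ref["reference_raw"], [e["entity_group"] for e in ref["entities"]])
```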
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62c24cdc13d4c9952d63718d6c9fa4c287974249e16b7ade6d5a85e7bbb75626
+size 17082660
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "xlm-roberta-base",
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}