commit files to HF hub

- .gitattributes +1 -0
- config.json +41 -0
- pytorch_model.bin +3 -0
- ref_seg.py +292 -0
- special_tokens_map.json +15 -0
- tokenizer.json +3 -0
- tokenizer_config.json +20 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,41 @@
+{
+  "_name_or_path": "MrPotato/ref-seg-ger_large_tokenized",
+  "alpha": 0.5,
+  "architectures": [
+    "XLMRobertaForReferenceSegmentation"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "custom_pipelines": {
+    "ref-seg": {
+      "impl": "ref_seg.RefSegPipeline",
+      "pt": [
+        "AutoModelForTokenClassification"
+      ],
+      "tf": [
+        "TFAutoModelForTokenClassification"
+      ]
+    }
+  },
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_labels_first": 29,
+  "num_labels_second": 2,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
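Note: the "custom_pipelines" entry above is what lets transformers resolve the "ref-seg" task name to the RefSegPipeline class shipped in ref_seg.py. A minimal loading sketch, assuming the repo id matches "_name_or_path" (loading Hub-hosted pipeline code always requires trust_remote_code=True):

    from transformers import pipeline

    # "ref-seg" is resolved through config.json's "custom_pipelines" entry;
    # trust_remote_code=True permits executing ref_seg.py from the repo.
    ref_seg = pipeline(
        "ref-seg",
        model="MrPotato/ref-seg-ger_large_tokenized",  # assumed repo id, taken from "_name_or_path"
        trust_remote_code=True,
    )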
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc7dbcc9cd8cad6d81cba90b6b3e510410adfb3c9a8ab28fbca81708bd63688c
+size 2235624885
ref_seg.py ADDED
@@ -0,0 +1,292 @@
+from transformers import AutoTokenizer, XLMRobertaForTokenClassification, Pipeline, AutoModelForTokenClassification, AutoModel, XLMRobertaTokenizerFast
+from tokenizers.pre_tokenizers import Whitespace
+from transformers.pipelines import PIPELINE_REGISTRY
+from itertools import chain
+from colorama import Fore, Back
+from colorama import Style
+import numpy as np
+from transformers.models.xlm_roberta import XLMRobertaPreTrainedModel, XLMRobertaModel
+from transformers.models.roberta import RobertaConfig
+from transformers.modeling_outputs import TokenClassifierOutput
+from transformers import PretrainedConfig
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from typing import List, Optional, Tuple, Union
+
+class RefSegPipeline(Pipeline):
+
+    labels = [
+        'publisher', 'source', 'url', 'other', 'author', 'editor', 'lpage',
+        'volume', 'year', 'issue', 'title', 'fpage', 'edition'
+    ]
+    iob_labels = list(chain.from_iterable([['B-' + x, 'I-' + x] for x in labels])) + ['O']
+    id2seg = {k: v for k, v in enumerate(iob_labels)}
+    id2ref = {k: v for k, v in enumerate(['B-ref', 'I-ref', ])}
+
+    def _sanitize_parameters(self, **kwargs):
+        if "id2seg" in kwargs:
+            self.id2seg = kwargs["id2seg"]
+        if "id2ref" in kwargs:
+            self.id2ref = kwargs["id2ref"]
+        return {}, {}, {}
+
+    def preprocess(self, sentence, offset_mapping=None):
+        model_inputs = self.tokenizer(
+            sentence,
+            return_offsets_mapping=True,
+            padding='max_length',
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
+            return_special_tokens_mask=True,
+            return_overflowing_tokens=True
+        )
+
+        if offset_mapping:
+            model_inputs["offset_mapping"] = offset_mapping
+
+        model_inputs["sentence"] = sentence
+
+        return model_inputs
+
+    def _forward(self, model_inputs):
+        special_tokens_mask = model_inputs.pop("special_tokens_mask")
+        offset_mapping = model_inputs.pop("offset_mapping", None)
+        sentence = model_inputs.pop("sentence")
+        overflow_mapping = model_inputs.pop("overflow_to_sample_mapping")
+        if self.framework == "tf":
+            logits = self.model(model_inputs.data)[0]
+        else:
+            logits = self.model(**model_inputs)[0]
+
+        return {
+            "logits": logits,
+            "special_tokens_mask": special_tokens_mask,
+            "offset_mapping": offset_mapping,
+            "overflow_mapping": overflow_mapping,
+            "sentence": sentence,
+            **model_inputs,
+        }
+
+    def postprocess(self, model_outputs):
+        # if ignore_labels is None:
+        ignore_labels = ["O"]
+        logits_seg = model_outputs["logits"][0].numpy()
+        logits_ref = model_outputs["logits"][1].numpy()
+        sentence = model_outputs["sentence"]
+        input_ids = model_outputs["input_ids"]
+        special_tokens_mask = model_outputs["special_tokens_mask"]
+        overflow_mapping = model_outputs["overflow_mapping"]
+
+        offset_mapping = model_outputs["offset_mapping"] if model_outputs["offset_mapping"] is not None else None
+
+        maxes_seg = np.max(logits_seg, axis=-1, keepdims=True)
+        shifted_exp_seg = np.exp(logits_seg - maxes_seg)
+        scores_seg = shifted_exp_seg / shifted_exp_seg.sum(axis=-1, keepdims=True)
+
+        maxes_ref = np.max(logits_ref, axis=-1, keepdims=True)
+        shifted_exp_ref = np.exp(logits_ref - maxes_ref)
+        scores_ref = shifted_exp_ref / shifted_exp_ref.sum(axis=-1, keepdims=True)
+
+        pre_entities = self.gather_pre_entities(
+            sentence, input_ids, scores_seg, scores_ref, offset_mapping, special_tokens_mask
+        )
+        grouped_entities = self.aggregate(pre_entities)
+
+        cleaned_groups = []
+        for group in grouped_entities:
+            entities = [
+                entity
+                for entity in group
+                if entity.get("entity_group", None) not in ignore_labels
+            ]
+            cleaned_groups.append(entities)
+        return {
+            "number_of_references": len(cleaned_groups),
+            "references": cleaned_groups,
+        }
+
+    def gather_pre_entities(
+        self,
+        sentence: str,
+        input_ids: np.ndarray,
+        scores_seg: np.ndarray,
+        scores_ref: np.ndarray,
+        offset_mappings: Optional[List[Tuple[int, int]]],
+        special_tokens_masks: np.ndarray,
+    ) -> List[dict]:
+        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
+        pre_entities = []
+        for idx_list, (input_id, offset_mapping, special_tokens_mask, s_seg, s_ref) in enumerate(
+                zip(input_ids, offset_mappings, special_tokens_masks, scores_seg, scores_ref)):
+            for idx, iid in enumerate(input_id):
+
+                if special_tokens_mask[idx]:
+                    continue
+
+                word = self.tokenizer.convert_ids_to_tokens(int(input_id[idx]))
+                if offset_mapping is not None:
+                    start_ind, end_ind = offset_mapping[idx]
+                    if not isinstance(start_ind, int):
+                        if self.framework == "pt":
+                            start_ind = start_ind.item()
+                            end_ind = end_ind.item()
+                    word_ref = sentence[start_ind:end_ind]
+                    if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None):
+                        is_subword = len(word) != len(word_ref)
+                    else:
+                        is_subword = len(word) == len(word_ref)
+
+                    if int(input_id[idx]) == self.tokenizer.unk_token_id:
+                        word = word_ref
+                        is_subword = False
+                else:
+                    start_ind = None
+                    end_ind = None
+                    is_subword = False
+
+                pre_entity = {
+                    "word": word,
+                    "scores_seg": s_seg[idx],
+                    "scores_ref": s_ref[idx],
+                    "start": start_ind,
+                    "end": end_ind,
+                    "index": idx,
+                    "is_subword": is_subword,
+                }
+                pre_entities.append(pre_entity)
+        return pre_entities
+
+    def aggregate(self, pre_entities: List[dict]) -> List[dict]:
+        entities = self.aggregate_words(pre_entities)
+
+        return self.group_entities(entities)
+
+    def aggregate_word(self, entities: List[dict]) -> dict:
+        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
+        scores_seg = entities[0]["scores_seg"]
+        idx_seg = scores_seg.argmax()
+        score_seg = scores_seg[idx_seg]
+        entity_seg = self.id2seg[idx_seg]
+
+        scores_ref = np.stack([entity["scores_ref"] for entity in entities])
+        indices_ref = scores_ref.argmax(axis=1)
+        idx_ref = 1 if all(indices_ref) else 0
+        # score_ref = 1
+        entity_ref = self.id2ref[idx_ref]
+
+        new_entity = {
+            "entity_seg": entity_seg,
+            "score_seg": score_seg,
+            "entity_ref": entity_ref,
+            # "score_ref": score_ref,
+            "word": word,
+            "start": entities[0]["start"],
+            "end": entities[-1]["end"],
+        }
+        return new_entity
+
+    def aggregate_words(self, entities: List[dict]) -> List[dict]:
+        """
+        Override tokens from a given word that disagree to force agreement on word boundaries.
+        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
+        company| B-ENT I-ENT
+        """
+        word_entities = []
+        word_group = None
+        for entity in entities:
+            if word_group is None:
+                word_group = [entity]
+            elif entity["is_subword"]:
+                word_group.append(entity)
+            else:
+                word_entities.append(self.aggregate_word(word_group))
+                word_group = [entity]
+        word_entities.append(self.aggregate_word(word_group))
+        return word_entities
+
+    def group_entities(self, entities: List[dict]) -> List[dict]:
+        """
+        Find and group together the adjacent tokens with the same entity predicted.
+        Args:
+            entities (`dict`): The entities predicted by the pipeline.
+        """
+        entity_chunk = []
+        entity_chunk_disagg = []
+
+        for entity in entities:
+            if not entity_chunk_disagg:
+                entity_chunk_disagg.append(entity)
+                continue
+
+            bi_ref, tag_ref = self.get_tag(entity["entity_ref"])
+            last_bi_ref, last_tag_ref = self.get_tag(entity_chunk_disagg[-1]["entity_ref"])
+
+            if tag_ref == last_tag_ref and bi_ref != "B":
+                entity_chunk_disagg.append(entity)
+            else:
+                entity_chunk.append(entity_chunk_disagg)
+                entity_chunk_disagg = [entity]
+
+        if entity_chunk_disagg:
+            entity_chunk.append(entity_chunk_disagg)
+
+        entity_chunks_all = []
+
+        for chunk in entity_chunk:
+
+            entity_groups = []
+            entity_group_disagg = []
+
+            for entity in chunk:
+                if not entity_group_disagg:
+                    entity_group_disagg.append(entity)
+                    continue
+
+                bi_seg, tag_seg = self.get_tag(entity["entity_seg"])
+                last_bi_seg, last_tag_seg = self.get_tag(entity_group_disagg[-1]["entity_seg"])
+
+                if tag_seg == last_tag_seg and bi_seg != "B":
+                    entity_group_disagg.append(entity)
+                else:
+                    entity_groups.append(self.group_sub_entities(entity_group_disagg))
+                    entity_group_disagg = [entity]
+
+            if entity_group_disagg:
+                entity_groups.append(self.group_sub_entities(entity_group_disagg))
+
+            entity_chunks_all.append(entity_groups)
+
+        return entity_chunks_all
+
+    def group_sub_entities(self, entities: List[dict]) -> dict:
+        """
+        Group together the adjacent tokens with the same entity predicted.
+        Args:
+            entities (`dict`): The entities predicted by the pipeline.
+        """
+        entity = entities[0]["entity_seg"].split("-")[-1]
+        scores = np.nanmean([entity["score_seg"] for entity in entities])
+        tokens = [entity["word"] for entity in entities]
+
+        entity_group = {
+            "entity_group": entity,
+            "score": np.mean(scores),
+            "word": " ".join(tokens),
+            "start": entities[0]["start"],
+            "end": entities[-1]["end"],
+        }
+        return entity_group
+
+    def get_tag(self, entity_name: str) -> Tuple[str, str]:
+        if entity_name.startswith("B-"):
+            bi = "B"
+            tag = entity_name[2:]
+        elif entity_name.startswith("I-"):
+            bi = "I"
+            tag = entity_name[2:]
+        else:
+            bi = "I"
+            tag = entity_name
+        return bi, tag
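Note: ref_seg.py imports PIPELINE_REGISTRY but the shown code never calls it; on the Hub the wiring happens through config.json's "custom_pipelines" entry instead. For local experimentation the class can be registered directly; a hedged sketch of the equivalent call (the registry API is standard transformers, everything else as defined above):

    # Local registration sketch; the Hub route via config.json does this implicitly.
    PIPELINE_REGISTRY.register_pipeline(
        "ref-seg",
        pipeline_class=RefSegPipeline,
        pt_model=AutoModelForTokenClassification,
    )

Once constructed, the pipeline's postprocess() returns a dict of the form {"number_of_references": N, "references": [[{"entity_group", "score", "word", "start", "end"}, ...], ...]}, one inner list of field entities per detected reference, with "O" spans filtered out.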
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62c24cdc13d4c9952d63718d6c9fa4c287974249e16b7ade6d5a85e7bbb75626
+size 17082660
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "xlm-roberta-large",
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
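These three tokenizer files are what AutoTokenizer reconstructs on load; a quick sanity-check sketch (repo id assumed, as with config.json above):

    from transformers import AutoTokenizer

    # Reads tokenizer.json, tokenizer_config.json and special_tokens_map.json.
    tok = AutoTokenizer.from_pretrained("MrPotato/ref-seg-ger_large_tokenized")
    print(tok.model_max_length)  # 512, from tokenizer_config.json
    print(tok.mask_token)        # "<mask>"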