kevinkrahn committed • Commit ee1ab63 • 1 Parent(s): 07cb18b

Add new SentenceTransformer model.

Files changed:
- README.md (+29 -66)
- config.json (+1 -1)
- model.safetensors (+1 -1)
- modeling_hlm.py (+19 -1)
- tokenization_hlm.py (+10 -16)
README.md
CHANGED
@@ -6,30 +6,24 @@ tags:
 - feature-extraction
 - sentence-similarity
 - transformers
-- semantic-search
 
 ---
 
-# shlm-grc-en
+# kevinkrahn/shlm-grc-en
 
-
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
 
-
-
-The base model uses a modified version of the HLM architecture described in [Heidelberg-Boston @ SIGTYP 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers](https://aclanthology.org/2024.sigtyp-1.16/)
-
-This model is trained to produce sentence embeddings using the multilingual knowledge distillation method and datasets described in [Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge Distillation](https://aclanthology.org/2023.alp-1.2/).
-
-This model was distilled from `BAAI/bge-base-en-v1.5` for embedding English and Ancient Greek text.
+<!--- Describe your model here -->
 
 ## Usage (Sentence-Transformers)
 
-
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
 
-
-
+```
+pip install -U sentence-transformers
+```
 
-
+Then you can use the model like this:
 
 ```python
 from sentence_transformers import SentenceTransformer
@@ -50,7 +44,7 @@ from transformers import AutoTokenizer, AutoModel
 import torch
 
 
-def cls_pooling(model_output):
+def cls_pooling(model_output, attention_mask):
     return model_output[0][:,0]
 
 
@@ -58,8 +52,8 @@ def cls_pooling(model_output):
 sentences = ['This is an example sentence', 'Each sentence is converted']
 
 # Load model from HuggingFace Hub
-
-
+tokenizer = AutoTokenizer.from_pretrained('kevinkrahn/shlm-grc-en')
+model = AutoModel.from_pretrained('kevinkrahn/shlm-grc-en')
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
@@ -69,61 +63,30 @@ with torch.no_grad():
     model_output = model(**encoded_input)
 
 # Perform pooling. In this case, cls pooling.
-sentence_embeddings = cls_pooling(model_output)
+sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])
 
 print("Sentence embeddings:")
 print(sentence_embeddings)
-
 ```
 
-## Citing & Authors
 
+
+## Evaluation Results
+
+<!--- Describe how your model was evaluated -->
+
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=kevinkrahn/shlm-grc-en)
+
+
+
+## Full Model Architecture
 ```
-
-
-
-
-editor = "Hahn, Michael and
-    Sorokin, Alexey and
-    Kumar, Ritesh and
-    Shcherbakov, Andreas and
-    Otmakhova, Yulia and
-    Yang, Jinrui and
-    Serikov, Oleg and
-    Rani, Priya and
-    Ponti, Edoardo M. and
-    Murado{\u{g}}lu, Saliha and
-    Gao, Rena and
-    Cotterell, Ryan and
-    Vylomova, Ekaterina",
-booktitle = "Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
-month = mar,
-year = "2024",
-address = "St. Julian's, Malta",
-publisher = "Association for Computational Linguistics",
-url = "https://aclanthology.org/2024.sigtyp-1.16",
-pages = "131--141",
-}
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: HLMModel
+  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+)
 ```
 
-
-
-
-author = "Krahn, Kevin and
-    Tate, Derrick and
-    Lamicela, Andrew C.",
-editor = "Anderson, Adam and
-    Gordin, Shai and
-    Li, Bin and
-    Liu, Yudong and
-    Passarotti, Marco C.",
-booktitle = "Proceedings of the Ancient Language Processing Workshop",
-month = sep,
-year = "2023",
-address = "Varna, Bulgaria",
-publisher = "INCOMA Ltd., Shoumen, Bulgaria",
-url = "https://aclanthology.org/2023.alp-1.2",
-pages = "13--22",
-}
-
-```
+## Citing & Authors
+
+<!--- Describe where people can find more information -->
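The raw-transformers example above keeps cls pooling, which matches the `pooling_mode_cls_token: True` setting in the new Full Model Architecture block. As a complement, here is a minimal sketch of the sentence-transformers path the new card describes. The `trust_remote_code=True` argument is an assumption on my part, since HLMModel and HLMTokenizer ship as custom code in this repo (modeling_hlm.py, tokenization_hlm.py) and current library versions require an explicit opt-in to run it:

```python
# Sketch only, not part of the commit: assumes sentence-transformers >= 2.3
# and that loading this repo's custom HLM code requires trust_remote_code=True.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('kevinkrahn/shlm-grc-en', trust_remote_code=True)

sentences = ['This is an example sentence', 'Each sentence is converted']
embeddings = model.encode(sentences)

print(embeddings.shape)  # expect (2, 768): the card advertises 768-dimensional vectors
```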
config.json
CHANGED
@@ -26,7 +26,7 @@
   "pad_token_id": 0,
   "residual_word_embedding": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.46.1",
   "type_vocab_size": 2,
   "vocab_size": 512
 }
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c673386ded6f25a1ec74e3cf31b244f46099d196877d3b0c949bd2c7f1e482ef
 size 379310632
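The oid in a Git LFS pointer is the SHA-256 of the stored payload (the 379 MB weights file itself is not in the diff), so the new hash can be verified locally after download. A hypothetical helper, not part of this repo:

```python
# Verifies a downloaded model.safetensors against the oid in the LFS pointer.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

# sha256_of("model.safetensors") should print
# "c673386ded6f25a1ec74e3cf31b244f46099d196877d3b0c949bd2c7f1e482ef"
```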
modeling_hlm.py
CHANGED
@@ -27,6 +27,8 @@ class HLMBaseModelOutput(ModelOutput):
 
 
 class HLMEncoder(nn.Module):
+    _dynamic_tied_weights_keys = []
+
     def __init__(self, config) -> None:
         super().__init__()
 
@@ -38,6 +40,17 @@ class HLMEncoder(nn.Module):
                 TransformerBlock(config, bias=i in sandwich_indices) for i in range(config.num_hidden_layers)])
             for i in range(config.sandwich_size):
                 self.layers[sandwich_start_index + i*2+1].make_sandwich(self.layers[sandwich_start_index + i*2])
+                tied_weights_keys = [
+                    'q.weight',
+                    'k.weight',
+                    'v.weight',
+                    'att_proj_linear.weight',
+                    'ff_linear_1.weight',
+                    'ff_linear_2.weight',
+                    'ff_linear_3.weight',
+                ]
+                for key in tied_weights_keys:
+                    self._dynamic_tied_weights_keys.append(f'layers.{sandwich_start_index + i*2}.{key}')
         else:
             self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])
 
@@ -62,8 +75,9 @@ class HLMEncoder(nn.Module):
     def forward(self, hidden_states, attention_mask, freqs_cos, freqs_sin, return_dict=True, output_hidden_states=False):
         all_hidden_states = []
         attn_mask = self._get_attention_mask(attention_mask, hidden_states.dtype)
-        for layer in self.layers:
+        for i, layer in enumerate(self.layers):
             hidden_states = layer(hidden_states, attn_mask, freqs_cos, freqs_sin)
+            #print(f'layer: {i}, bias: {layer.has_bias}, {hidden_states[0][0][0:2]}')
             all_hidden_states.append(hidden_states)
 
         if return_dict:
@@ -86,6 +100,7 @@ class HLMPreTrainedModel(PreTrainedModel):
     base_model_prefix = "hlm"
     _keys_to_ignore_on_load_unexpected = []
     supports_gradient_checkpointing = True
+    _supports_param_buffer_assignment = False
 
     def _init_weights(self, module):
         """Initialize the weights."""
@@ -293,6 +308,9 @@ class TransformerBlock(nn.Module):
     def make_sandwich(self, other):
         assert self.has_bias
         assert not other.has_bias
+
+        # TODO: change this to support buffers, because it breaks if _supports_param_buffer_assignment == True
+        # introduced in transformers 4.43 PR: https://github.com/huggingface/transformers/pull/31771
         self.q.weight = other.q.weight
         self.k.weight = other.k.weight
         self.v.weight = other.v.weight
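For context on these changes: `make_sandwich` ties each sandwich layer pair's projection weights by assigning the same `nn.Parameter` object to both modules, and the TODO in the last hunk notes that the fast loading path introduced in transformers 4.43 (PR #31771) can assign checkpoint tensors as buffers, which silently breaks identity-based ties; the new `_dynamic_tied_weights_keys` bookkeeping and `_supports_param_buffer_assignment = False` opt the model out of that path. A standalone sketch of the tying mechanism, not this repo's code:

```python
# Illustration of identity-based weight tying between two modules, and why a
# loader that assigns fresh tensors per key would break it.
import torch
import torch.nn as nn

biased = nn.Linear(8, 8, bias=True)     # stands in for the sandwich layer with bias
unbiased = nn.Linear(8, 8, bias=False)  # stands in for its bias-free partner

biased.weight = unbiased.weight         # tie: both modules now share one nn.Parameter
assert biased.weight is unbiased.weight

with torch.no_grad():
    unbiased.weight.fill_(0.25)
print(torch.equal(biased.weight, unbiased.weight))  # True: one update, seen by both

# A checkpoint loader that writes a separate tensor into each state-dict key
# (instead of reusing the shared Parameter) leaves the two weights as distinct
# objects again — the failure mode the declared tied-weights keys guard against.
```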
tokenization_hlm.py
CHANGED
@@ -59,8 +59,6 @@ class HLMTokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names: List[str] = ["input_ids", "char_input_mask", "word_input_mask", "word_type_ids"]
-    padding_side: str = "right"
-    truncation_side: str = "right"
 
     def __init__(
         self,
@@ -116,7 +114,7 @@ class HLMTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
         self.unk_id = self.vocab["[UNK]"]
-        self.word_cls_token = word_cls_token
+        self.word_cls_token = word_cls_token
         self.word_cls_token_id = self._convert_token_to_id(word_cls_token)
         self.label_pad_token_id = -100
         self.special_ids = [self._convert_token_to_id(token) for token in vocab_data["special_tokens"]]
@@ -374,7 +372,7 @@ class HLMTokenizer(PreTrainedTokenizer):
         encoded_inputs["word_type_ids"] = self.create_token_type_ids_from_sequences(ids, pair_ids, add_special_tokens)
         assert len(encoded_inputs["word_type_ids"]) == len(encoded_inputs["word_input_mask"])
 
-        # Always pad words
+        # Always pad words
         for word in encoded_inputs["input_ids"]:
             if len(word) < self.max_word_length:
                 word.extend([self.pad_token_id] * (self.max_word_length - len(word)))
@@ -394,7 +392,7 @@ class HLMTokenizer(PreTrainedTokenizer):
         )
 
         return batch_outputs
-
+
     def _encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput, EncodedInput],
@@ -552,7 +550,7 @@ class HLMTokenizer(PreTrainedTokenizer):
 
         return BatchEncoding(batch_outputs)
 
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, split_long_words: bool = True) -> List[List[str]]:
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, split_long_words: bool = True, **kwargs) -> List[List[str]]:
         text = unicodedata.normalize('NFKC', text)
         if split_long_words:
             tokenized_text = []
@@ -580,6 +578,7 @@ class HLMTokenizer(PreTrainedTokenizer):
         return_tensors: Optional[Union[str, TensorType]] = None,
         #label_pad_token_id=-100,
         verbose: bool = True,
+        **kwargs
     ) -> BatchEncoding:
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
@@ -630,7 +629,7 @@ class HLMTokenizer(PreTrainedTokenizer):
 
         batch_outputs["word_input_mask"] = \
             [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs['word_input_mask']]
-
+
         if "word_type_ids" in encoded_inputs:
             batch_outputs["word_type_ids"] = [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs["word_type_ids"]]
 
@@ -652,13 +651,8 @@ class HLMTokenizer(PreTrainedTokenizer):
                 continue
             labels = encoded_inputs[label_name]
             label_pad_word = [[self.label_pad_token_id]*self.max_word_length]
-
-
-
-
-            else:
-                batch_outputs[label_name] = [
-                    label_pad_word * (longest_in_batch - len(label)) + to_list(label) for label in labels
-                ]
-
+            batch_outputs[label_name] = [
+                to_list(label) + label_pad_word * (longest_in_batch - len(label)) for label in labels
+            ]
+
         return BatchEncoding(batch_outputs, tensor_type=return_tensors)
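The final hunk is the substantive fix here: the surviving removed lines show an else-branch that prepended `label_pad_word` entries (left padding), while `input_ids`, `word_input_mask`, and `word_type_ids` are all padded on the right earlier in the method; the replacement always appends the pad words instead, keeping labels aligned with the inputs. A toy illustration of the new arithmetic, with invented shapes and values:

```python
# Toy numbers (not from the repo) showing label padding after this commit:
# short label sequences gain [-100, ...] pad-words on the right, which the
# loss function ignores, mirroring the right-padding of input_ids.
label_pad_token_id = -100
max_word_length = 4
label_pad_word = [[label_pad_token_id] * max_word_length]

labels = [[7, 2, 3, 0], [5, 9, 0, 0]]  # one sequence of 2 words, chars padded to 4
longest_in_batch = 4                   # longest sequence in the batch, in words

padded = labels + label_pad_word * (longest_in_batch - len(labels))
print(padded)
# [[7, 2, 3, 0], [5, 9, 0, 0], [-100, -100, -100, -100], [-100, -100, -100, -100]]
```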