BlackSamorez
committed on
Commit
·
bd1c718
1
Parent(s):
34d341b
small fixes and tokenizer config
Browse files
- configuration_yalm.py +1 -1
- modeling_yalm.py → modelling_yalm.py +7 -7
- tokenizer_config.json +9 -0
configuration_yalm.py
CHANGED
@@ -106,7 +106,7 @@ class YalmConfig(PretrainedConfig):
|
|
106 |
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
|
107 |
self.activation_type = activation_type
|
108 |
self.max_position_embeddings = max_position_embeddings
|
109 |
-
self.apply_residual_connection_post_layernorm =
|
110 |
self.initializer_range = initializer_range
|
111 |
self.layernorm_epsilon = layernorm_epsilon
|
112 |
self.attention_dropout = attention_dropout
|
|
|
106 |
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
|
107 |
self.activation_type = activation_type
|
108 |
self.max_position_embeddings = max_position_embeddings
|
109 |
+
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
|
110 |
self.initializer_range = initializer_range
|
111 |
self.layernorm_epsilon = layernorm_epsilon
|
112 |
self.attention_dropout = attention_dropout
|
modeling_yalm.py → modelling_yalm.py
RENAMED
@@ -327,7 +327,7 @@ class YalmSelfAttention(nn.Module):
|
|
327 |
attention_scores += attention_mask
|
328 |
attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
|
329 |
|
330 |
-
|
331 |
|
332 |
# =========================
|
333 |
# Context layer. [sq, b, hp]
|
@@ -498,9 +498,9 @@ class YalmTransformerLayer(nn.Module):
|
|
498 |
else:
|
499 |
residual = hidden_states
|
500 |
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
layernorm_input = attention_output + residual
|
505 |
|
506 |
# Layer norm post the self attention.
|
@@ -510,9 +510,9 @@ class YalmTransformerLayer(nn.Module):
|
|
510 |
mlp_output = self.mlp(layernorm_output)
|
511 |
residual = layernorm_input
|
512 |
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
output = mlp_output + residual
|
517 |
|
518 |
if use_cache:
|
|
|
327 |
attention_scores += attention_mask
|
328 |
attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
|
329 |
|
330 |
+
attention_probs = self.attention_dropout(attention_probs) # TODO: why the fuck no scale???
|
331 |
|
332 |
# =========================
|
333 |
# Context layer. [sq, b, hp]
|
|
|
498 |
else:
|
499 |
residual = hidden_states
|
500 |
|
501 |
+
attention_output = torch.nn.functional.dropout(
|
502 |
+
attention_output, p=self.hidden_dropout, training=self.training # TODO: why the fuck no scale???
|
503 |
+
)
|
504 |
layernorm_input = attention_output + residual
|
505 |
|
506 |
# Layer norm post the self attention.
|
|
|
510 |
mlp_output = self.mlp(layernorm_output)
|
511 |
residual = layernorm_input
|
512 |
|
513 |
+
mlp_output = torch.nn.functional.dropout(
|
514 |
+
mlp_output, p=self.hidden_dropout, training=self.training # TODO: why the fuck no scale???
|
515 |
+
)
|
516 |
output = mlp_output + residual
|
517 |
|
518 |
if use_cache:
|
tokenizer_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoTokenizer": ["tokenization_yalm.YalmTokenizer", null]
|
4 |
+
},
|
5 |
+
"tokenizer_class": "YalmTokenizer",
|
6 |
+
"bos_token": "<s>",
|
7 |
+
"eos_token": "</s>",
|
8 |
+
"unk_token": "<unk>"
|
9 |
+
}
|