BlackSamorez committed on
Commit
bd1c718
·
1 Parent(s): 34d341b

small fixes and tokenizer config

Browse files
configuration_yalm.py CHANGED
@@ -106,7 +106,7 @@ class YalmConfig(PretrainedConfig):
106
  self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
107
  self.activation_type = activation_type
108
  self.max_position_embeddings = max_position_embeddings
109
- self.apply_residual_connection_post_layernorm = False
110
  self.initializer_range = initializer_range
111
  self.layernorm_epsilon = layernorm_epsilon
112
  self.attention_dropout = attention_dropout
 
106
  self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
107
  self.activation_type = activation_type
108
  self.max_position_embeddings = max_position_embeddings
109
+ self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
110
  self.initializer_range = initializer_range
111
  self.layernorm_epsilon = layernorm_epsilon
112
  self.attention_dropout = attention_dropout
modeling_yalm.py → modelling_yalm.py RENAMED
@@ -327,7 +327,7 @@ class YalmSelfAttention(nn.Module):
327
  attention_scores += attention_mask
328
  attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
329
 
330
- # attention_probs = self.attention_dropout(attention_probs) # TODO: why the fuck no scale???
331
 
332
  # =========================
333
  # Context layer. [sq, b, hp]
@@ -498,9 +498,9 @@ class YalmTransformerLayer(nn.Module):
498
  else:
499
  residual = hidden_states
500
 
501
- # attention_output = torch.nn.functional.dropout(
502
- # attention_output, p=self.hidden_dropout, training=self.training # TODO: why the fuck no scale???
503
- # )
504
  layernorm_input = attention_output + residual
505
 
506
  # Layer norm post the self attention.
@@ -510,9 +510,9 @@ class YalmTransformerLayer(nn.Module):
510
  mlp_output = self.mlp(layernorm_output)
511
  residual = layernorm_input
512
 
513
- # mlp_output = torch.nn.functional.dropout(
514
- # mlp_output, p=self.hidden_dropout, training=self.training # TODO: why the fuck no scale???
515
- # )
516
  output = mlp_output + residual
517
 
518
  if use_cache:
 
327
  attention_scores += attention_mask
328
  attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
329
 
330
+ attention_probs = self.attention_dropout(attention_probs) # TODO: why the fuck no scale???
331
 
332
  # =========================
333
  # Context layer. [sq, b, hp]
 
498
  else:
499
  residual = hidden_states
500
 
501
+ attention_output = torch.nn.functional.dropout(
502
+ attention_output, p=self.hidden_dropout, training=self.training # TODO: why the fuck no scale???
503
+ )
504
  layernorm_input = attention_output + residual
505
 
506
  # Layer norm post the self attention.
 
510
  mlp_output = self.mlp(layernorm_output)
511
  residual = layernorm_input
512
 
513
+ mlp_output = torch.nn.functional.dropout(
514
+ mlp_output, p=self.hidden_dropout, training=self.training # TODO: why the fuck no scale???
515
+ )
516
  output = mlp_output + residual
517
 
518
  if use_cache:
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": ["tokenization_yalm.YalmTokenizer", null]
4
+ },
5
+ "tokenizer_class": "YalmTokenizer",
6
+ "bos_token": "<s>",
7
+ "eos_token": "</s>",
8
+ "unk_token": "<unk>"
9
+ }