davda54 committed on
Commit
f1bcb7e
·
verified ·
1 Parent(s): ebc0bd8

Simplified the model by always computing batch-first

Browse files
Files changed (1) hide show
  1. modeling_norbert.py +1 -1
modeling_norbert.py CHANGED
@@ -125,7 +125,7 @@ class Attention(nn.Module):
125
  - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
126
  position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
127
  position_indices = config.position_bucket_size - 1 + position_indices
128
- self.register_buffer("position_indices", position_indices, persistent=False)
129
 
130
  self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
131
  self.scale = 1.0 / math.sqrt(3 * self.head_size)
 
125
  - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
126
  position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
127
  position_indices = config.position_bucket_size - 1 + position_indices
128
+ self.register_buffer("position_indices", position_indices.contiguous(), persistent=False)
129
 
130
  self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
131
  self.scale = 1.0 / math.sqrt(3 * self.head_size)