Simplified the model by always computing batch-first
modeling_norbert.py (+1 -1)
@@ -125,7 +125,7 @@ class Attention(nn.Module):
             - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
         position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
         position_indices = config.position_bucket_size - 1 + position_indices
-        self.register_buffer("position_indices", position_indices, persistent=False)
+        self.register_buffer("position_indices", position_indices.contiguous(), persistent=False)
 
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.scale = 1.0 / math.sqrt(3 * self.head_size)
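
For context, a minimal sketch of the buffer this hunk touches, assuming only what the diff shows: a plain clamp stands in for make_log_bucket_position (whose body is not part of this hunk), and max_pos / bucket_size are toy values.

import torch

max_pos, bucket_size = 8, 4

# Pairwise relative distances via broadcasting: entry [i, j] = i - j.
position_indices = torch.arange(max_pos, dtype=torch.long).unsqueeze(1) \
    - torch.arange(max_pos, dtype=torch.long).unsqueeze(0)

# Stand-in for make_log_bucket_position: squash distances into
# [-(bucket_size - 1), bucket_size - 1].
position_indices = position_indices.clamp(-(bucket_size - 1), bucket_size - 1)

# Shift into the non-negative range [0, 2 * bucket_size - 2], mirroring
# `config.position_bucket_size - 1 + position_indices` above.
position_indices = bucket_size - 1 + position_indices

# The change in this commit: register the buffer as .contiguous().
buffer = position_indices.contiguous()
print(buffer.shape, buffer.is_contiguous())  # torch.Size([8, 8]) True

Since .contiguous() is a no-op on a tensor that is already compact in memory, the call is free in the common case; it simply guarantees the layout of the registered buffer before any downstream indexing in the attention module.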