OlivierDehaene committed on
Commit
7d2ded6
·
1 Parent(s): 979206f

use torch.nn.functional.gelu instead

Browse files
Files changed (1) hide show
  1. modeling_gpt2_mq.py +2 -14
modeling_gpt2_mq.py CHANGED
@@ -71,26 +71,14 @@ def prepare_attn_mask(
71
  return combined_attention_mask
72
 
73
 
74
- @torch.jit.script
75
- def gelu_forward(x: torch.Tensor) -> torch.Tensor:
76
- """
77
- Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
78
- make the model jitable.
79
-
80
- Args:
81
- x (`torch.tensor`, *required*):
82
- input hidden states
83
- """
84
- return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
85
-
86
-
87
  class LinearGPT2MLP(nn.Module):
88
  def __init__(self, intermediate_size, config):
89
  super().__init__()
90
  embed_dim = config.hidden_size
91
  self.c_fc = nn.Linear(embed_dim, intermediate_size)
92
  self.c_proj = nn.Linear(intermediate_size, embed_dim)
93
- self.act = ACT2FN[config.activation_function] if "gelu" not in config.activation_function else gelu_forward
 
94
  self.dropout = nn.Dropout(config.resid_pdrop)
95
 
96
  def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
 
71
  return combined_attention_mask
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  class LinearGPT2MLP(nn.Module):
75
  def __init__(self, intermediate_size, config):
76
  super().__init__()
77
  embed_dim = config.hidden_size
78
  self.c_fc = nn.Linear(embed_dim, intermediate_size)
79
  self.c_proj = nn.Linear(intermediate_size, embed_dim)
80
+ self.act = ACT2FN[config.activation_function] if "gelu" not in config.activation_function else lambda \
81
+ x: torch.nn.functional.gelu(x, approximate="tanh")
82
  self.dropout = nn.Dropout(config.resid_pdrop)
83
 
84
  def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor: