Use approximate GELU from PyTorch
Browse files
- modeling_norbert.py +1 -3
modeling_norbert.py
CHANGED
|
@@ -8,7 +8,6 @@ from torch.utils import checkpoint
|
|
| 8 |
|
| 9 |
from .configuration_norbert import NorbertConfig
|
| 10 |
from transformers.modeling_utils import PreTrainedModel
|
| 11 |
-
from transformers.activations import gelu_new
|
| 12 |
from transformers.modeling_outputs import (
|
| 13 |
MaskedLMOutput,
|
| 14 |
MultipleChoiceModelOutput,
|
|
@@ -17,7 +16,6 @@ from transformers.modeling_outputs import (
|
|
| 17 |
TokenClassifierOutput,
|
| 18 |
BaseModelOutput
|
| 19 |
)
|
| 20 |
-
from transformers.pytorch_utils import softmax_backward_data
|
| 21 |
|
| 22 |
|
| 23 |
class Encoder(nn.Module):
|
|
@@ -81,7 +79,7 @@ class EncoderLayer(nn.Module):
|
|
| 81 |
class GeGLU(nn.Module):
|
| 82 |
def forward(self, x):
|
| 83 |
x, gate = x.chunk(2, dim=-1)
|
| 84 |
-
x = x * gelu_new(gate)
|
| 85 |
return x
|
| 86 |
|
| 87 |
|
|
|
|
| 8 |
|
| 9 |
from .configuration_norbert import NorbertConfig
|
| 10 |
from transformers.modeling_utils import PreTrainedModel
|
|
|
|
| 11 |
from transformers.modeling_outputs import (
|
| 12 |
MaskedLMOutput,
|
| 13 |
MultipleChoiceModelOutput,
|
|
|
|
| 16 |
TokenClassifierOutput,
|
| 17 |
BaseModelOutput
|
| 18 |
)
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class Encoder(nn.Module):
|
|
|
|
| 79 |
class GeGLU(nn.Module):
|
| 80 |
def forward(self, x):
|
| 81 |
x, gate = x.chunk(2, dim=-1)
|
| 82 |
+
x = x * F.gelu(gate, approximate="tanh")
|
| 83 |
return x
|
| 84 |
|
| 85 |
|