@@ -241,7 +241,6 @@ class ParallelScalingBlock(nn.Module):
         self.fast_attn = hasattr(torch.nn.functional, 'scaled_dot_product_attention')  # FIXME
         mlp_hidden_dim = int(mlp_ratio * dim)
         in_proj_out_dim = mlp_hidden_dim + 3 * dim
-        out_proj_in_dim = mlp_hidden_dim + dim
 
         self.in_norm = norm_layer(dim)
         self.in_proj = nn.Linear(dim, in_proj_out_dim, bias=qkv_bias)