pytorch-image-models/convert/convert_nest_flax.py

"""
Convert weights from https://github.com/google-research/nested-transformer
NOTE: You'll need https://github.com/google/CommonLoopUtils, not included in requirements.txt
"""
import numpy as np
import torch
from clu import checkpoint


arch_depths = {
    # Number of transformer blocks at each of the three hierarchy levels
    'nest_base': [2, 2, 20],
    'nest_small': [2, 2, 20],
    'nest_tiny': [2, 2, 8],
}


def convert_nest(checkpoint_path, arch):
    """
    Expects path to checkpoint which is a dir containing 4 files like in each of these folders
        - https://console.cloud.google.com/storage/browser/gresearch/nest-checkpoints
    `arch` is needed to look up the per-level block counts in `arch_depths`
    Returns a state dict that can be used with `torch.nn.Module.load_state_dict`
    Hint: Follow timm.models.nest.Nest.__init__ and
        https://github.com/google-research/nested-transformer/blob/main/models/nest_net.py
    """
    assert arch in ['nest_base', 'nest_small', 'nest_tiny'], "Your `arch` is not supported"

    flax_dict = checkpoint.load_state_dict(checkpoint_path)['optimizer']['target']
    state_dict = {}

    # Patch embedding (Flax conv kernels are HWIO; PyTorch expects OIHW)
    state_dict['patch_embed.proj.weight'] = torch.tensor(
        flax_dict['PatchEmbedding_0']['Conv_0']['kernel']).permute(3, 2, 0, 1)
    state_dict['patch_embed.proj.bias'] = torch.tensor(flax_dict['PatchEmbedding_0']['Conv_0']['bias'])

    # Positional embeddings (one per hierarchy level)
    posemb_keys = [k for k in flax_dict.keys() if k.startswith('PositionEmbedding')]
    for i, k in enumerate(posemb_keys):
        state_dict[f'pos_embed_{i}'] = torch.tensor(flax_dict[k]['pos_embedding'])
    # Transformer encoders
    depths = arch_depths[arch]
    for level in range(len(depths)):
        for layer in range(depths[level]):
            global_layer_ix = sum(depths[:level]) + layer
            # Norms
            for i in range(2):
                state_dict[f'ls_transformer_encoder.{level}.{layer}.norm{i+1}.weight'] = torch.tensor(
                    flax_dict[f'EncoderNDBlock_{global_layer_ix}'][f'LayerNorm_{i}']['scale'])
                state_dict[f'ls_transformer_encoder.{level}.{layer}.norm{i+1}.bias'] = torch.tensor(
                    flax_dict[f'EncoderNDBlock_{global_layer_ix}'][f'LayerNorm_{i}']['bias'])
            # Attention qkv
            w_q = flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MultiHeadAttention_0']['DenseGeneral_0']['kernel']
            w_kv = flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MultiHeadAttention_0']['DenseGeneral_1']['kernel']
            # k and v are packed on the last axis of the kv kernel: split them apart, then
            # stack along the head axis so the merged kernel is ordered [q-heads, k-heads, v-heads]
            w_kv = np.concatenate(np.split(w_kv, 2, -1), 1)
            w_qkv = np.concatenate([w_q, w_kv], 1)
            # (dim, 3 * heads, head_dim) -> (dim, 3 * dim) -> transpose to (out, in) for nn.Linear
            state_dict[f'ls_transformer_encoder.{level}.{layer}.attn.qkv.weight'] = torch.tensor(w_qkv).flatten(1).permute(1, 0)
            b_q = flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MultiHeadAttention_0']['DenseGeneral_0']['bias']
            b_kv = flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MultiHeadAttention_0']['DenseGeneral_1']['bias']
            # Same split-and-stack for the biases, this time along the leading (head) axis
            b_kv = np.concatenate(np.split(b_kv, 2, -1), 0)
            b_qkv = np.concatenate([b_q, b_kv], 0)
            state_dict[f'ls_transformer_encoder.{level}.{layer}.attn.qkv.bias'] = torch.tensor(b_qkv).reshape(-1)
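            # Worked example of the merge above for the first level of nest_base
            # (dim 128, 4 heads of size 32; illustrative numbers, not read from the checkpoint):
            #   w_q:  (128, 4, 32)    w_kv: (128, 4, 64)
            #   np.split(w_kv, 2, -1)        -> two (128, 4, 32) halves (k and v)
            #   np.concatenate(..., 1)       -> (128, 8, 32)
            #   np.concatenate([w_q, .], 1)  -> (128, 12, 32)
            #   .flatten(1).permute(1, 0)    -> (384, 128), the shape of qkv.weight
            #   b_q: (4, 32), b_kv: (4, 64)  -> b_qkv: (12, 32) -> reshape(-1) -> (384,)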
            # Attention proj
            w_proj = flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MultiHeadAttention_0']['proj_kernel']
            w_proj = torch.tensor(w_proj).permute(2, 1, 0).flatten(1)
            state_dict[f'ls_transformer_encoder.{level}.{layer}.attn.proj.weight'] = w_proj
            state_dict[f'ls_transformer_encoder.{level}.{layer}.attn.proj.bias'] = torch.tensor(
                flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MultiHeadAttention_0']['bias'])
            # MLP
            for i in range(2):
                state_dict[f'ls_transformer_encoder.{level}.{layer}.mlp.fc{i+1}.weight'] = torch.tensor(
                    flax_dict[f'EncoderNDBlock_{global_layer_ix}']['MlpBlock_0'][f'Dense_{i}']['kernel']).permute(1, 0)
                state_dict[f'ls_transformer_encoder.{level}.{layer}.mlp.fc{i+1}.bias'] = torch.tensor(
                    flax_dict[f'EncoderNDBlock_{global_layer_ix}'][f'MlpBlock_0'][f'Dense_{i}']['bias'])
    # Block aggregations (one between each consecutive pair of levels)
    for level in range(len(depths) - 1):
        # Convs
        state_dict[f'ls_block_aggregation.{level}.conv.weight'] = torch.tensor(
            flax_dict[f'ConvPool_{level}']['Conv_0']['kernel']).permute(3, 2, 0, 1)
        state_dict[f'ls_block_aggregation.{level}.conv.bias'] = torch.tensor(
            flax_dict[f'ConvPool_{level}']['Conv_0']['bias'])
        # Norms
        state_dict[f'ls_block_aggregation.{level}.norm.weight'] = torch.tensor(
            flax_dict[f'ConvPool_{level}']['LayerNorm_0']['scale'])
        state_dict[f'ls_block_aggregation.{level}.norm.bias'] = torch.tensor(
            flax_dict[f'ConvPool_{level}']['LayerNorm_0']['bias'])
    # Final norm
    state_dict['norm.weight'] = torch.tensor(flax_dict['LayerNorm_0']['scale'])
    state_dict['norm.bias'] = torch.tensor(flax_dict['LayerNorm_0']['bias'])

    # Classifier
    state_dict['head.weight'] = torch.tensor(flax_dict['Dense_0']['kernel']).permute(1, 0)
    state_dict['head.bias'] = torch.tensor(flax_dict['Dense_0']['bias'])

    return state_dict


if __name__ == '__main__':
    variant = 'base'
    state_dict = convert_nest(f'../nested-transformer/checkpoints/nest-{variant[0]}_imagenet', f'nest_{variant}')
    torch.save(state_dict, f'jx_nest_{variant}.pth')
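    # Optional sanity check, a sketch rather than part of the original script: try loading
    # the converted weights into a timm NesT model before trusting the artifact. The model
    # name and parameter names must match your timm version, so treat this as illustrative:
    # import timm
    # model = timm.create_model(f'nest_{variant}', pretrained=False)
    # missing, unexpected = model.load_state_dict(state_dict, strict=False)
    # print('missing:', missing, 'unexpected:', unexpected)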