Fix map tensors

pull/35/head
Alan 2 years ago
parent b7143f03c4
commit 3bce0ec707

@@ -20,9 +20,10 @@
 import sys
 import struct
 import json
+import torch
 import numpy as np
 
-from transformers import GPT2LMHeadModel
+from transformers import GPT2Model
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
@@ -69,8 +70,8 @@ if len(sys.argv) > 2:
     use_f16 = False
     fname_out = sys.argv[1] + "/ggml-model-f32.bin"
 
-model = GPT2LMHeadModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
-# print (model)
+model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
+#print (model)
 
 list_vars = model.state_dict()
 #print (list_vars)
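
Note (not part of the commit): switching from GPT2LMHeadModel to GPT2Model also changes the state-dict key layout, which is what the tensor-name changes further down rely on. A minimal sketch to inspect the keys, using the stock "gpt2" checkpoint purely for illustration:

# Minimal sketch, not part of the commit: compare the key layouts of the two
# model classes. The "gpt2" checkpoint name is only an illustrative assumption.
from transformers import GPT2Model, GPT2LMHeadModel

base_keys = list(GPT2Model.from_pretrained("gpt2").state_dict().keys())
head_keys = list(GPT2LMHeadModel.from_pretrained("gpt2").state_dict().keys())

print(base_keys[:5])  # keys like 'wte.weight', 'wpe.weight', 'h.0.ln_1.weight', ...
print(head_keys[:5])  # same tensors prefixed with 'transformer.', plus 'lm_head.weight'
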
@@ -83,7 +84,7 @@ fout.write(struct.pack("i", hparams["n_positions"]))
 fout.write(struct.pack("i", hparams["n_embd"]))
 fout.write(struct.pack("i", hparams["n_head"]))
 fout.write(struct.pack("i", hparams["n_layer"]))
-# fout.write(struct.pack("i", hparams["rotary_dim"]))
+#fout.write(struct.pack("i", hparams["rotary_dim"]))
 fout.write(struct.pack("i", use_f16))
 
 byte_encoder = bytes_to_unicode()
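
Note (not part of the commit): each struct.pack("i", ...) call writes one 4-byte native-endian integer, so the header fields land in the file in exactly the order shown above. A minimal round-trip sketch, using the published GPT-2 small hyperparameters only as example values:

import struct

# Minimal sketch, not part of the commit: example values only (GPT-2 small).
hparams = {"n_positions": 1024, "n_embd": 768, "n_head": 12, "n_layer": 12}
use_f16 = False

header = b"".join(struct.pack("i", v) for v in (
    hparams["n_positions"], hparams["n_embd"],
    hparams["n_head"], hparams["n_layer"], int(use_f16)))

# Reading the fields back in the same order recovers the values
# that the loader side expects.
print(struct.unpack("5i", header))  # -> (1024, 768, 12, 12, 0)
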
@@ -125,16 +126,8 @@ for name in list_vars.keys():
         ftype = 0
 
     # for efficiency - transpose these matrices:
-    # "transformer.h.*.mlp.fc_in.weight
-    # "transformer.h.*.attn.out_proj.weight
-    # "transformer.h.*.attn.q_proj.weight"
-    # "transformer.h.*.attn.k_proj.weight"
-    # "transformer.h.*.attn.v_proj.weight"
-    if name.endswith(".mlp.fc_in.weight") or \
-       name.endswith(".attn.out_proj.weight") or \
-       name.endswith(".attn.q_proj.weight") or \
-       name.endswith(".attn.k_proj.weight") or \
-       name.endswith(".attn.v_proj.weight"):
+    # "transformer.h.*.mlp.c_proj.weight
+    if name.endswith(".mlp.c_proj.weight"):
         print("  Transposing")
         data = data.transpose()
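
Note (not part of the commit): with the narrowed filter only the MLP output projection is transposed; every other 2-D tensor keeps its original orientation. A small sketch of the filter's behaviour on GPT-2 style key names (names taken from the mapping further down):

import numpy as np

# Minimal sketch, not part of the commit: which names the new filter matches.
names = [
    "h.0.mlp.c_proj.weight",   # matched -> transposed
    "h.0.mlp.c_fc.weight",     # not matched
    "h.0.attn.c_attn.weight",  # not matched
    "wte.weight",              # not matched
]
for name in names:
    print(name, name.endswith(".mlp.c_proj.weight"))

# numpy's transpose() swaps the two axes of a 2-D array
# (shape chosen only for illustration):
data = np.zeros((3072, 768), dtype=np.float32)
print(data.transpose().shape)  # -> (768, 3072)
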

@@ -208,11 +208,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
 
         // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-        model.tensors["model/wte"] = model.wte;
-        model.tensors["model/wpe"] = model.wpe;
+        model.tensors["ln_f.weight"] = model.ln_f_g;
+        model.tensors["ln_f.bias"] = model.ln_f_b;
+        model.tensors["wte.weight"] = model.wte;
+        model.tensors["wpe.weight"] = model.wpe;
 
         for (int i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];
@@ -236,23 +236,23 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
             // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
+            model.tensors["h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
+            model.tensors["h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b;
+            model.tensors["h." + std::to_string(i) + ".ln_2.weight"] = layer.ln_2_g;
+            model.tensors["h." + std::to_string(i) + ".ln_2.bias"] = layer.ln_2_b;
+            model.tensors["h." + std::to_string(i) + ".attn.c_attn.weight"] = layer.c_attn_attn_w;
+            model.tensors["h." + std::to_string(i) + ".attn.c_attn.bias"] = layer.c_attn_attn_b;
+            model.tensors["h." + std::to_string(i) + ".attn.c_proj.weight"] = layer.c_attn_proj_w;
+            model.tensors["h." + std::to_string(i) + ".attn.c_proj.bias"] = layer.c_attn_proj_b;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_fc.weight"] = layer.c_mlp_fc_w;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_fc.bias"] = layer.c_mlp_fc_b;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_proj.weight"] = layer.c_mlp_proj_w_trans;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_proj.bias"] = layer.c_mlp_proj_b;
         }
     }
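
Note (not part of the commit): one way to sanity-check this map is to regenerate the same names on the Python side and verify they all exist in the converter's state dict. A minimal sketch, assuming list_vars and hparams come from the conversion script above:

# Minimal sketch, not part of the commit: rebuild the names registered by the
# C++ loader and check that each one is present in the converted state dict.
expected = ["ln_f.weight", "ln_f.bias", "wte.weight", "wpe.weight"]
per_layer = [
    "ln_1.weight", "ln_1.bias", "ln_2.weight", "ln_2.bias",
    "attn.c_attn.weight", "attn.c_attn.bias",
    "attn.c_proj.weight", "attn.c_proj.bias",
    "mlp.c_fc.weight", "mlp.c_fc.bias",
    "mlp.c_proj.weight", "mlp.c_proj.bias",
]
for i in range(hparams["n_layer"]):
    expected += ["h." + str(i) + "." + s for s in per_layer]

missing = [name for name in expected if name not in list_vars]
print("missing tensors:", missing)  # expected to be empty if the map matches
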
