gpt : avoid ggml_transpose on model tensors (new models!)

Georgi Gerganov 2 years ago
parent e052167772
commit 86b1e356b0
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

@@ -53,8 +53,10 @@ def convert_to_ftype(data, ftype):
     assert False, "Invalid ftype: " + str(ftype)
 
-if len(sys.argv) < 2:
-    print("Usage: convert-ckpt-to-ggml.py dir-model [use-f32]\n")
+if len(sys.argv) < 3:
+    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
     sys.exit(1)
 
 # output in the same directory as the model
@@ -70,8 +72,6 @@ with open(dir_model + "/hparams.json", "r") as f:
 # possible data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
-#   ftype == 2 -> qint4_0
-#   ftype == 3 -> qint4_1
 #
 # map from ftype to string
 ftype_str = ["f32", "f16"]
@@ -113,7 +113,14 @@ for name, shape in list_vars:
     n_dims = len(data.shape);
 
     # for efficiency - transpose the projection matrices
-    if name[-13:] == "/mlp/c_proj/w":
+    # "model/h.*/attn/c_attn/w"
+    # "model/h.*/attn/c_proj/w"
+    # "model/h.*/mlp/c_fc/w"
+    # "model/h.*/mlp/c_proj/w"
+    if name[-14:] == "/attn/c_attn/w" or \
+       name[-14:] == "/attn/c_proj/w" or \
+       name[-11:] == "/mlp/c_fc/w" or \
+       name[-13:] == "/mlp/c_proj/w":
         print("  Transposing")
         data = data.transpose()
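
For context on the hunk above: ggml_mul_mat(a, b) expects both operands to agree on their first dimension ne[0] and produces a node of shape [a->ne[1], b->ne[1]]. The converter writes the numpy dims in reverse order (the struct.pack("i", data.shape[n_dims - 1 - i]) line visible in the GPT-J script further down), so transposing these projection matrices before writing flips their stored shape from, e.g., [3*n_embd, n_embd] to [n_embd, 3*n_embd], which is exactly what gpt2_model_load now allocates and what ggml_mul_mat can take without a transpose. Below is a minimal standalone sketch of that shape rule, built against ggml.h; the sizes, memory budget and names (w, cur, out) are made up for illustration and are not taken from the example code.

#include <cstdio>

#include "ggml.h"

int main(void) {
    // small scratch context, just enough to create a few tensor headers
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 768; // GPT-2 117M embedding size
    const int N      = 4;   // pretend batch of 4 tokens

    // weight as the converter now writes it: ne = [n_embd, 3*n_embd]
    struct ggml_tensor * w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 3*n_embd);
    // activations, as in gpt2_eval: ne = [n_embd, N]
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N);

    // ne[0] of both operands already matches, so no ggml_transpose is needed;
    // the result node comes out as [3*n_embd, N] == [2304, N]
    struct ggml_tensor * out = ggml_mul_mat(ctx, w, cur);
    printf("out: %d x %d\n", (int) out->ne[0], (int) out->ne[1]);

    ggml_free(ctx);
    return 0;
}

Building the node is enough to see the resulting shape; no graph computation is needed for this check.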

@@ -128,7 +128,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         }
     }
 
-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_type wtype = GGML_TYPE_COUNT;
     switch (model.hparams.f16) {
@@ -237,13 +237,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
             layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
             layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
 
             layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
             layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
             layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
 
             layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
@@ -461,7 +461,7 @@ bool gpt2_eval(
         // [2304, N]
         {
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                    model.layers[il].c_attn_attn_w,
                     cur);
 
             cur = ggml_add(ctx0,
@@ -568,7 +568,7 @@ bool gpt2_eval(
         // [768, N]
         {
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                     cur);
 
             cur = ggml_add(ctx0,
@@ -605,7 +605,7 @@ bool gpt2_eval(
             // cur = fc_w*cur + fc_b
             // [3072, N]
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                     cur);
 
             cur = ggml_add(ctx0,
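
These gpt2_eval hunks are the payoff of the new layout: every projection weight already has ne[0] == n_embd, so the graph no longer wraps each weight in ggml_transpose on every evaluation (c_mlp_proj_w_trans is left as it was). Here is a standalone before/after sketch, again with made-up sizes and names rather than the example's own code, showing that both constructions yield the same [3*n_embd, N] node and the new one simply skips the extra transposed view.

#include <cstdio>

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 768;
    const int N      = 4;

    struct ggml_tensor * cur   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N);
    struct ggml_tensor * w_old = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3*n_embd, n_embd); // old on-disk layout
    struct ggml_tensor * w_new = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 3*n_embd); // new on-disk layout

    // old path: a transposed view of the weight had to be added to the graph
    // on every evaluation, just to line its ne[0] up with cur's ne[0]
    struct ggml_tensor * out_old = ggml_mul_mat(ctx, ggml_transpose(ctx, w_old), cur);

    // new path: the stored layout already matches, so the weight is passed directly
    struct ggml_tensor * out_new = ggml_mul_mat(ctx, w_new, cur);

    // both nodes have shape [3*n_embd, N]; the new one just avoids the extra view
    printf("old: %d x %d   new: %d x %d\n",
           (int) out_old->ne[0], (int) out_old->ne[1],
           (int) out_new->ne[0], (int) out_new->ne[1]);

    ggml_free(ctx);
    return 0;
}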

@@ -47,8 +47,10 @@ def bytes_to_unicode():
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
 
-if len(sys.argv) < 2:
+if len(sys.argv) < 3:
     print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
     sys.exit(1)
 
 # output in the same directory as the model
@@ -64,11 +66,21 @@ with open(dir_model + "/added_tokens.json", "r") as f:
 with open(dir_model + "/config.json", "r") as f:
     hparams = json.load(f)
 
-# use 16-bit or 32-bit floats
-use_f16 = True
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
 if len(sys.argv) > 2:
-    use_f16 = False
-    fname_out = sys.argv[1] + "/ggml-model-f32.bin"
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
 
 model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
 #print (model)
@@ -85,7 +97,7 @@ fout.write(struct.pack("i", hparams["n_embd"]))
 fout.write(struct.pack("i", hparams["n_head"]))
 fout.write(struct.pack("i", hparams["n_layer"]))
 fout.write(struct.pack("i", hparams["rotary_dim"]))
-fout.write(struct.pack("i", use_f16))
+fout.write(struct.pack("i", ftype))
 
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v:k for k, v in byte_encoder.items()}
@@ -114,34 +126,35 @@ for name in list_vars.keys():
     n_dims = len(data.shape);
 
     # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
+    ftype_cur = 0;
+    if ftype != 0:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
             data = data.astype(np.float16)
-            ftype = 1
+            ftype_cur = 1
         else:
             print("  Converting to float32")
            data = data.astype(np.float32)
-            ftype = 0
+            ftype_cur = 0
 
     # for efficiency - transpose these matrices:
+    # (note - with latest ggml this is no longer more efficient, so disabling it)
     # "transformer.h.*.mlp.fc_in.weight"
     # "transformer.h.*.attn.out_proj.weight"
     # "transformer.h.*.attn.q_proj.weight"
     # "transformer.h.*.attn.k_proj.weight"
     # "transformer.h.*.attn.v_proj.weight"
-    if name.endswith(".mlp.fc_in.weight") or \
-       name.endswith(".attn.out_proj.weight") or \
-       name.endswith(".attn.q_proj.weight") or \
-       name.endswith(".attn.k_proj.weight") or \
-       name.endswith(".attn.v_proj.weight"):
-        print("  Transposing")
-        data = data.transpose()
+    #if name.endswith(".mlp.fc_in.weight") or \
+    #   name.endswith(".attn.out_proj.weight") or \
+    #   name.endswith(".attn.q_proj.weight") or \
+    #   name.endswith(".attn.k_proj.weight") or \
+    #   name.endswith(".attn.v_proj.weight"):
+    #    print("  Transposing")
+    #    data = data.transpose()
 
     # header
     str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
     for i in range(n_dims):
         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
     fout.write(str);
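
In the GPT-J converter the ftype handling now mirrors the GPT-2 script: an explicit ftype argument (0 -> float32, 1 -> float16) selects the output file name and is written into the hparams, and each tensor record carries its own ftype_cur, followed by the dims in reverse order, the raw name bytes, and then the data. The per-weight transpose is commented out: the new note says it is no longer more efficient, and the gptj_model_load / gptj_eval hunks below now expect the untransposed layout anyway. Below is a rough sketch of how a loader might walk one such tensor record, for illustration only; read_tensor_header is a hypothetical helper, not the actual gptj_model_load code, and it assumes the stream is already positioned past the magic, hparams and vocab.

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>

// Read one tensor record as the converter above writes it: three int32 values
// (n_dims, length of the name, per-tensor ftype), then n_dims int32 dims in
// reverse order, then the raw name bytes, then the tensor data itself.
static bool read_tensor_header(std::ifstream & fin) {
    int32_t n_dims    = 0;
    int32_t name_len  = 0;
    int32_t ftype_cur = 0;

    fin.read(reinterpret_cast<char *>(&n_dims),    sizeof(n_dims));
    fin.read(reinterpret_cast<char *>(&name_len),  sizeof(name_len));
    fin.read(reinterpret_cast<char *>(&ftype_cur), sizeof(ftype_cur));
    if (!fin || n_dims < 1 || n_dims > 2) {
        return false; // end of file, read error, or unexpected rank
    }

    int32_t ne[2] = { 1, 1 };
    for (int i = 0; i < n_dims; ++i) {
        fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
    }

    std::string name;
    name.resize(name_len);
    fin.read(&name[0], name_len);

    // ftype_cur selects the element type of this tensor's data:
    // 0 -> float32, 1 -> float16 (only 2-D ".weight" tensors are converted)
    printf("%-48s ne = [%d, %d]  ftype = %d\n", name.c_str(), ne[0], ne[1], ftype_cur);

    // skipping over the data block itself is left out of this sketch
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        printf("usage: %s ggml-model-f16.bin\n", argv[0]);
        return 1;
    }
    std::ifstream fin(argv[1], std::ios::binary);
    // NOTE: assumes fin is already positioned at the first tensor record;
    // a real loader parses the magic, hparams and vocab first
    read_tensor_header(fin);
    return 0;
}

Since the stored shapes of these weights changed, files produced by the old converters will no longer match the loaders' shape checks, which is presumably why the commit title warns "(new models!)": existing converted files need to be regenerated.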

@@ -245,7 +245,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 
-            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
             layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
 
             layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
@@ -459,9 +459,9 @@ bool gptj_eval(
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_q_proj_w), cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_k_proj_w), cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_v_proj_w), cur);
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
+            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
 
             // store key and value to memory
             if (N >= 1) {
@@ -529,7 +529,7 @@ bool gptj_eval(
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                     cur);
         }
 
@@ -540,7 +540,7 @@ bool gptj_eval(
         {
             // note here we pass inpSA instead of cur
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                     inpSA);
 
             cur = ggml_add(ctx0,
