gpt : avoid ggml_transpose on model tensors (new models!)

gq
Georgi Gerganov 1 year ago
parent e052167772
commit 86b1e356b0
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

@ -53,8 +53,10 @@ def convert_to_ftype(data, ftype):
assert False, "Invalid ftype: " + str(ftype)
if len(sys.argv) < 2:
print("Usage: convert-ckpt-to-ggml.py dir-model [use-f32]\n")
if len(sys.argv) < 3:
print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
@ -70,8 +72,6 @@ with open(dir_model + "/hparams.json", "r") as f:
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# ftype == 2 -> qint4_0
# ftype == 3 -> qint4_1
#
# map from ftype to string
ftype_str = ["f32", "f16"]
@ -113,7 +113,14 @@ for name, shape in list_vars:
n_dims = len(data.shape);
# for efficiency - transpose the projection matrices
if name[-13:] == "/mlp/c_proj/w":
# "model/h.*/attn/c_attn/w"
# "model/h.*/attn/c_proj/w"
# "model/h.*/mlp/c_fc/w"
# "model/h.*/mlp/c_proj/w"
if name[-14:] == "/attn/c_attn/w" or \
name[-14:] == "/attn/c_proj/w" or \
name[-11:] == "/mlp/c_fc/w" or \
name[-13:] == "/mlp/c_proj/w":
print(" Transposing")
data = data.transpose()

@ -128,7 +128,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}
}
// for the big tensors, we have the option to store the data in 16-bit floats
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = GGML_TYPE_COUNT;
switch (model.hparams.f16) {
@ -237,13 +237,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
@ -461,7 +461,7 @@ bool gpt2_eval(
// [2304, N]
{
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
model.layers[il].c_attn_attn_w,
cur);
cur = ggml_add(ctx0,
@ -568,7 +568,7 @@ bool gpt2_eval(
// [768, N]
{
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
model.layers[il].c_attn_proj_w,
cur);
cur = ggml_add(ctx0,
@ -605,7 +605,7 @@ bool gpt2_eval(
// cur = fc_w*cur + fc_b
// [3072, N]
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
model.layers[il].c_mlp_fc_w,
cur);
cur = ggml_add(ctx0,

@ -47,8 +47,10 @@ def bytes_to_unicode():
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
if len(sys.argv) < 2:
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
@ -64,11 +66,21 @@ with open(dir_model + "/added_tokens.json", "r") as f:
with open(dir_model + "/config.json", "r") as f:
hparams = json.load(f)
# use 16-bit or 32-bit floats
use_f16 = True
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
use_f16 = False
fname_out = sys.argv[1] + "/ggml-model-f32.bin"
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
#print (model)
@ -85,7 +97,7 @@ fout.write(struct.pack("i", hparams["n_embd"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", hparams["rotary_dim"]))
fout.write(struct.pack("i", use_f16))
fout.write(struct.pack("i", ftype))
byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}
@ -114,34 +126,35 @@ for name in list_vars.keys():
n_dims = len(data.shape);
# ftype == 0 -> float32, ftype == 1 -> float16
ftype = 0;
if use_f16:
ftype_cur = 0;
if ftype != 0:
if name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype = 1
ftype_cur = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0
ftype_cur = 0
# for efficiency - transpose these matrices:
# (note - with latest ggml this is no longer more efficient, so disabling it)
# "transformer.h.*.mlp.fc_in.weight"
# "transformer.h.*.attn.out_proj.weight"
# "transformer.h.*.attn.q_proj.weight"
# "transformer.h.*.attn.k_proj.weight"
# "transformer.h.*.attn.v_proj.weight"
if name.endswith(".mlp.fc_in.weight") or \
name.endswith(".attn.out_proj.weight") or \
name.endswith(".attn.q_proj.weight") or \
name.endswith(".attn.k_proj.weight") or \
name.endswith(".attn.v_proj.weight"):
print(" Transposing")
data = data.transpose()
#if name.endswith(".mlp.fc_in.weight") or \
# name.endswith(".attn.out_proj.weight") or \
# name.endswith(".attn.q_proj.weight") or \
# name.endswith(".attn.k_proj.weight") or \
# name.endswith(".attn.v_proj.weight"):
# print(" Transposing")
# data = data.transpose()
# header
str = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype))
fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str);

@ -245,7 +245,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
@ -459,9 +459,9 @@ bool gptj_eval(
// self-attention
{
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_q_proj_w), cur);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_k_proj_w), cur);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_v_proj_w), cur);
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
// store key and value to memory
if (N >= 1) {
@ -529,7 +529,7 @@ bool gptj_eval(
// projection (no bias)
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
model.layers[il].c_attn_proj_w,
cur);
}
@ -540,7 +540,7 @@ bool gptj_eval(
{
// note here we pass inpSA instead of cur
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
model.layers[il].c_mlp_fc_w,
inpSA);
cur = ggml_add(ctx0,

Loading…
Cancel
Save