gpt : avoid ggml_transpose on model tensors (new models!)

Georgi Gerganov 2 years ago
parent e052167772
commit 86b1e356b0
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

@@ -53,8 +53,10 @@ def convert_to_ftype(data, ftype):
     assert False, "Invalid ftype: " + str(ftype)
 
-if len(sys.argv) < 2:
-    print("Usage: convert-ckpt-to-ggml.py dir-model [use-f32]\n")
+if len(sys.argv) < 3:
+    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
     sys.exit(1)
 
 # output in the same directory as the model
@@ -70,8 +72,6 @@ with open(dir_model + "/hparams.json", "r") as f:
 # possible data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
-#   ftype == 2 -> qint4_0
-#   ftype == 3 -> qint4_1
 #
 # map from ftype to string
 ftype_str = ["f32", "f16"]
@@ -113,7 +113,14 @@ for name, shape in list_vars:
     n_dims = len(data.shape);
 
     # for efficiency - transpose the projection matrices
-    if name[-13:] == "/mlp/c_proj/w":
+    # "model/h.*/attn/c_attn/w"
+    # "model/h.*/attn/c_proj/w"
+    # "model/h.*/mlp/c_fc/w"
+    # "model/h.*/mlp/c_proj/w"
+    if name[-14:] == "/attn/c_attn/w" or \
+       name[-14:] == "/attn/c_proj/w" or \
+       name[-11:] == "/mlp/c_fc/w" or \
+       name[-13:] == "/mlp/c_proj/w":
         print("  Transposing")
         data = data.transpose()
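
For context on the hunk above: ggml_mul_mat(a, b) expects both operands to agree on their first dimension ne[0] and produces a node of shape [a->ne[1], b->ne[1]]. The converter writes the numpy dims in reverse order (the struct.pack("i", data.shape[n_dims - 1 - i]) line visible in the GPT-J script further down), so transposing these projection matrices before writing flips their stored shape from, e.g., [3*n_embd, n_embd] to [n_embd, 3*n_embd], which is exactly what gpt2_model_load now allocates and what ggml_mul_mat can take without a transpose. Below is a minimal standalone sketch of that shape rule, built against ggml.h; the sizes, memory budget and names (w, cur, out) are made up for illustration and are not taken from the example code.

#include <cstdio>

#include "ggml.h"

int main(void) {
    // small scratch context, just enough to create a few tensor headers
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 768; // GPT-2 117M embedding size
    const int N      = 4;   // pretend batch of 4 tokens

    // weight as the converter now writes it: ne = [n_embd, 3*n_embd]
    struct ggml_tensor * w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 3*n_embd);
    // activations, as in gpt2_eval: ne = [n_embd, N]
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N);

    // ne[0] of both operands already matches, so no ggml_transpose is needed;
    // the result node comes out as [3*n_embd, N] == [2304, N]
    struct ggml_tensor * out = ggml_mul_mat(ctx, w, cur);
    printf("out: %d x %d\n", (int) out->ne[0], (int) out->ne[1]);

    ggml_free(ctx);
    return 0;
}

Building the node is enough to see the resulting shape; no graph computation is needed for this check.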

@@ -128,7 +128,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         }
     }
 
-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_type wtype = GGML_TYPE_COUNT;
     switch (model.hparams.f16) {
@@ -237,13 +237,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
             layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
             layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
 
             layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
             layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
             layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
 
             layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
@@ -461,7 +461,7 @@ bool gpt2_eval(
         // [2304, N]
         {
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                    model.layers[il].c_attn_attn_w,
                     cur);
 
             cur = ggml_add(ctx0,
@@ -568,7 +568,7 @@ bool gpt2_eval(
         // [768, N]
         {
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                     cur);
 
             cur = ggml_add(ctx0,
@@ -605,7 +605,7 @@ bool gpt2_eval(
             // cur = fc_w*cur + fc_b
             // [3072, N]
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                     cur);
 
             cur = ggml_add(ctx0,
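
These gpt2_eval hunks are the payoff of the new layout: every projection weight already has ne[0] == n_embd, so the graph no longer wraps each weight in ggml_transpose on every evaluation (c_mlp_proj_w_trans is left as it was). Here is a standalone before/after sketch, again with made-up sizes and names rather than the example's own code, showing that both constructions yield the same [3*n_embd, N] node and the new one simply skips the extra transposed view.

#include <cstdio>

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 768;
    const int N      = 4;

    struct ggml_tensor * cur   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N);
    struct ggml_tensor * w_old = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3*n_embd, n_embd); // old on-disk layout
    struct ggml_tensor * w_new = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 3*n_embd); // new on-disk layout

    // old path: a transposed view of the weight had to be added to the graph
    // on every evaluation, just to line its ne[0] up with cur's ne[0]
    struct ggml_tensor * out_old = ggml_mul_mat(ctx, ggml_transpose(ctx, w_old), cur);

    // new path: the stored layout already matches, so the weight is passed directly
    struct ggml_tensor * out_new = ggml_mul_mat(ctx, w_new, cur);

    // both nodes have shape [3*n_embd, N]; the new one just avoids the extra view
    printf("old: %d x %d   new: %d x %d\n",
           (int) out_old->ne[0], (int) out_old->ne[1],
           (int) out_new->ne[0], (int) out_new->ne[1]);

    ggml_free(ctx);
    return 0;
}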

@@ -47,8 +47,10 @@ def bytes_to_unicode():
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
 
-if len(sys.argv) < 2:
+if len(sys.argv) < 3:
     print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
     sys.exit(1)
 
 # output in the same directory as the model
@@ -64,11 +66,21 @@ with open(dir_model + "/added_tokens.json", "r") as f:
 with open(dir_model + "/config.json", "r") as f:
     hparams = json.load(f)
 
-# use 16-bit or 32-bit floats
-use_f16 = True
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
 if len(sys.argv) > 2:
-    use_f16 = False
-    fname_out = sys.argv[1] + "/ggml-model-f32.bin"
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
 
 model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
 #print (model)
@@ -85,7 +97,7 @@ fout.write(struct.pack("i", hparams["n_embd"]))
 fout.write(struct.pack("i", hparams["n_head"]))
 fout.write(struct.pack("i", hparams["n_layer"]))
 fout.write(struct.pack("i", hparams["rotary_dim"]))
-fout.write(struct.pack("i", use_f16))
+fout.write(struct.pack("i", ftype))
 
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v:k for k, v in byte_encoder.items()}
@@ -114,34 +126,35 @@ for name in list_vars.keys():
     n_dims = len(data.shape);
 
     # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
+    ftype_cur = 0;
+    if ftype != 0:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
             data = data.astype(np.float16)
-            ftype = 1
+            ftype_cur = 1
         else:
             print("  Converting to float32")
            data = data.astype(np.float32)
-            ftype = 0
+            ftype_cur = 0
 
     # for efficiency - transpose these matrices:
+    # (note - with latest ggml this is no longer more efficient, so disabling it)
     # "transformer.h.*.mlp.fc_in.weight"
     # "transformer.h.*.attn.out_proj.weight"
     # "transformer.h.*.attn.q_proj.weight"
     # "transformer.h.*.attn.k_proj.weight"
     # "transformer.h.*.attn.v_proj.weight"
-    if name.endswith(".mlp.fc_in.weight") or \
-       name.endswith(".attn.out_proj.weight") or \
-       name.endswith(".attn.q_proj.weight") or \
-       name.endswith(".attn.k_proj.weight") or \
-       name.endswith(".attn.v_proj.weight"):
-        print("  Transposing")
-        data = data.transpose()
+    #if name.endswith(".mlp.fc_in.weight") or \
+    #   name.endswith(".attn.out_proj.weight") or \
+    #   name.endswith(".attn.q_proj.weight") or \
+    #   name.endswith(".attn.k_proj.weight") or \
+    #   name.endswith(".attn.v_proj.weight"):
+    #    print("  Transposing")
+    #    data = data.transpose()
 
     # header
     str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
     for i in range(n_dims):
         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
     fout.write(str);
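
In the GPT-J converter the ftype handling now mirrors the GPT-2 script: an explicit ftype argument (0 -> float32, 1 -> float16) selects the output file name and is written into the hparams, and each tensor record carries its own ftype_cur, followed by the dims in reverse order, the raw name bytes, and then the data. The per-weight transpose is commented out: the new note says it is no longer more efficient, and the gptj_model_load / gptj_eval hunks below now expect the untransposed layout anyway. Below is a rough sketch of how a loader might walk one such tensor record, for illustration only; read_tensor_header is a hypothetical helper, not the actual gptj_model_load code, and it assumes the stream is already positioned past the magic, hparams and vocab.

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>

// Read one tensor record as the converter above writes it: three int32 values
// (n_dims, length of the name, per-tensor ftype), then n_dims int32 dims in
// reverse order, then the raw name bytes, then the tensor data itself.
static bool read_tensor_header(std::ifstream & fin) {
    int32_t n_dims    = 0;
    int32_t name_len  = 0;
    int32_t ftype_cur = 0;

    fin.read(reinterpret_cast<char *>(&n_dims),    sizeof(n_dims));
    fin.read(reinterpret_cast<char *>(&name_len),  sizeof(name_len));
    fin.read(reinterpret_cast<char *>(&ftype_cur), sizeof(ftype_cur));
    if (!fin || n_dims < 1 || n_dims > 2) {
        return false; // end of file, read error, or unexpected rank
    }

    int32_t ne[2] = { 1, 1 };
    for (int i = 0; i < n_dims; ++i) {
        fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
    }

    std::string name;
    name.resize(name_len);
    fin.read(&name[0], name_len);

    // ftype_cur selects the element type of this tensor's data:
    // 0 -> float32, 1 -> float16 (only 2-D ".weight" tensors are converted)
    printf("%-48s ne = [%d, %d]  ftype = %d\n", name.c_str(), ne[0], ne[1], ftype_cur);

    // skipping over the data block itself is left out of this sketch
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        printf("usage: %s ggml-model-f16.bin\n", argv[0]);
        return 1;
    }
    std::ifstream fin(argv[1], std::ios::binary);
    // NOTE: assumes fin is already positioned at the first tensor record;
    // a real loader parses the magic, hparams and vocab first
    read_tensor_header(fin);
    return 0;
}

Since the stored shapes of these weights changed, files produced by the old converters will no longer match the loaders' shape checks, which is presumably why the commit title warns "(new models!)": existing converted files need to be regenerated.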

@@ -245,7 +245,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 
-            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
             layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
 
             layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
@@ -459,9 +459,9 @@ bool gptj_eval(
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_q_proj_w), cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_k_proj_w), cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, ggml_transpose(ctx0, model.layers[il].c_attn_v_proj_w), cur);
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
+            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
 
             // store key and value to memory
             if (N >= 1) {
@@ -529,7 +529,7 @@ bool gptj_eval(
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                     cur);
         }
 
@@ -540,7 +540,7 @@ bool gptj_eval(
         {
             // note here we pass inpSA instead of cur
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                     inpSA);
 
             cur = ggml_add(ctx0,
