Fix map tensors

2 years ago · 3bce0ec707
parent b7143f03c4
commit 3bce0ec707
2 changed files with 23 additions and 30 deletions
--- a/examples/gpt-2/convert-h5-to-ggml.py
+++ b/examples/gpt-2/convert-h5-to-ggml.py
@@ -20,9 +20,10 @@
 import sys
 import struct
 import json
+import torch
 import numpy as np

-from transformers import GPT2LMHeadModel
+from transformers import GPT2Model

 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
@@ -69,8 +70,8 @@ if len(sys.argv) > 2:
    use_f16 = False
    fname_out = sys.argv[1] + "/ggml-model-f32.bin"

-model = GPT2LMHeadModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
-# print (model)
+model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
+#print (model)

 list_vars = model.state_dict()
 #print (list_vars)
@@ -83,7 +84,7 @@ fout.write(struct.pack("i", hparams["n_positions"]))
 fout.write(struct.pack("i", hparams["n_embd"]))
 fout.write(struct.pack("i", hparams["n_head"]))
 fout.write(struct.pack("i", hparams["n_layer"]))
-# fout.write(struct.pack("i", hparams["rotary_dim"]))
+#fout.write(struct.pack("i", hparams["rotary_dim"]))
 fout.write(struct.pack("i", use_f16))

 byte_encoder = bytes_to_unicode()
@@ -125,16 +126,8 @@ for name in list_vars.keys():
            ftype = 0

    # for efficiency - transpose these matrices:
-    #  "transformer.h.*.mlp.fc_in.weight
-    #  "transformer.h.*.attn.out_proj.weight
-    #  "transformer.h.*.attn.q_proj.weight"
-    #  "transformer.h.*.attn.k_proj.weight"
-    #  "transformer.h.*.attn.v_proj.weight"
-    if name.endswith(".mlp.fc_in.weight")     or \
-       name.endswith(".attn.out_proj.weight") or \
-       name.endswith(".attn.q_proj.weight")   or \
-       name.endswith(".attn.k_proj.weight")   or \
-       name.endswith(".attn.v_proj.weight"):
+    #  "h.*.mlp.c_proj.weight"
+    if name.endswith(".mlp.c_proj.weight"):
        print("  Transposing")
        data = data.transpose()

--- a/examples/gpt-2/main.cpp
+++ b/examples/gpt-2/main.cpp
@@ -208,11 +208,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);

        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
+        model.tensors["ln_f.weight"] = model.ln_f_g;
+        model.tensors["ln_f.bias"] = model.ln_f_b;

-        model.tensors["model/wte"] = model.wte;
-        model.tensors["model/wpe"] = model.wpe;
+        model.tensors["wte.weight"] = model.wte;
+        model.tensors["wpe.weight"] = model.wpe;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];
@@ -236,23 +236,23 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            layer.c_mlp_proj_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
+            model.tensors["h." + std::to_string(i) + ".ln_1.weight"]      = layer.ln_1_g;
+            model.tensors["h." + std::to_string(i) + ".ln_1.bias"]        = layer.ln_1_b;

-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
+            model.tensors["h." + std::to_string(i) + ".ln_2.weight"]      = layer.ln_2_g;
+            model.tensors["h." + std::to_string(i) + ".ln_2.bias"]        = layer.ln_2_b;

-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
+            model.tensors["h." + std::to_string(i) + ".attn.c_attn.weight"] = layer.c_attn_attn_w;
+            model.tensors["h." + std::to_string(i) + ".attn.c_attn.bias"] = layer.c_attn_attn_b;

-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
+            model.tensors["h." + std::to_string(i) + ".attn.c_proj.weight"] = layer.c_attn_proj_w;
+            model.tensors["h." + std::to_string(i) + ".attn.c_proj.bias"] = layer.c_attn_proj_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_fc.weight"]    = layer.c_mlp_fc_w;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_fc.bias"]    = layer.c_mlp_fc_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w_trans;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_proj.weight"]  = layer.c_mlp_proj_w_trans;
+            model.tensors["h." + std::to_string(i) + ".mlp.c_proj.bias"]  = layer.c_mlp_proj_b;
        }
    }