From c2e9635c794d0a1ac73eb3e473ad7b12edc71138 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 10 Mar 2023 17:58:48 +0200
Subject: [PATCH] llama : initial model conversion to ggml format

---
 examples/llama/convert-pth-to-ggml.py | 116 ++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 examples/llama/convert-pth-to-ggml.py

diff --git a/examples/llama/convert-pth-to-ggml.py b/examples/llama/convert-pth-to-ggml.py
new file mode 100644
index 0000000..a966ce3
--- /dev/null
+++ b/examples/llama/convert-pth-to-ggml.py
@@ -0,0 +1,116 @@
+# Convert a LLaMA model checkpoint to a ggml compatible file
+#
+# Load the model using Torch.
+# Iterate over all variables and write them to a binary file.
+#
+# For each variable, write the following:
+# - Number of dimensions (int)
+# - Name length (int)
+# - Data type (int)
+# - Dimensions (int[n_dims])
+# - Name (char[name_length])
+# - Data (float[n_elements])
+#
+# By default, the tensor data is converted to 16-bit floats.
+# This can be disabled by passing ftype 0 on the command line.
+#
+# At the start of the ggml file we write the model parameters
+# and vocabulary.
+#
+
+import sys
+import json
+import struct
+import numpy as np
+import torch
+
+from sentencepiece import SentencePieceProcessor
+
+if len(sys.argv) < 3:
+    print("Usage: convert-pth-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+
+fname_hparams   = dir_model + "/params.json"
+fname_model     = dir_model + "/consolidated.00.pth"
+fname_tokenizer = dir_model + "/../tokenizer.model"
+
+# possible data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = int(sys.argv[2])
+if ftype < 0 or ftype > 1:
+    print("Invalid ftype: " + str(ftype))
+    sys.exit(1)
+
+fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"
+
+with open(fname_hparams, "r") as f:
+    hparams = json.load(f)
+
+tokenizer = SentencePieceProcessor(fname_tokenizer)
+
+hparams.update({"vocab_size": tokenizer.vocab_size()})
+
+print(hparams)
+
+model = torch.load(fname_model, map_location="cpu")
+
+fout = open(fname_out, "wb")
+
+fout.write(struct.pack("i", 0x67676d6c)) # magic: "ggml" in hex
+fout.write(struct.pack("i", hparams["vocab_size"]))
+fout.write(struct.pack("i", hparams["dim"]))
+fout.write(struct.pack("i", hparams["multiple_of"]))
+fout.write(struct.pack("i", hparams["n_heads"]))
+fout.write(struct.pack("i", hparams["n_layers"]))
+fout.write(struct.pack("i", ftype))
+
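+# The vocabulary is stored as: for each token id in order, the length of
+# the UTF-8 encoded token text (int) followed by the raw text bytes.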
+# TODO: verify that decoding the tokens one at a time round-trips the vocabulary correctly
+for i in range(hparams["vocab_size"]):
+    text = tokenizer.decode([i]).encode("utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+
+for k, v in model.items():
+    name = k
+
+    print("Processing variable: " + name + " with shape:", v.shape, "and type:", v.dtype)
+
+    data = v.numpy().squeeze()
+    n_dims = len(data.shape)
+    dshape = data.shape
+
+    # default type is fp16 - make sure the data matches the declared type
+    ftype_cur = 1
+    if ftype == 0:
+        print("  Converting to float32")
+        data = data.astype(np.float32)
+        ftype_cur = 0
+    else:
+        data = data.astype(np.float16)
+
+    # header: number of dims, name length, data type, then the dims (reversed) and the name
+    sname = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+    for i in range(n_dims):
+        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+    fout.write(sname)
+
+    # data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")
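A quick way to sanity-check the result is to read the header and the first
few vocabulary entries back with the same struct layout. This is an
illustrative sketch, not part of the patch; the script name
check-ggml-header.py is made up, and the field order simply mirrors the
writes in convert-pth-to-ggml.py:

    import struct
    import sys

    # read back the ggml header written by convert-pth-to-ggml.py
    # usage: python3 check-ggml-header.py models/7B/ggml-model-f16.bin
    with open(sys.argv[1], "rb") as f:
        magic, = struct.unpack("i", f.read(4))
        assert magic == 0x67676d6c, "not a ggml file"

        # hparams, in the same order they were written
        n_vocab, dim, multiple_of, n_heads, n_layers, ftype = struct.unpack("6i", f.read(24))
        print("n_vocab =", n_vocab, " dim =", dim, " multiple_of =", multiple_of)
        print("n_heads =", n_heads, " n_layers =", n_layers, " ftype =", ftype)

        # print the first few vocabulary entries
        for i in range(10):
            n, = struct.unpack("i", f.read(4))
            print("token %5d: %s" % (i, f.read(n).decode("utf-8", errors="replace")))

If the printed parameters match params.json and the tokens look sensible,
the header and vocabulary were written correctly.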