You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ggml/examples/gpt-2/convert-ckpt-to-ggml.py

235 lines
7.1 KiB

# Convert a model checkpoint to a ggml compatible file
#
# Load the model using TensorFlow.
# Iterate over all variables and write them to a binary file.
#
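# For each variable, write the following:
# - Number of dimensions (int)
# - Name length (int)
# - Data type of this variable (int): 0 = float32, 1 = float16, 2 = q4_0
# - Dimensions (int[n_dims]), written in reverse order
# - Name (char[name_length])
# - Data (the tensor values, stored in the data type given above)
#
# By default, the bigger matrices are converted to 16-bit floats.
# A different data type can be selected by passing a numeric ftype as the
# second CLI argument (0 = float32, 1 = float16, 2 = q4_0).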
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
import sys
import json
import struct
import numpy as np
import tensorflow as tf
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the byte -> unicode-character lookup table used by the GPT-2 BPE.

    The reversible bpe codes work on unicode strings.
    This means you need a large number of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token
    dataset you end up needing around 5K for decent coverage. This is a
    significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode
    strings, and the table avoids mapping to whitespace/control characters
    the bpe code barfs on.

    Returns:
        A dict with one entry for each of the 256 byte values. Bytes in the
        ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the
        same code point; every other byte maps to chr(256 + k), where k
        counts those remaining bytes in increasing order.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    byte_values = printable[:]
    code_points = printable[:]

    # Set built once: membership tests no longer rescan a growing list.
    already_mapped = set(printable)

    shifted = 0
    for b in range(2**8):
        if b not in already_mapped:
            byte_values.append(b)
            # Remaining bytes are moved above the 8-bit range, in order.
            code_points.append(2**8 + shifted)
            shifted += 1

    return dict(zip(byte_values, (chr(cp) for cp in code_points)))
# helper method to convert a numpy array to different float types
def convert_to_ftype(data, ftype):
    """
    Convert a numpy array to the storage representation selected by *ftype*.

    Args:
        data: numpy array to convert. For ftype == 2 it must have dtype
            float32 and a last dimension that is a multiple of 64.
        ftype: 1 for float16, 2 for qint4_0.

    Returns:
        ftype == 1: a float16 copy of *data* with the same shape.
        ftype == 2: a 1-D uint8 array holding one 36-byte record per block
            of 64 consecutive values (blocks taken in C order): 4 bytes with
            the float32 scale d, followed by 32 bytes that each pack two
            4-bit values (even element in the low nibble, odd element in
            the high nibble).

    Raises:
        AssertionError: if the qint4_0 preconditions are not met or if
            *ftype* is not 1 or 2.
    """
    # fp16
    if ftype == 1:
        return data.astype(np.float16)

    # qint4_0
    # C code:
    # {
    #     for (int l = 0; l < QK; l++) {
    #         const float v = src[i*QK + l];
    #         amax = MAX(amax, fabsf(v));
    #     }
    #
    #     const float d = amax / ((1 << (QB - 1)) - 1);
    #     const float id = d ? 1.0/d : 0.0;
    #
    #     pd[i] = GGML_FP32_TO_GQ(d);
    #
    #     for (int l = 0; l < QK; l++) {
    #         const float v = src[i*QK + l]*id;
    #         const int8_t vi = ((int8_t) (round(v))) + 8;
    #         assert(vi >= 0 && vi < 16);
    #         pp[l/2] |= (vi & 0xf) << (4*(l & 1));
    #     }
    #
    #     memcpy(pb + i*QK/2, pp, sizeof(pp));
    # }
    if ftype == 2:
        assert data.dtype == np.float32
        assert data.shape[-1] % 64 == 0

        qk = 64                          # values per quantization block
        block_bytes = 4 + qk // 2        # float32 scale + packed nibbles
        n_blocks = data.size // qk

        print("data:", data.shape, data.size)
        print("pd: ", data.shape[:-1] + (data.shape[-1] // qk,), n_blocks)
        print("pb: ", data.shape, data.size)
        print("dst: ", (n_blocks * block_bytes,), n_blocks * block_bytes)

        # One row per block; reshape follows the logical C order of `data`,
        # so this is also correct for transposed (non-contiguous) inputs.
        blocks = data.reshape(n_blocks, qk)

        # Per-block scale, as in the C reference (amax is computed per block).
        max_q = (1 << 3) - 1
        d = (np.abs(blocks).max(axis=1) / max_q).astype(np.float32)

        # id = d ? 1/d : 0 -- all-zero blocks keep an inverse scale of 0.
        inv_d = np.zeros_like(d)
        np.divide(1.0, d, out=inv_d, where=d != 0)

        # NOTE: np.round rounds halves to even, C round() rounds them away
        # from zero; the rounding call is kept as in the original script.
        q = np.round(blocks * inv_d[:, np.newaxis]).astype(np.int8) + 8
        assert np.all(q >= 0) and np.all(q < 16)

        # Work in uint8 so the shift into the high nibble cannot overflow.
        nibbles = q.astype(np.uint8)
        packed = nibbles[:, 0::2] | (nibbles[:, 1::2] << 4)

        # the quantized data goes here
        dst = np.empty((n_blocks, block_bytes), dtype=np.uint8)
        # Raw float32 bytes in native byte order.
        dst[:, :4] = d.view(np.uint8).reshape(n_blocks, 4)
        dst[:, 4:] = packed

        return dst.reshape(-1)

    # Raised explicitly so the check also fires under `python -O`;
    # AssertionError keeps the exception type of the former `assert False`.
    raise AssertionError("Invalid ftype: " + str(ftype))
# Script entry: read the checkpoint directory given on the command line and
# write the ggml file (hyperparameters, vocabulary, then every variable).
if len(sys.argv) < 2:
    print("Usage: convert-ckpt-to-ggml.py dir-model [ftype]\n")
    print("  ftype == 0 -> float32 (\"use-f32\" is accepted as an alias)")
    print("  ftype == 1 -> float16 (default)")
    print("  ftype == 2 -> qint4_0")
    print("  ftype == 3 -> qint4_1")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = dir_model + "/ggml-model.bin"

with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#   ftype == 2 -> qint4_0
#   ftype == 3 -> qint4_1
#
# map from ftype to string
ftype_str = ["f32", "f16", "q4_0", "q4_1"]

ftype = 1
if len(sys.argv) > 2:
    ftype_arg = sys.argv[2]
    if ftype_arg == "use-f32":
        # Spelling advertised by the file header; same as ftype 0.
        ftype = 0
    else:
        try:
            ftype = int(ftype_arg)
        except ValueError:
            print("Invalid ftype: " + ftype_arg)
            sys.exit(1)
    # NOTE: ftype 3 (q4_1) passes this range check, but convert_to_ftype in
    # this file only has branches for ftype 1 and 2.
    if ftype < 0 or ftype > 3:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
    fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"

list_vars = tf.train.list_variables(dir_model)

# Vocabulary keys are stored as printable unicode; this maps each character
# back to the raw byte it stands for.
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# `with` guarantees the output file is closed even if a conversion fails.
with open(fname_out, "wb") as fout:
    # file header: magic followed by the model hyperparameters
    fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex
    fout.write(struct.pack("i", hparams["n_vocab"]))
    fout.write(struct.pack("i", hparams["n_ctx"]))
    fout.write(struct.pack("i", hparams["n_embd"]))
    fout.write(struct.pack("i", hparams["n_head"]))
    fout.write(struct.pack("i", hparams["n_layer"]))
    fout.write(struct.pack("i", ftype))

    # vocabulary: entry count, then (length, raw bytes) for every token
    fout.write(struct.pack("i", len(encoder)))
    for key in encoder:
        text = bytearray([byte_decoder[c] for c in key])
        fout.write(struct.pack("i", len(text)))
        fout.write(text)

    for name, shape in list_vars:
        print("Processing variable: " + name + " with shape: ", shape)

        data = tf.train.load_variable(dir_model, name).squeeze()
        n_dims = len(data.shape)

        # for efficiency - transpose the projection matrices
        if name.endswith("/mlp/c_proj/w"):
            print(" Transposing")
            data = data.transpose()

        # Shape recorded in the header; taken before conversion because the
        # quantized representation is a flat byte array.
        dshape = data.shape

        # match name:
        #  "model/wte"
        #  "model/h.*/attn/c_attn/w"
        #  "model/h.*/attn/c_proj/w"
        #  "model/h.*/mlp/c_fc/w"
        #  "model/h.*/mlp/c_proj/w"
        if ftype != 0 and (name == "model/wte" or name.endswith("/w")):
            print(" Converting to " + ftype_str[ftype])
            data = convert_to_ftype(data, ftype)
            ftype_cur = ftype
        else:
            if ftype != 0:
                print(" Converting to float32")
            # Always cast so the bytes match the float32 tag in the header.
            data = data.astype(np.float32)
            ftype_cur = 0

        # header (named name_bytes to avoid rebinding the builtin `str`)
        name_bytes = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
        fout.write(name_bytes)

        # data
        data.tofile(fout)

print("Done. Output file: " + fname_out)
print("")