From 1d38a69d7c39122e0c64b29126ad7f4dfc6c9064 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 2 Jan 2023 16:11:13 +0200
Subject: [PATCH] t5 : initial load in ggml

---
 examples/t5/convert-flan-t5-pt-to-ggml.py |   4 +-
 examples/t5/main.cpp                      | 739 +++++++++++++++++++++-
 2 files changed, 741 insertions(+), 2 deletions(-)

diff --git a/examples/t5/convert-flan-t5-pt-to-ggml.py b/examples/t5/convert-flan-t5-pt-to-ggml.py
index 64b6ed5..88eb82e 100644
--- a/examples/t5/convert-flan-t5-pt-to-ggml.py
+++ b/examples/t5/convert-flan-t5-pt-to-ggml.py
@@ -7,7 +7,8 @@ import numpy
 import code # tmp
 
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+#from transformers import AutoModelForSeq2SeqLM
+from transformers import AutoTokenizer
 
 if len(sys.argv) < 3:
     print("Usage: convert-flan-t5-pt-to-ggml.py path-to-pt-model dir-output [use-f32]\n")
@@ -61,6 +62,7 @@ fout.write(struct.pack("i", config["d_model"]))
 fout.write(struct.pack("i", config["n_positions"]))
 fout.write(struct.pack("i", config["num_heads"]))
 fout.write(struct.pack("i", config["num_layers"]))
+fout.write(struct.pack("i", use_f16))
 
 # sort tokenizer.vocab by value
 tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1])
diff --git a/examples/t5/main.cpp b/examples/t5/main.cpp
index 33c14ce..a3772a1 100644
--- a/examples/t5/main.cpp
+++ b/examples/t5/main.cpp
@@ -1,3 +1,740 @@
-int main() {
+#include "ggml.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+// available t5 models
+enum e_model {
+    MODEL_UNKNOWN,
+    MODEL_SMALL,
+    MODEL_BASE,
+    MODEL_LARGE,
+    MODEL_XL,
+    MODEL_XXL,
+};
+
+static const size_t MB = 4*1024*1024;
+
+static const std::map<e_model, size_t> MEM_REQ_MODEL = {
+    { MODEL_SMALL,   74ull*MB },
+    { MODEL_BASE,   142ull*MB },
+    { MODEL_LARGE,  466ull*MB },
+    { MODEL_XL,    1464ull*MB },
+    { MODEL_XXL,   2952ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_MEMORY = {
+    { MODEL_SMALL,  12ull*MB },
+    { MODEL_BASE,   24ull*MB },
+    { MODEL_LARGE,  70ull*MB },
+    { MODEL_XL,    184ull*MB },
+    { MODEL_XXL,   306ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
+    { MODEL_SMALL,   80ull*MB },
+    { MODEL_BASE,   128ull*MB },
+    { MODEL_LARGE,  300ull*MB },
+    { MODEL_XL,     680ull*MB },
+    { MODEL_XXL,   1100ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
+    { MODEL_SMALL, 104ull*MB },
+    { MODEL_BASE,  138ull*MB },
+    { MODEL_LARGE, 208ull*MB },
+    { MODEL_XL,    280ull*MB },
+    { MODEL_XXL,   354ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_DECODE = {
+    { MODEL_SMALL, 200ull*MB },
+    { MODEL_BASE,  202ull*MB },
+    { MODEL_LARGE, 204ull*MB },
+    { MODEL_XL,    206ull*MB },
+    { MODEL_XXL,   208ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_DECODE_LAYER = {
+    { MODEL_SMALL,  32ull*MB },
+    { MODEL_BASE,   44ull*MB },
+    { MODEL_LARGE,  64ull*MB },
+    { MODEL_XL,     84ull*MB },
+    { MODEL_XXL,   110ull*MB },
+};
+
+struct t5_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    int n_vocab = 32128;
+
+    std::map<token, id> token_to_id;
+    std::map<id, token> id_to_token;
+};
+
+// default hparams (FLAN-T5 Small)
+struct t5_hparams {
+    int32_t n_vocab     = 32128;
+    int32_t d_ff        = 1024;
+    int32_t d_kv        = 64;
+    int32_t d_model     = 512;
+    int32_t n_positions = 512;
+    int32_t n_head      = 6;
+    int32_t n_layer     = 8;
+    int32_t f16         = 1;
+};
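The t5_hparams fields above mirror, in order, the int32 header fields that convert-flan-t5-pt-to-ggml.py emits via struct.pack("i", ...), preceded by the 0x67676d6c magic. A minimal standalone sketch of a compatible header writer, useful for round-trip testing (an illustration, not part of the patch; it assumes a little-endian host, which is also what struct.pack("i", ...) produces on typical setups, and the values are the FLAN-T5 Small defaults):

    // write_test_header.cpp - hypothetical helper for testing the loader
    #include <cstdint>
    #include <fstream>

    static void write_i32(std::ofstream & fout, int32_t v) {
        fout.write(reinterpret_cast<const char *>(&v), sizeof(v));
    }

    int main() {
        std::ofstream fout("header.bin", std::ios::binary);

        write_i32(fout, 0x67676d6c); // magic "ggml"
        write_i32(fout, 32128);      // n_vocab
        write_i32(fout, 1024);       // d_ff
        write_i32(fout, 64);         // d_kv
        write_i32(fout, 512);        // d_model
        write_i32(fout, 512);        // n_positions
        write_i32(fout, 6);          // n_head
        write_i32(fout, 8);          // n_layer
        write_i32(fout, 1);          // f16
        return 0;
    }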
+
+struct t5_layer_encoder {
+    // encoder.block.*.layer.0.SelfAttention
+    struct ggml_tensor * attn_q;
+    struct ggml_tensor * attn_k;
+    struct ggml_tensor * attn_v;
+    struct ggml_tensor * attn_o;
+
+    // encoder.block.*.layer.0.layer_norm
+    struct ggml_tensor * ln_0;
+
+    // encoder.block.*.layer.1.DenseReluDense
+    struct ggml_tensor * wi_0;
+    struct ggml_tensor * wi_1;
+    struct ggml_tensor * wo;
+
+    // encoder.block.*.layer.1.layer_norm
+    struct ggml_tensor * ln_1;
+};
+
+struct t5_layer_decoder {
+    // decoder.block.*.layer.0.SelfAttention
+    struct ggml_tensor * attn_q;
+    struct ggml_tensor * attn_k;
+    struct ggml_tensor * attn_v;
+    struct ggml_tensor * attn_o;
+
+    // decoder.block.*.layer.0.layer_norm
+    struct ggml_tensor * ln_0;
+
+    // decoder.block.*.layer.1.EncDecAttention
+    struct ggml_tensor * cross_attn_q;
+    struct ggml_tensor * cross_attn_k;
+    struct ggml_tensor * cross_attn_v;
+    struct ggml_tensor * cross_attn_o;
+
+    // decoder.block.*.layer.1.layer_norm
+    struct ggml_tensor * ln_1;
+
+    // decoder.block.*.layer.2.DenseReluDense
+    struct ggml_tensor * wi_0;
+    struct ggml_tensor * wi_1;
+    struct ggml_tensor * wo;
+
+    // decoder.block.*.layer.2.layer_norm
+    struct ggml_tensor * ln_2;
+};
+
+struct t5_model {
+    e_model type = MODEL_UNKNOWN;
+
+    t5_hparams hparams;
+
+    // shared
+    struct ggml_tensor * shared;
+
+    // encoder.embed_tokens
+    struct ggml_tensor * e_et;
+
+    // encoder.final_layer_norm
+    struct ggml_tensor * e_ln;
+
+    // encoder.block.0.layer.0.SelfAttention.relative_attention_bias
+    struct ggml_tensor * e_rab;
+
+    // decoder.embed_tokens
+    struct ggml_tensor * d_et;
+
+    // decoder.final_layer_norm
+    struct ggml_tensor * d_ln;
+
+    // decoder.block.0.layer.0.SelfAttention.relative_attention_bias
+    struct ggml_tensor * d_rab;
+
+    // lm_head
+    struct ggml_tensor * lm_head;
+
+    std::vector<t5_layer_encoder> layers_encoder;
+    std::vector<t5_layer_decoder> layers_decoder;
+
+    // context
+    struct ggml_context * ctx     = nullptr;
+    struct ggml_context * ctx_mem = nullptr;
+
+    // tensors
+    int n_loaded;
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+struct t5_context {
+    int64_t t_load_us   = 0;
+    int64_t t_sample_us = 0;
+    int64_t t_encode_us = 0;
+    int64_t t_decode_us = 0;
+    int64_t t_start_us  = 0;
+
+    std::vector<uint8_t> buf_model;
+    std::vector<uint8_t> buf_memory;
+    std::vector<uint8_t> buf_compute;
+    std::vector<uint8_t> buf_compute_layer;
+
+    t5_model model;
+    t5_vocab vocab;
+
+    std::vector<float> probs;
+    std::vector<float> logits;
+};
+
+template<typename T>
+static void read_safe(std::ifstream & fin, T & dest) {
+    fin.read((char *) &dest, sizeof(T));
+}
+
+static bool t5_model_load(const std::string & fname, t5_context & wctx) {
+    fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
+
+    auto & model = wctx.model;
+    auto & vocab = wctx.vocab;
+
+    auto fin = std::ifstream(fname, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        read_safe(fin, magic);
+        if (magic != 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+            return false;
+        }
+    }
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        read_safe(fin, hparams.n_vocab);
+        read_safe(fin, hparams.d_ff);
+        read_safe(fin, hparams.d_kv);
+        read_safe(fin, hparams.d_model);
+        read_safe(fin, hparams.n_positions);
+        read_safe(fin, hparams.n_head);
+        read_safe(fin, hparams.n_layer);
+        read_safe(fin, hparams.f16);
+
+        if (hparams.n_layer == 8) {
+            model.type = e_model::MODEL_SMALL;
+        }
+
+        if (hparams.n_layer == 12) {
+            model.type = e_model::MODEL_BASE;
+        }
+
+        if (hparams.n_layer == 24 && hparams.n_head == 16) {
+            model.type = e_model::MODEL_LARGE;
+        }
+
+        if (hparams.n_layer == 24 && hparams.n_head == 32) {
+            model.type = e_model::MODEL_XL;
+        }
+
+        if (hparams.n_layer == 24 && hparams.n_head == 64) {
+            model.type = e_model::MODEL_XXL;
+        }
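The if-chain above identifies the five FLAN-T5 sizes from layer and head counts alone; LARGE, XL and XXL all have 24 layers, so n_head is the discriminator. The same detection, restated as a standalone helper for clarity (an equivalent rewrite, not part of the patch; the pairs correspond to the published FLAN-T5 configs):

    //   small: n_layer =  8, n_head =  6
    //   base:  n_layer = 12, n_head = 12
    //   large: n_layer = 24, n_head = 16
    //   xl:    n_layer = 24, n_head = 32
    //   xxl:   n_layer = 24, n_head = 64
    static e_model t5_detect_type(int32_t n_layer, int32_t n_head) {
        if (n_layer ==  8)                 return MODEL_SMALL;
        if (n_layer == 12)                 return MODEL_BASE;
        if (n_layer == 24 && n_head == 16) return MODEL_LARGE;
        if (n_layer == 24 && n_head == 32) return MODEL_XL;
        if (n_layer == 24 && n_head == 64) return MODEL_XXL;
        return MODEL_UNKNOWN;
    }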
+
+        fprintf(stderr, "%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: d_ff        = %d\n", __func__, hparams.d_ff);
+        fprintf(stderr, "%s: d_kv        = %d\n", __func__, hparams.d_kv);
+        fprintf(stderr, "%s: d_model     = %d\n", __func__, hparams.d_model);
+        fprintf(stderr, "%s: n_positions = %d\n", __func__, hparams.n_positions);
+        fprintf(stderr, "%s: n_head      = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: f16         = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: type        = %d\n", __func__, model.type);
+
+        wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type));
+        wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
+        wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
+        wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
+    }
+
+    // load vocab
+    {
+        int32_t n_vocab = 0;
+        read_safe(fin, n_vocab);
+
+        //if (n_vocab != model.hparams.n_vocab) {
+        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
+        //    return false;
+        //}
+
+        std::string word;
+        std::vector<char> tmp;
+
+        tmp.reserve(128);
+
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len;
+            read_safe(fin, len);
+
+            if (len > 0) {
+                tmp.resize(len);
+                fin.read(&tmp[0], tmp.size()); // read to buffer
+                word.assign(&tmp[0], tmp.size());
+            } else {
+                // seems like we have an empty-string token in multi-language models (i = 50256)
+                //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+                word = "";
+            }
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+
+            //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+        }
+
+        vocab.n_vocab = model.hparams.n_vocab;
+
+        if (n_vocab < model.hparams.n_vocab) {
+            fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
+            for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
+                word = "[_extra_token_" + std::to_string(i) + "]";
+                vocab.token_to_id[word] = i;
+                vocab.id_to_token[i] = word;
+            }
+        }
+
+        wctx.logits.reserve(vocab.n_vocab*model.hparams.d_model);
+        wctx.probs.reserve(vocab.n_vocab*model.hparams.d_model);
+    }
+
+    {
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+                wctx.buf_model.size() +
+                wctx.buf_memory.size() +
+                wctx.buf_compute.size() +
+                wctx.buf_compute_layer.size();
+
+        fprintf(stderr, "%s: mem_required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+    }
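For reference, the vocab section consumed above is a simple length-prefixed string table: an int32 token count, then for each token a uint32 byte length followed by that many raw UTF-8 bytes, with no terminator. A minimal standalone reader of one entry (an illustration only, assuming the same little-endian layout the converter emits):

    // Reads a single length-prefixed vocab entry, mirroring the loop above.
    static std::string read_token(std::ifstream & fin) {
        uint32_t len = 0;
        fin.read(reinterpret_cast<char *>(&len), sizeof(len));

        std::string word(len, '\0');
        if (len > 0) {
            fin.read(&word[0], len); // raw UTF-8 bytes, no terminator
        }
        return word;
    }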
+
+    // for the big tensors, we have the option to store the data in 16-bit floats
+    // in order to save memory and also to speed up the computation
+    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    size_t ctx_size = 0;
+
+    {
+        const auto & hparams = model.hparams;
+
+        const int n_vocab = hparams.n_vocab;
+        const int d_ff    = hparams.d_ff;
+        const int d_kv    = hparams.d_kv;
+        const int d_model = hparams.d_model;
+        const int n_head  = hparams.n_head;
+        const int n_layer = hparams.n_layer;
+
+        ctx_size += n_vocab*d_model*ggml_type_size(wtype); // shared
+        ctx_size += n_vocab*d_model*ggml_type_size(wtype); // lm_head
+
+        // encoder
+        {
+            ctx_size += n_vocab*d_model*ggml_type_size(wtype); // e_et
+            ctx_size += d_model*ggml_type_size(GGML_TYPE_F32); // e_ln
+            ctx_size += 32*n_head*ggml_type_size(wtype);       // e_rab
+        }
+
+        // decoder
+        {
+            ctx_size += n_vocab*d_model*ggml_type_size(wtype); // d_et
+            ctx_size += d_model*ggml_type_size(GGML_TYPE_F32); // d_ln
+            ctx_size += 32*n_head*ggml_type_size(wtype);       // d_rab
+        }
+
+        // encoder layers
+        {
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_q
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_k
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_v
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_o
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_0
+
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_0
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_1
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wo
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_1
+        }
+
+        // decoder layers
+        {
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_q
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_k
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_v
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_o
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_0
+
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_q
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_k
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_v
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_o
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_1
+
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_0
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_1
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wo
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_2
+        }
+
+        ctx_size += (15 + 9*n_layer + 14*n_layer)*256; // object overhead
+
+        fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+    }
+
+    // create the ggml context
+    {
+        struct ggml_init_params params;
+        params.mem_size   = wctx.buf_model.size();
+        params.mem_buffer = wctx.buf_model.data();
+
+        model.ctx = ggml_init(params);
+        if (!model.ctx) {
+            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            return false;
+        }
+    }
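ggml_init() takes one caller-owned arena and every subsequent ggml_new_tensor_* call sub-allocates from it, which is why the ctx_size estimate above must be an upper bound: raw data bytes per tensor plus a fixed budget (256 bytes here) for each ggml_tensor object header. A standalone sketch of the pattern, sized for the FLAN-T5 Small embedding matrix (illustrative only, not part of the patch):

    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    int example() {
        // data bytes + per-tensor object overhead, as budgeted above
        const size_t need = 32128ull*512ull*ggml_type_size(GGML_TYPE_F16) + 256;

        std::vector<uint8_t> buf(need);

        struct ggml_init_params params;
        params.mem_size   = buf.size();
        params.mem_buffer = buf.data();

        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor  * t   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 512, 32128);

        const bool ok = t != nullptr; // allocation fails if the arena is too small

        ggml_free(ctx);
        return ok ? 0 : 1;
    }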
+
+    // prepare memory for the weights
+    {
+        auto & ctx = model.ctx;
+
+        const auto & hparams = model.hparams;
+
+        const int n_vocab = hparams.n_vocab;
+        const int d_ff    = hparams.d_ff;
+        const int d_kv    = hparams.d_kv;
+        const int d_model = hparams.d_model;
+        const int n_head  = hparams.n_head;
+        const int n_layer = hparams.n_layer;
+
+        model.layers_encoder.resize(n_layer);
+        model.layers_decoder.resize(n_layer);
+
+        // global
+        {
+            model.shared  = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
+            model.lm_head = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
+
+            model.tensors["shared.weight"]  = model.shared;
+            model.tensors["lm_head.weight"] = model.lm_head;
+        }
+
+        // encoder
+        {
+            model.e_et = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
+            model.e_ln = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+            model.e_rab = ggml_new_tensor_2d(ctx, wtype, n_head, 32);
+
+            // map by name
+            model.tensors["encoder.embed_tokens.weight"]     = model.e_et;
+            model.tensors["encoder.final_layer_norm.weight"] = model.e_ln;
+
+            model.tensors["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = model.e_rab;
+
+            for (int i = 0; i < n_layer; ++i) {
+                auto & layer = model.layers_encoder[i];
+
+                layer.attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
+
+                layer.ln_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                layer.wi_0 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
+                layer.wi_1 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
+                layer.wo   = ggml_new_tensor_2d(ctx, wtype, d_ff, d_model);
+
+                layer.ln_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                // map by name
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.q.weight"] = layer.attn_q;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.k.weight"] = layer.attn_k;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.v.weight"] = layer.attn_v;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.o.weight"] = layer.attn_o;
+
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.layer_norm.weight"] = layer.ln_0;
+
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wi_0.weight"] = layer.wi_0;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wi_1.weight"] = layer.wi_1;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wo.weight"]   = layer.wo;
+
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1;
+            }
+        }
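Note the dimension order: ggml lists sizes fastest-varying first, so ne[0] is the contiguous dimension. A row-major PyTorch Linear weight of shape [out, in] therefore maps to ggml_new_tensor_2d(ctx, wtype, in, out), which is why SelfAttention.q above is created as (d_model, d_kv*n_head). A small illustration with the FLAN-T5 Small numbers (not part of the patch):

    // PyTorch: q.weight has shape [d_kv*n_head, d_model] = [384, 512], row-major
    // ggml:    ne[0] = contiguous dim = 512, ne[1] = 384
    struct ggml_tensor * q = ggml_new_tensor_2d(ctx, wtype, /*ne0 =*/ 512, /*ne1 =*/ 384);
    // q->ne[0] == 512, q->ne[1] == 384, ggml_nelements(q) == 196608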
+ std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1; + } + } + + // decoder + { + model.d_et = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab); + model.d_ln = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model); + + model.d_rab = ggml_new_tensor_2d(ctx, wtype, n_head, 32); + + // map by name + model.tensors["decoder.embed_tokens.weight"] = model.d_et; + model.tensors["decoder.final_layer_norm.weight"] = model.d_ln; + + model.tensors["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = model.d_rab; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers_decoder[i]; + + layer.attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head); + layer.attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head); + layer.attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head); + layer.attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model); + + layer.ln_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model); + + layer.cross_attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head); + layer.cross_attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head); + layer.cross_attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head); + layer.cross_attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model); + + layer.ln_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model); + + layer.wi_0 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff); + layer.wi_1 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff); + layer.wo = ggml_new_tensor_2d(ctx, wtype, d_ff, d_model); + + layer.ln_2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model); + + // map by name + model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.q.weight"] = layer.attn_q; + model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.k.weight"] = layer.attn_k; + model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.v.weight"] = layer.attn_v; + model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.o.weight"] = layer.attn_o; + + model.tensors["decoder.block." + std::to_string(i) + ".layer.0.layer_norm.weight"] = layer.ln_0; + + model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.q.weight"] = layer.cross_attn_q; + model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.k.weight"] = layer.cross_attn_k; + model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.v.weight"] = layer.cross_attn_v; + model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.o.weight"] = layer.cross_attn_o; + + model.tensors["decoder.block." + std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1; + + model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wi_0.weight"] = layer.wi_0; + model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wi_1.weight"] = layer.wi_1; + model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wo.weight"] = layer.wo; + + model.tensors["decoder.block." 
+ std::to_string(i) + ".layer.2.layer_norm.weight"] = layer.ln_2; + } + } + } + + // create the ggml memory context + { + struct ggml_init_params params; + params.mem_size = wctx.buf_memory.size(); + params.mem_buffer = wctx.buf_memory.data(); + + model.ctx_mem = ggml_init(params); + if (!model.ctx_mem) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // key + value memory + //{ + // auto & ctx = model.ctx_mem; + + // const auto & hparams = model.hparams; + + // const int n_text_state = hparams.n_text_state; + // const int n_text_layer = hparams.n_text_layer; + // const int n_text_ctx = hparams.n_text_ctx; + + // // key/value memory for the self-attention layer + // { + // const int n_mem = n_text_layer*n_text_ctx; + // const int n_elements = n_text_state*n_mem; + + // model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + // model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + // } + + // // key/value memory for the cross-attention layer + // { + // const int n_audio_ctx = hparams.n_audio_ctx; + + // const int n_mem = n_text_layer*n_audio_ctx; + // const int n_elements = n_text_state*n_mem; + + // model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + // model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + // } + + // const size_t memory_size = + // ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) + + // ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v); + + // fprintf(stderr, "%s: memory size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0); + //} + + // load weights + { + size_t total_size = 0; + + model.n_loaded = 0; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + read_safe(fin, n_dims); + read_safe(fin, length); + read_safe(fin, ftype); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[3] = { 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name; + std::vector tmp(length); // create a buffer + fin.read(&tmp[0], tmp.size()); // read to buffer + name.assign(&tmp[0], tmp.size()); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n", + __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]); + return false; + } + + const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); + + if (nelements*bpe != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); + model.n_loaded++; + } + + fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0); + + if (model.n_loaded == 0) { + fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + } else if (model.n_loaded != (int) model.tensors.size()) { + fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); + return false; + } + } + + fin.close(); + + return true; +} + +struct t5_context * t5_init(const char * path_model) { + ggml_time_init(); + + t5_context * ctx = new t5_context; + + const int64_t t_start_us = ggml_time_us(); + + ctx->t_start_us = t_start_us; + + if (!t5_model_load(path_model, *ctx)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model); + delete ctx; + return nullptr; + } + + ctx->t_load_us = ggml_time_us() - t_start_us; + + return ctx; +} + +void t5_free(struct t5_context * ctx) { + if (ctx) { + if (ctx->model.ctx) { + ggml_free(ctx->model.ctx); + } + if (ctx->model.ctx_mem) { + ggml_free(ctx->model.ctx_mem); + } + delete ctx; + } +} + +int main(int argc, char ** argv) { + if (argc < 2) { + fprintf(stderr, "usage: %s \n", argv[0]); + return -1; + } + + const char * path_model = argv[1]; + + t5_context * ctx = t5_init(path_model); + if (!ctx) { + fprintf(stderr, "%s: failed to initialize T5 context\n", __func__); + return -1; + } + + fprintf(stderr, "%s: model loaded in %7.2f ms\n", __func__, ctx->t_load_us/1000.0); + + t5_free(ctx); + return 0; }