t5 : initial load in ggml

t5 : initial ggml conversion of the model
t5 : add example for text-to-text transfer transformer inference
26 changed files with 2519 additions and 4162 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,4 +9,3 @@ compile_commands.json
 .DS_Store

 src/arm_neon.h
-tests/arm_neon.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -47,7 +47,6 @@ endif()

 #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
 #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")

 # dependencies

--- a/README.md
+++ b/README.md
@ -2,9 +2,6 @@

 Tensor library for machine learning

-***Note that this project is under development and not ready for production use. \
-Some of the development is currently happening in the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repo***
-
 ## Features

 - Written in C
@ -16,15 +13,9 @@ Some of the development is currently happening in the [whisper.cpp](https://gith
 - No third-party dependencies
 - Zero memory allocations during runtime

-## Roadmap
-
- [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
- [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j)
- [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper)
- [ ] Support 4-bit integer quantization https://github.com/ggerganov/ggml/pull/27
- [ ] Example of FLAN-T5 inference https://github.com/ggerganov/ggml/pull/12
- [ ] Example of LLaMA inference
- [ ] Example of RWKV inference
+***Note that this project is under development and not ready for production use.
+Most of the development is currently happening in the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repo,
+so if you are interested in this project, make sure to follow what is happening there.***

 ## Whisper inference (example)

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -4,3 +4,4 @@ target_include_directories(ggml_utils PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 add_subdirectory(gpt-2)
 add_subdirectory(gpt-j)
 add_subdirectory(whisper)
+add_subdirectory(t5)
--- a/examples/gpt-2/main.cpp
+++ b/examples/gpt-2/main.cpp
@ -347,7 +347,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
+//   - embd_w:    the predicted probabilities of the next token
 //
 bool gpt2_eval(
        const gpt2_model & model,
@ -496,6 +496,7 @@ bool gpt2_eval(
                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
                        );

+#if 0
            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
@ -503,6 +504,15 @@ bool gpt2_eval(
            // KQ = soft_max(KQ_masked)
            // [n_past + N, N, 12]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+#else
+            // KQ_masked = mask_past(KQ_scaled)
+            // [n_past + N, N, 12]
+            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+
+            // KQ = soft_max(KQ_masked)
+            // [n_past + N, N, 12]
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+#endif

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            // [n_past + N, 64, 12]
@ -627,7 +637,7 @@ bool gpt2_eval(
    inpL = ggml_mul_mat(ctx0, model.wte, inpL);

    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
+    inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@ -641,7 +651,7 @@ bool gpt2_eval(
    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    // return result just for the last token
+    // return result for just the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

@ -698,7 +708,7 @@ int main(int argc, char ** argv) {
    int64_t t_sample_us  = 0;
    int64_t t_predict_us = 0;

-    std::vector<float> logits;
+    std::vector<float> embd_w;

    // tokenize the prompt
    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
@ -714,14 +724,14 @@ int main(int argc, char ** argv) {

    // determine the required inference memory per token:
    size_t mem_per_token = 0;
-    gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+    gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, embd_w, mem_per_token);

    for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
        // predict
        if (embd.size() > 0) {
            const int64_t t_start_us = ggml_time_us();

-            if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+            if (!gpt2_eval(model, params.n_threads, n_past, embd, embd_w, mem_per_token)) {
                printf("Failed to predict\n");
                return 1;
            }
@ -745,7 +755,7 @@ int main(int argc, char ** argv) {
            {
                const int64_t t_start_sample_us = ggml_time_us();

-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
+                id = gpt_sample_top_k_top_p(vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, rng);

                t_sample_us += ggml_time_us() - t_start_sample_us;
            }
--- a/examples/gpt-j/README.md
+++ b/examples/gpt-j/README.md
@ -214,11 +214,8 @@ make -j4 gpt-j
 ```

 To run the `gpt-j` tool, you need the 12GB `ggml-model.bin` file which contains the GPT-J model in
-[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, the binary file
-is downloaded from my repository on Hugging Face using the [download-ggml-model.sh](download-ggml-model.sh) script.
-You can also, download the file manually from this link:
-
-https://huggingface.co/datasets/ggerganov/ggml/tree/main
+[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, I download the binary file
+directly from one of my servers, using the [download-ggml-model.sh](download-ggml-model.sh) script.

 ---

--- a/examples/gpt-j/main.cpp
+++ b/examples/gpt-j/main.cpp
@ -355,7 +355,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
+//   - embd_w:    the predicted probabilities of the next token
 //
 // The GPT-J model requires about 16MB of memory per input token.
 //
@ -559,7 +559,7 @@ bool gptj_eval(
    }

    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
+    inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@ -630,7 +630,7 @@ int main(int argc, char ** argv) {
    int64_t t_sample_us  = 0;
    int64_t t_predict_us = 0;

-    std::vector<float> logits;
+    std::vector<float> embd_w;

    // tokenize the prompt
    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
@ -644,14 +644,14 @@ int main(int argc, char ** argv) {

    // determine the required inference memory per token:
    size_t mem_per_token = 0;
-    gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+    gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, embd_w, mem_per_token);

    for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
        // predict
        if (embd.size() > 0) {
            const int64_t t_start_us = ggml_time_us();

-            if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+            if (!gptj_eval(model, params.n_threads, n_past, embd, embd_w, mem_per_token)) {
                printf("Failed to predict\n");
                return 1;
            }
@ -675,7 +675,7 @@ int main(int argc, char ** argv) {
            {
                const int64_t t_start_sample_us = ggml_time_us();

-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
+                id = gpt_sample_top_k_top_p(vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, rng);

                t_sample_us += ggml_time_us() - t_start_sample_us;
            }
--- a/examples/t5/CMakeLists.txt
+++ b/examples/t5/CMakeLists.txt
@ -0,0 +1,6 @@
+#
+# t5 example: builds one executable named `t5` from main.cpp
+
+set(TEST_TARGET t5) # target name, reused by the two commands below
+add_executable(${TEST_TARGET} main.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml ggml_utils) # links the ggml library and the ggml_utils target
--- a/examples/t5/README.md
+++ b/examples/t5/README.md
@ -0,0 +1,3 @@
+# t5
+
+ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
--- a/examples/t5/convert-flan-t5-pt-to-ggml.py
+++ b/examples/t5/convert-flan-t5-pt-to-ggml.py
@ -0,0 +1,119 @@
+import io
+import sys
+import torch
+import json
+import struct
+import numpy
+
+import code # tmp
+
+#from transformers import AutoModelForSeq2SeqLM
+from transformers import AutoTokenizer
+
+if len(sys.argv) < 3:
+    print("Usage: convert-flan-t5-pt-to-ggml.py path-to-pt-model dir-output [use-f32]\n")
+    sys.exit(1)
+
+dir_inp = sys.argv[1]
+dir_out = sys.argv[2]
+
+fname_inp = dir_inp + "/pytorch_model.bin"
+fname_out = dir_out + "/ggml-t5-model.bin"
+
+fname_config = dir_inp + "/config.json"
+
+# use 16-bit or 32-bit floats
+use_f16 = True
+if len(sys.argv) > 3:
+    use_f16 = False
+    fname_out = dir_out + "/ggml-t5-model-f32.bin"
+
+# load torch model
+try:
+    model_bytes = open(fname_inp, "rb").read()
+    with io.BytesIO(model_bytes) as fp:
+        checkpoint = torch.load(fp, map_location="cpu")
+except:
+    print("Error: failed to load PyTorch model file: %s" % fname_inp)
+    sys.exit(1)
+
+# load config (json)
+config = json.load(open(fname_config, "r"))
+
+# list all keys
+for k in checkpoint.keys():
+    print(k)
+
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
+
+# list methods of tokenizer
+for m in dir(tokenizer):
+    print(m)
+
+print(config)
+
+fout = open(fname_out, "wb")
+
+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", config["vocab_size"]))
+fout.write(struct.pack("i", config["d_ff"]))
+fout.write(struct.pack("i", config["d_kv"]))
+fout.write(struct.pack("i", config["d_model"]))
+fout.write(struct.pack("i", config["n_positions"]))
+fout.write(struct.pack("i", config["num_heads"]))
+fout.write(struct.pack("i", config["num_layers"]))
+fout.write(struct.pack("i", use_f16))
+
+# sort tokenizer.vocab by value
+tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1])
+fout.write(struct.pack("i", len(tokens)))
+
+print("tokens: %d" % len(tokens))
+
+for key in tokens:
+    # TODO: this probably is wrong, but it should work for english at least
+    token = key[0].replace("▁", " ")
+    text = bytearray(token, "utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+
+# tokenize "hello world"
+#print(tokenizer.encode("Hello hello world.Hello-Hello"))
+#print(tokenizer("добър ден", return_tensors="pt"))
+
+# dump weights
+for k in checkpoint.keys():
+    data = checkpoint[k].squeeze().numpy()
+
+    name = k
+    n_dims = len(data.shape)
+    print(name, n_dims, data.shape)
+
+    ftype = 1;
+    if use_f16:
+        if n_dims < 2:
+            print("  Converting to float32")
+            ftype = 0
+        else:
+            print("  Converting to float16")
+            data = data.astype(numpy.float16)
+            ftype = 1
+    else:
+        ftype = 0
+
+    # header
+    str = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    for i in range(n_dims):
+        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+    fout.write(str);
+
+    # data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")
+
+#code.interact(local=locals())
--- a/examples/t5/main.cpp
+++ b/examples/t5/main.cpp
@ -0,0 +1,740 @@
+#include "ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+// available t5 model sizes; t5_model_load picks one from the n_layer / n_head values in the file header
+enum e_model {
+    MODEL_UNKNOWN, // default of t5_model::type; has no entry in the MEM_REQ_* tables
+    MODEL_SMALL,   // selected when n_layer == 8
+    MODEL_BASE,    // selected when n_layer == 12
+    MODEL_LARGE,   // selected when n_layer == 24 && n_head == 16
+    MODEL_XL,      // selected when n_layer == 24 && n_head == 32
+    MODEL_XXL,     // selected when n_layer == 24 && n_head == 64
+};
+
+static const size_t MB = 4*1024*1024; // NOTE(review): this is 4 MiB, not 1 MiB, so every MEM_REQ_* entry is 4x its nominal figure - confirm intended
+
+static const std::map<e_model, size_t> MEM_REQ_MODEL = { // size of t5_context::buf_model, the buffer handed to ggml_init for model.ctx (weights)
+    { MODEL_SMALL,   74ull*MB },
+    { MODEL_BASE,   142ull*MB },
+    { MODEL_LARGE,  466ull*MB },
+    { MODEL_XL,    1464ull*MB },
+    { MODEL_XXL,   2952ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_MEMORY = { // size of t5_context::buf_memory, the buffer handed to ggml_init for model.ctx_mem
+    { MODEL_SMALL,  12ull*MB },
+    { MODEL_BASE,   24ull*MB },
+    { MODEL_LARGE,  70ull*MB },
+    { MODEL_XL,    184ull*MB },
+    { MODEL_XXL,   306ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_ENCODE = { // buf_compute is sized to max(MEM_REQ_ENCODE, MEM_REQ_DECODE)
+    { MODEL_SMALL,   80ull*MB },
+    { MODEL_BASE,   128ull*MB },
+    { MODEL_LARGE,  300ull*MB },
+    { MODEL_XL,     680ull*MB },
+    { MODEL_XXL,   1100ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = { // buf_compute_layer is sized to max(MEM_REQ_ENCODE_LAYER, MEM_REQ_DECODE_LAYER)
+    { MODEL_SMALL, 104ull*MB },
+    { MODEL_BASE,  138ull*MB },
+    { MODEL_LARGE, 208ull*MB },
+    { MODEL_XL,    280ull*MB },
+    { MODEL_XXL,   354ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_DECODE = { // decoder counterpart of MEM_REQ_ENCODE; the larger of the two sizes buf_compute
+    { MODEL_SMALL, 200ull*MB },
+    { MODEL_BASE,  202ull*MB },
+    { MODEL_LARGE, 204ull*MB },
+    { MODEL_XL,    206ull*MB },
+    { MODEL_XXL,   208ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_DECODE_LAYER = { // decoder counterpart of MEM_REQ_ENCODE_LAYER; the larger of the two sizes buf_compute_layer
+    { MODEL_SMALL,  32ull*MB },
+    { MODEL_BASE,   44ull*MB },
+    { MODEL_LARGE,  64ull*MB },
+    { MODEL_XL,     84ull*MB },
+    { MODEL_XXL,   110ull*MB },
+};
+
+struct t5_vocab { // two-way token <-> id lookup, filled by t5_model_load
+    using id    = int32_t;
+    using token = std::string;
+
+    int n_vocab = 32128; // default; the loader overwrites it with hparams.n_vocab after reading the vocab
+
+    std::map<token, id> token_to_id; // token text -> id; identical token texts overwrite each other here
+    std::map<id, token> id_to_token; // id -> token text
+};
+
+// default hparams (FLAN-T5 Small); the loader reads these fields from the file in exactly this order
+struct t5_hparams {
+    int32_t n_vocab     = 32128; // written by the converter from config["vocab_size"]
+    int32_t d_ff        = 1024;  // config["d_ff"]
+    int32_t d_kv        = 64;    // config["d_kv"]; attention projections are d_kv*n_head wide
+    int32_t d_model     = 512;   // config["d_model"]
+    int32_t n_positions = 512;   // config["n_positions"]
+    int32_t n_head      = 6;     // config["num_heads"]
+    int32_t n_layer     = 8;     // config["num_layers"]; used for both the encoder and the decoder stacks
+    int32_t f16         = 1;     // non-zero: large tensors are allocated as GGML_TYPE_F16, otherwise F32
+};
+
+struct t5_layer_encoder { // weights of one encoder block; names below are the checkpoint tensor-name prefixes
+    // encoder.block.*.layer.0.SelfAttention
+    struct ggml_tensor * attn_q; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * attn_k; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * attn_v; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * attn_o; // 2-D (d_kv*n_head, d_model)
+
+    // encoder.block.*.layer.0.layer_norm
+    struct ggml_tensor * ln_0; // 1-D F32 (d_model)
+
+    // encoder.block.*.layer.1.DenseReluDense
+    struct ggml_tensor * wi_0; // 2-D (d_model, d_ff)
+    struct ggml_tensor * wi_1; // 2-D (d_model, d_ff)
+    struct ggml_tensor * wo;   // 2-D (d_ff, d_model)
+
+    // encoder.block.*.layer.1.layer_norm
+    struct ggml_tensor * ln_1; // 1-D F32 (d_model)
+};
+
+struct t5_layer_decoder { // weights of one decoder block; names below are the checkpoint tensor-name prefixes
+    // decoder.block.*.layer.0.SelfAttention
+    struct ggml_tensor * attn_q; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * attn_k; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * attn_v; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * attn_o; // 2-D (d_kv*n_head, d_model)
+
+    // decoder.block.*.layer.0.layer_norm
+    struct ggml_tensor * ln_0; // 1-D F32 (d_model)
+
+    // decoder.block.*.layer.1.EncDecAttention
+    struct ggml_tensor * cross_attn_q; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * cross_attn_k; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * cross_attn_v; // 2-D (d_model, d_kv*n_head)
+    struct ggml_tensor * cross_attn_o; // 2-D (d_kv*n_head, d_model)
+
+    // decoder.block.*.layer.1.layer_norm
+    struct ggml_tensor * ln_1; // 1-D F32 (d_model)
+
+    // decoder.block.*.layer.2.DenseReluDense
+    struct ggml_tensor * wi_0; // 2-D (d_model, d_ff)
+    struct ggml_tensor * wi_1; // 2-D (d_model, d_ff)
+    struct ggml_tensor * wo;   // 2-D (d_ff, d_model)
+
+    // decoder.block.*.layer.2.layer_norm
+    struct ggml_tensor * ln_2; // 1-D F32 (d_model)
+};
+
+struct t5_model { // hyper-parameters, weight tensors and the ggml contexts that own them
+    e_model type = MODEL_UNKNOWN; // set by the loader from n_layer / n_head
+
+    t5_hparams hparams;
+
+    // shared (registered as "shared.weight")
+    struct ggml_tensor * shared;
+
+    // encoder.embed_tokens
+    struct ggml_tensor * e_et;
+
+    // encoder.final_layer_norm
+    struct ggml_tensor * e_ln;
+
+    // encoder.block.0.layer.0.SelfAttention.relative_attention_bias
+    struct ggml_tensor * e_rab; // 2-D (n_head, 32); the 32 is hard-coded - presumably the bucket count, TODO confirm
+
+    // decoder.embed_tokens
+    struct ggml_tensor * d_et;
+
+    // decoder.final_layer_norm
+    struct ggml_tensor * d_ln;
+
+    // decoder.block.0.layer.0.SelfAttention.relative_attention_bias
+    struct ggml_tensor * d_rab; // 2-D (n_head, 32), same hard-coded 32 as e_rab
+
+    // lm_head (registered as "lm_head.weight")
+    struct ggml_tensor * lm_head;
+
+    std::vector<t5_layer_encoder> layers_encoder; // resized to n_layer by the loader
+    std::vector<t5_layer_decoder> layers_decoder; // resized to n_layer by the loader
+
+    // contexts: ctx is backed by buf_model (weights), ctx_mem by buf_memory
+    struct ggml_context * ctx;
+    struct ggml_context * ctx_mem;
+
+    // tensors: checkpoint tensor name -> allocated tensor; n_loaded has no default initializer here
+    int n_loaded;
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+struct t5_context { // everything one loaded model needs: timers, raw buffers, model, vocab, outputs
+    int64_t t_load_us   = 0; // timing fields; presumably microseconds given the _us suffix - TODO confirm
+    int64_t t_sample_us = 0;
+    int64_t t_encode_us = 0;
+    int64_t t_decode_us = 0;
+    int64_t t_start_us  = 0;
+
+    std::vector<uint8_t> buf_model;         // sized from MEM_REQ_MODEL; backs model.ctx
+    std::vector<uint8_t> buf_memory;        // sized from MEM_REQ_MEMORY; backs model.ctx_mem
+    std::vector<uint8_t> buf_compute;       // sized to max(MEM_REQ_ENCODE, MEM_REQ_DECODE)
+    std::vector<uint8_t> buf_compute_layer; // sized to max(MEM_REQ_ENCODE_LAYER, MEM_REQ_DECODE_LAYER)
+
+    t5_model model;
+    t5_vocab vocab;
+
+    std::vector<float> probs;  // loader reserves n_vocab*d_model elements
+    std::vector<float> logits; // loader reserves n_vocab*d_model elements
+};
+
+template<typename T> // T: callers in this file pass int32_t / uint32_t objects
+static void read_safe(std::ifstream& fin, T& dest) { // copies sizeof(T) raw bytes from fin into dest
+    fin.read((char*)& dest, sizeof(T)); // stream state is not checked here; callers must test fin themselves
+}
+
+static bool t5_model_load(const std::string & fname, t5_context & wctx) {
+    fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
+
+    auto & model = wctx.model;
+    auto & vocab = wctx.vocab;
+
+    auto fin = std::ifstream(fname, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        read_safe(fin, magic);
+        if (magic != 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+            return false;
+        }
+    }
+
+    //load hparams
+    {
+        auto & hparams = model.hparams;
+
+        read_safe(fin, hparams.n_vocab);
+        read_safe(fin, hparams.d_ff);
+        read_safe(fin, hparams.d_kv);
+        read_safe(fin, hparams.d_model);
+        read_safe(fin, hparams.n_positions);
+        read_safe(fin, hparams.n_head);
+        read_safe(fin, hparams.n_layer);
+        read_safe(fin, hparams.f16);
+
+        assert(hparams.n_text_state == hparams.n_audio_state);
+
+        if (hparams.n_layer == 8) {
+            model.type = e_model::MODEL_SMALL;
+        }
+
+        if (hparams.n_layer == 12) {
+            model.type = e_model::MODEL_BASE;
+        }
+
+        if (hparams.n_layer == 24 && hparams.n_head == 16) {
+            model.type = e_model::MODEL_LARGE;
+        }
+
+        if (hparams.n_layer == 24 && hparams.n_head == 32) {
+            model.type = e_model::MODEL_XL;
+        }
+
+        if (hparams.n_layer == 24 && hparams.n_head == 64) {
+            model.type = e_model::MODEL_XXL;
+        }
+
+        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: d_ff          = %d\n", __func__, hparams.d_ff);
+        fprintf(stderr, "%s: d_kv          = %d\n", __func__, hparams.d_kv);
+        fprintf(stderr, "%s: d_model       = %d\n", __func__, hparams.d_model);
+        fprintf(stderr, "%s: n_positions   = %d\n", __func__, hparams.n_positions);
+        fprintf(stderr, "%s: n_head        = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer       = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: f16           = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: type          = %d\n", __func__, model.type);
+
+        wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type));
+        wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
+        wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
+        wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
+    }
+
+    // load vocab
+    {
+        int32_t n_vocab = 0;
+        read_safe(fin, n_vocab);
+
+        //if (n_vocab != model.hparams.n_vocab) {
+        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
+        //    return false;
+        //}
+
+        std::string word;
+        std::vector<char> tmp;
+
+        tmp.reserve(128);
+
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len;
+            read_safe(fin, len);
+
+            if (len > 0) {
+                tmp.resize(len);
+                fin.read(&tmp[0], tmp.size()); // read to buffer
+                word.assign(&tmp[0], tmp.size());
+            } else {
+                // seems like we have an empty-string token in multi-language models (i = 50256)
+                //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+                word = "";
+            }
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+
+            //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+        }
+
+        vocab.n_vocab = model.hparams.n_vocab;
+
+        if (n_vocab < model.hparams.n_vocab) {
+            fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
+            for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
+                word = "[_extra_token_" + std::to_string(i) + "]";
+                vocab.token_to_id[word] = i;
+                vocab.id_to_token[i] = word;
+            }
+        }
+
+        wctx.logits.reserve(vocab.n_vocab*model.hparams.d_model);
+        wctx.probs.reserve(vocab.n_vocab*model.hparams.d_model);
+    }
+
+    {
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+                   wctx.buf_model.size() +
+                   wctx.buf_memory.size() +
+                   wctx.buf_compute.size() +
+                   wctx.buf_compute_layer.size();
+
+        fprintf(stderr, "%s: mem_required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+    }
+
+    // for the big tensors, we have the option to store the data in 16-bit floats
+    // in order to save memory and also to speed up the computation
+    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    size_t ctx_size = 0;
+
+    {
+        const auto & hparams = model.hparams;
+
+        const int n_vocab = hparams.n_vocab;
+        const int d_ff    = hparams.d_ff;
+        const int d_kv    = hparams.d_kv;
+        const int d_model = hparams.d_model;
+        const int n_head  = hparams.n_head;
+        const int n_layer = hparams.n_layer;
+
+        ctx_size += n_vocab*d_model*ggml_type_size(wtype); // shared;
+        ctx_size += n_vocab*d_model*ggml_type_size(wtype); // lm_head;
+
+        // encoder
+        {
+            ctx_size += n_vocab*d_model*ggml_type_size(wtype); // e_et;
+            ctx_size +=         d_model*ggml_type_size(GGML_TYPE_F32); // e_ln
+            ctx_size +=       32*n_head*ggml_type_size(wtype); // e_rab
+        }
+
+        // decoder
+        {
+            ctx_size += n_vocab*d_model*ggml_type_size(wtype); // d_et;
+            ctx_size +=         d_model*ggml_type_size(GGML_TYPE_F32); // d_ln
+            ctx_size +=       32*n_head*ggml_type_size(wtype); // d_rab
+        }
+
+        // encoder layers
+        {
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_q
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_k
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_v
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_o
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_0
+
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_0
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_1
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wo
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_1
+        }
+
+        // decoder layers
+        {
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_q
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_k
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_v
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_o
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_0
+
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_q
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_k
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_v
+            ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_o
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_1
+
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_0
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_1
+            ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wo
+
+            ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_2
+        }
+
+        ctx_size += (15 + 9*n_layer + 14*n_layer)*256; // object overhead
+
+        fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+    }
+
+    // create the ggml context
+    {
+        struct ggml_init_params params;
+        params.mem_size   = wctx.buf_model.size();
+        params.mem_buffer = wctx.buf_model.data();
+
+        model.ctx = ggml_init(params);
+        if (!model.ctx) {
+            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            return false;
+        }
+    }
+
+    // prepare memory for the weights
+    {
+        auto & ctx = model.ctx;
+
+        const auto & hparams = model.hparams;
+
+        const int n_vocab = hparams.n_vocab;
+        const int d_ff    = hparams.d_ff;
+        const int d_kv    = hparams.d_kv;
+        const int d_model = hparams.d_model;
+        const int n_head  = hparams.n_head;
+        const int n_layer = hparams.n_layer;
+
+        model.layers_encoder.resize(n_layer);
+        model.layers_decoder.resize(n_layer);
+
+        // global
+        {
+            model.shared  = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
+            model.lm_head = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
+
+            model.tensors["shared.weight"]  = model.shared;
+            model.tensors["lm_head.weight"] = model.lm_head;
+        }
+
+        // encoder
+        {
+            model.e_et = ggml_new_tensor_2d(ctx,         wtype, d_model, n_vocab);
+            model.e_ln = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+            model.e_rab = ggml_new_tensor_2d(ctx, wtype, n_head, 32);
+
+            // map by name
+            model.tensors["encoder.embed_tokens.weight"]     = model.e_et;
+            model.tensors["encoder.final_layer_norm.weight"] = model.e_ln;
+
+            model.tensors["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = model.e_rab;
+
+            for (int i = 0; i < n_layer; ++i) {
+                auto & layer = model.layers_encoder[i];
+
+                layer.attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
+
+                layer.ln_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                layer.wi_0 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
+                layer.wi_1 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
+                layer.wo   = ggml_new_tensor_2d(ctx, wtype, d_ff, d_model);
+
+                layer.ln_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                // map by name
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.q.weight"] = layer.attn_q;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.k.weight"] = layer.attn_k;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.v.weight"] = layer.attn_v;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.o.weight"] = layer.attn_o;
+
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.0.layer_norm.weight"] = layer.ln_0;
+
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wi_0.weight"] = layer.wi_0;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wi_1.weight"] = layer.wi_1;
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wo.weight"]   = layer.wo;
+
+                model.tensors["encoder.block." + std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1;
+            }
+        }
+
+        // decoder
+        {
+            model.d_et = ggml_new_tensor_2d(ctx,         wtype, d_model, n_vocab);
+            model.d_ln = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+            model.d_rab = ggml_new_tensor_2d(ctx, wtype, n_head, 32);
+
+            // map by name
+            model.tensors["decoder.embed_tokens.weight"]     = model.d_et;
+            model.tensors["decoder.final_layer_norm.weight"] = model.d_ln;
+
+            model.tensors["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = model.d_rab;
+
+            for (int i = 0; i < n_layer; ++i) {
+                auto & layer = model.layers_decoder[i];
+
+                layer.attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
+
+                layer.ln_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                layer.cross_attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.cross_attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.cross_attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
+                layer.cross_attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
+
+                layer.ln_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                layer.wi_0 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
+                layer.wi_1 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
+                layer.wo   = ggml_new_tensor_2d(ctx, wtype, d_ff, d_model);
+
+                layer.ln_2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
+
+                // map by name
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.q.weight"] = layer.attn_q;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.k.weight"] = layer.attn_k;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.v.weight"] = layer.attn_v;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.o.weight"] = layer.attn_o;
+
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.0.layer_norm.weight"] = layer.ln_0;
+
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.q.weight"] = layer.cross_attn_q;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.k.weight"] = layer.cross_attn_k;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.v.weight"] = layer.cross_attn_v;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.o.weight"] = layer.cross_attn_o;
+
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1;
+
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wi_0.weight"] = layer.wi_0;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wi_1.weight"] = layer.wi_1;
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wo.weight"]   = layer.wo;
+
+                model.tensors["decoder.block." + std::to_string(i) + ".layer.2.layer_norm.weight"] = layer.ln_2;
+            }
+        }
+    }
+
+    // create the ggml memory context
+    {
+        struct ggml_init_params params;
+        params.mem_size   = wctx.buf_memory.size();
+        params.mem_buffer = wctx.buf_memory.data();
+
+        model.ctx_mem = ggml_init(params);
+        if (!model.ctx_mem) {
+            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            return false;
+        }
+    }
+
+    // key + value memory
+    //{
+    //    auto & ctx = model.ctx_mem;
+
+    //    const auto & hparams = model.hparams;
+
+    //    const int n_text_state = hparams.n_text_state;
+    //    const int n_text_layer = hparams.n_text_layer;
+    //    const int n_text_ctx   = hparams.n_text_ctx;
+
+    //    // key/value memory for the self-attention layer
+    //    {
+    //        const int n_mem      = n_text_layer*n_text_ctx;
+    //        const int n_elements = n_text_state*n_mem;
+
+    //        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+    //        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+    //    }
+
+    //    // key/value memory for the cross-attention layer
+    //    {
+    //        const int n_audio_ctx = hparams.n_audio_ctx;
+
+    //        const int n_mem      = n_text_layer*n_audio_ctx;
+    //        const int n_elements = n_text_state*n_mem;
+
+    //        model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+    //        model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+    //    }
+
+    //    const size_t memory_size =
+    //        ggml_nbytes(model.memory_k)       + ggml_nbytes(model.memory_v) +
+    //        ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
+
+    //    fprintf(stderr, "%s: memory size   = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
+    //}
+
+    // load weights
+    {
+        size_t total_size = 0;
+
+        model.n_loaded = 0;
+
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ftype;
+
+            read_safe(fin, n_dims);
+            read_safe(fin, length);
+            read_safe(fin, ftype);
+
+            if (fin.eof()) {
+                break;
+            }
+
+            int32_t nelements = 1;
+            int32_t ne[3] = { 1, 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                read_safe(fin, ne[i]);
+                nelements *= ne[i];
+            }
+
+            std::string name;
+            std::vector<char> tmp(length); // create a buffer
+            fin.read(&tmp[0], tmp.size()); // read to buffer
+            name.assign(&tmp[0], tmp.size());
+
+            if (model.tensors.find(name) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                return false;
+            }
+
+            auto tensor = model.tensors[name.data()];
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                return false;
+            }
+
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                        __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
+                return false;
+            }
+
+            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+
+            if (nelements*bpe != ggml_nbytes(tensor)) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+                return false;
+            }
+
+            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+
+            printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            total_size += ggml_nbytes(tensor);
+            model.n_loaded++;
+        }
+
+        fprintf(stderr, "%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
+
+        if (model.n_loaded == 0) {
+            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+        } else if (model.n_loaded != (int) model.tensors.size()) {
+            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            return false;
+        }
+    }
+
+    fin.close();
+
+    return true;
+}
+
+// Allocate a t5_context and load the model file at path_model into it.
+//
+// On success the new context is returned, with ctx->t_start_us set to the
+// ggml_time_us() reading taken before loading and ctx->t_load_us set to the
+// time elapsed during t5_model_load(). The caller releases it with t5_free().
+//
+// On failure an error is printed to stderr, the context object is deleted
+// and nullptr is returned.
+struct t5_context * t5_init(const char * path_model) {
+    ggml_time_init();
+
+    t5_context * ctx = new t5_context;
+
+    const int64_t t_start_us = ggml_time_us();
+
+    ctx->t_start_us = t_start_us;
+
+    if (!t5_model_load(path_model, *ctx)) {
+        fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
+        // NOTE(review): only the context object itself is deleted here; ggml
+        // contexts that t5_model_load() may have created before failing are
+        // not passed to ggml_free() on this path - confirm whether this leaks.
+        delete ctx;
+        return nullptr;
+    }
+
+    ctx->t_load_us = ggml_time_us() - t_start_us;
+
+    return ctx;
+}
+
+// Release a context returned by t5_init().
+//
+// Calls ggml_free() on model.ctx and model.ctx_mem when they are non-null,
+// then deletes the context object. Passing a null ctx does nothing.
+void t5_free(struct t5_context * ctx) {
+    if (ctx) {
+        if (ctx->model.ctx) {
+            ggml_free(ctx->model.ctx);
+        }
+        if (ctx->model.ctx_mem) {
+            ggml_free(ctx->model.ctx_mem);
+        }
+        delete ctx;
+    }
+}
+
+// Program entry point: loads the model named by the first command-line
+// argument, prints how long loading took and frees the context again.
+//
+// Returns 0 on success and -1 when the model argument is missing or the
+// context could not be initialized.
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <model>\n", argv[0]);
+        return -1;
+    }
+
+    const char * path_model = argv[1];
+
+    t5_context * ctx = t5_init(path_model);
+    if (!ctx) {
+        fprintf(stderr, "%s: failed to initialize T5 context\n", __func__);
+        return -1;
+    }
+
+    // t_load_us is divided by 1000 so the printed value matches the "ms" label
+    fprintf(stderr, "%s: model loaded in %7.2f ms\n", __func__, ctx->t_load_us/1000.0);
+
+    t5_free(ctx);
+
+    return 0;
+}
--- a/examples/utils.cpp
+++ b/examples/utils.cpp
@ -261,11 +261,8 @@ gpt_vocab::id gpt_sample_top_k_top_p(
    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

-    {
-        const double scale = 1.0/temp;
-        for (int i = 0; i < n_logits; ++i) {
-            logits_id.push_back(std::make_pair(logits[i]*scale, i));
-        }
+    for (int i = 0; i < n_logits; i++) {
+        logits_id.push_back(std::make_pair(logits[i], i));
    }

    // find the top K tokens
@ -278,51 +275,59 @@ gpt_vocab::id gpt_sample_top_k_top_p(

    logits_id.resize(top_k);

-    double maxl = -INFINITY;
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
-
-    // compute probs for the top K tokens
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    double sum = 0.0;
-    for (const auto & kv : logits_id) {
-        double p = exp(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
-    }
+    // normalize
+    {
+        double sum = 0.0f;
+        for (int i = 0; i < (int)logits_id.size(); i++) {
+            sum += logits_id[i].first;
+        }

-    // normalize the probs
-    for (auto & p : probs) {
-        p /= sum;
+        sum = 1.0/sum;
+        for (int i = 0; i < (int)logits_id.size(); i++) {
+            logits_id[i].first *= sum;
+        }
    }

    if (top_p < 1.0f) {
-        double cumsum = 0.0f;
-        for (int i = 0; i < top_k; i++) {
-            cumsum += probs[i];
-            if (cumsum >= top_p) {
-                top_k = i + 1;
-                probs.resize(top_k);
-                logits_id.resize(top_k);
-                break;
+        {
+            double cumsum = 0.0f;
+            for (int i = 0; i < top_k; i++) {
+                cumsum += logits_id[i].first;
+                if (cumsum >= top_p) {
+                    logits_id.resize(i+1);
+                    break;
+                }
            }
        }

-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
+        // normalize again
+        {
+            double sum = 0.0f;
+            for (int i = 0; i < (int)logits_id.size(); i++) {
+                sum += logits_id[i].first;
+            }
+
+            sum = 1.0/sum;
+            for (int i = 0; i < (int)logits_id.size(); i++) {
+                logits_id[i].first *= sum;
+            }
        }
    }

    //printf("\n");
-    //for (int i = 0; i < (int) probs.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //for (int i = 0; i < (int)logits_id.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
    //}
    //exit(0);

+    // sample from the obtained distribution
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    for (int i = 0; i < (int) logits_id.size(); i++) {
+        probs.push_back(logits_id[i].first);
+    }
+
    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

--- a/examples/whisper/CMakeLists.txt
+++ b/examples/whisper/CMakeLists.txt
@ -1,7 +1,7 @@
 #
 # whisper

-add_library(whisper-cpp
+add_library(whisper-cpp SHARED
    whisper.cpp
    )

@ -10,6 +10,6 @@ target_link_libraries(whisper-cpp PRIVATE
    )

 set(TEST_TARGET whisper)
-add_executable(${TEST_TARGET} main.cpp common.cpp)
+add_executable(${TEST_TARGET} main.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE whisper-cpp)
 target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
--- a/examples/whisper/common.cpp
+++ b/examples/whisper/common.cpp
@ -1,162 +0,0 @@
-#include "common.h"
-
-// third-party utilities
-// use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-
-#include <cmath>
-#include <regex>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-std::string trim(const std::string & s) {
-    std::regex e("^\\s+|\\s+$");
-    return std::regex_replace(s, e, "");
-}
-
-std::string replace(const std::string & s, const std::string & from, const std::string & to) {
-    std::string result = s;
-    size_t pos = 0;
-    while ((pos = result.find(from, pos)) != std::string::npos) {
-        result.replace(pos, from.length(), to);
-        pos += to.length();
-    }
-    return result;
-}
-
-bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-    if (fname == "-") {
-        {
-            uint8_t buf[1024];
-            while (true)
-            {
-                const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                if (n == 0) {
-                    break;
-                }
-                wav_data.insert(wav_data.end(), buf, buf + n);
-            }
-        }
-
-        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from stdin\n");
-            return false;
-        }
-
-        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    }
-    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
-        return false;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (stereo && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
-        return false;
-    }
-
-    if (wav.bitsPerSample != 16) {
-        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
-        return false;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (stereo) {
-        // convert to stereo, float
-        pcmf32s.resize(2);
-
-        pcmf32s[0].resize(n);
-        pcmf32s[1].resize(n);
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-        }
-    }
-
-    return true;
-}
-
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-
-    float y = data[0];
-
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-
-    for (int i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-
-    return true;
-}
--- a/examples/whisper/common.h
+++ b/examples/whisper/common.h
@ -1,40 +0,0 @@
-#pragma once
-
-// needs to match WHISPER_SAMPLE_RATE
-#define COMMON_SAMPLE_RATE 16000
-
-#include <vector>
-#include <string>
-
-std::string trim(const std::string & s);
-
-std::string replace(
-        const std::string & s,
-        const std::string & from,
-        const std::string & to);
-
-// Read WAV audio file and store the PCM data into pcmf32
-// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
-// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
-bool read_wav(
-        const std::string & fname,
-        std::vector<float> & pcmf32,
-        std::vector<std::vector<float>> & pcmf32s,
-        bool stereo);
-
-// Apply a high-pass frequency filter to PCM audio
-// Suppresses frequencies below cutoff Hz
-void high_pass_filter(
-        std::vector<float> & data,
-        float cutoff,
-        float sample_rate);
-
-// Basic voice activity detection (VAD) using audio energy adaptive threshold
-bool vad_simple(
-        std::vector<float> & pcmf32,
-        int   sample_rate,
-        int   last_ms,
-        float vad_thold,
-        float freq_thold,
-        bool  verbose);
-
--- a/examples/whisper/main.cpp
+++ b/examples/whisper/main.cpp
@ -1,7 +1,10 @@
-#include "common.h"
-
 #include "whisper.h"

+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
 #include <cmath>
 #include <fstream>
 #include <cstdio>
@ -50,24 +53,18 @@ void replace_all(std::string & s, const std::string & search, const std::string
 // command-line parameters
 struct whisper_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors =  1;
-    int32_t offset_t_ms  =  0;
-    int32_t offset_n     =  0;
-    int32_t duration_ms  =  0;
+    int32_t n_processors = 1;
+    int32_t offset_t_ms  = 0;
+    int32_t offset_n     = 0;
+    int32_t duration_ms  = 0;
    int32_t max_context  = -1;
-    int32_t max_len      =  0;
-    int32_t best_of      =  5;
-    int32_t beam_size    = -1;
+    int32_t max_len      = 0;

-    float word_thold    =  0.01f;
-    float entropy_thold =  2.40f;
-    float logprob_thold = -1.00f;
+    float word_thold = 0.01f;

    bool speed_up       = false;
    bool translate      = false;
    bool diarize        = false;
-    bool split_on_word  = false;
-    bool no_fallback    = false;
    bool output_txt     = false;
    bool output_vtt     = false;
    bool output_srt     = false;
@ -83,7 +80,6 @@ struct whisper_params {
    std::string model    = "models/ggml-base.en.bin";

    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
 };

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -92,11 +88,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

-        if (arg == "-"){
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
        if (arg[0] != '-') {
            params.fname_inp.push_back(arg);
            continue;
@ -113,22 +104,15 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-d"    || arg == "--duration")       { params.duration_ms    = std::stoi(argv[++i]); }
        else if (arg == "-mc"   || arg == "--max-context")    { params.max_context    = std::stoi(argv[++i]); }
        else if (arg == "-ml"   || arg == "--max-len")        { params.max_len        = std::stoi(argv[++i]); }
-        else if (arg == "-bo"   || arg == "--best-of")        { params.best_of        = std::stoi(argv[++i]); }
-        else if (arg == "-bs"   || arg == "--beam-size")      { params.beam_size      = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")     { params.word_thold     = std::stof(argv[++i]); }
-        else if (arg == "-et"   || arg == "--entropy-thold")  { params.entropy_thold  = std::stof(argv[++i]); }
-        else if (arg == "-lpt"  || arg == "--logprob-thold")  { params.logprob_thold  = std::stof(argv[++i]); }
        else if (arg == "-su"   || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"   || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-di"   || arg == "--diarize")        { params.diarize        = true; }
-        else if (arg == "-sow"  || arg == "--split-on-word")  { params.split_on_word  = true; }
-        else if (arg == "-nf"   || arg == "--no-fallback")    { params.no_fallback    = true; }
        else if (arg == "-otxt" || arg == "--output-txt")     { params.output_txt     = true; }
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
-        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
        else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
@ -152,38 +136,31 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
-    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
-    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
+    fprintf(stderr, "  -h,       --help           [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n",    params.n_threads);
+    fprintf(stderr, "  -p N,     --processors N   [%-7d] number of processors to use during computation\n", params.n_processors);
+    fprintf(stderr, "  -ot N,    --offset-t N     [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
+    fprintf(stderr, "  -on N,    --offset-n N     [%-7d] segment index offset\n",                           params.offset_n);
+    fprintf(stderr, "  -d  N,    --duration N     [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
+    fprintf(stderr, "  -mc N,    --max-context N  [%-7d] maximum number of text context tokens to store\n", params.max_context);
+    fprintf(stderr, "  -ml N,    --max-len N      [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -wt N,    --word-thold N   [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
+    fprintf(stderr, "  -di,      --diarize        [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -otxt,    --output-txt     [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
+    fprintf(stderr, "  -ovtt,    --output-vtt     [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
+    fprintf(stderr, "  -osrt,    --output-srt     [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
+    fprintf(stderr, "  -owts,    --output-words   [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
+    fprintf(stderr, "  -ocsv,    --output-csv     [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
+    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
+    fprintf(stderr, "  -pc,      --print-colors   [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    fprintf(stderr, "  -pp,      --print-progress [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
+    fprintf(stderr, "  -nt,      --no-timestamps  [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
+    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
+    fprintf(stderr, "            --prompt PROMPT  [%-7s] initial prompt\n",                                 params.prompt.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME    [%-7s] model path\n",                                     params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] input WAV file path\n",                            "");
    fprintf(stderr, "\n");
 }

@ -199,81 +176,90 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi

    const int n_segments = whisper_full_n_segments(ctx);

-    std::string speaker = "";
-
-    int64_t t0;
-    int64_t t1;
-
    // print the last n_new segments
    const int s0 = n_segments - n_new;
-
    if (s0 == 0) {
        printf("\n");
    }

    for (int i = s0; i < n_segments; i++) {
-        if (!params.no_timestamps || params.diarize) {
-            t0 = whisper_full_get_segment_t0(ctx, i);
-            t1 = whisper_full_get_segment_t1(ctx, i);
-        }
+        if (params.no_timestamps) {
+            // timestamps disabled: print only the segment text, without appending a newline
+            if (params.print_colors) {
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
+                    }

-        if (!params.no_timestamps) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-        }
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const float  p    = whisper_full_get_token_p   (ctx, i, j);

-        if (params.diarize && pcmf32s.size() == 2) {
-            const int64_t n_samples = pcmf32s[0].size();
+                    // map p^3 onto a color index; the upper clamp must be size() - 1 (the last valid
+                    // index), otherwise p == 1 would index one past the end of k_colors
+                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));

-            const int64_t is0 = timestamp_to_sample(t0, n_samples);
-            const int64_t is1 = timestamp_to_sample(t1, n_samples);
+                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+                }
+            } else {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+                printf("%s", text);
+            }
+            fflush(stdout);
+        } else {
+            // timestamps enabled: each segment is printed on its own line as "[t0 --> t1]  ..."
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

-            double energy0 = 0.0f;
-            double energy1 = 0.0f;
+            std::string speaker;

-            for (int64_t j = is0; j < is1; j++) {
-                energy0 += fabs(pcmf32s[0][j]);
-                energy1 += fabs(pcmf32s[1][j]);
-            }
+            if (params.diarize && pcmf32s.size() == 2) {
+                // label the segment with the channel whose summed |sample| over [is0, is1) is
+                // more than 10% larger than the other channel's; otherwise the speaker is unknown
+                const int64_t n_samples = pcmf32s[0].size();

-            if (energy0 > 1.1*energy1) {
-                speaker = "(speaker 0)";
-            } else if (energy1 > 1.1*energy0) {
-                speaker = "(speaker 1)";
-            } else {
-                speaker = "(speaker ?)";
-            }
+                const int64_t is0 = timestamp_to_sample(t0, n_samples);
+                const int64_t is1 = timestamp_to_sample(t1, n_samples);

-            //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
-        }
+                double energy0 = 0.0f;
+                double energy1 = 0.0f;

-        if (params.print_colors) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
+                for (int64_t j = is0; j < is1; j++) {
+                    energy0 += fabs(pcmf32s[0][j]);
+                    energy1 += fabs(pcmf32s[1][j]);
                }

-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
+                if (energy0 > 1.1*energy1) {
+                    speaker = "(speaker 0)";
+                } else if (energy1 > 1.1*energy0) {
+                    speaker = "(speaker 1)";
+                } else {
+                    speaker = "(speaker ?)";
+                }

-                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
+                //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
            }
-        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);

-            printf("%s%s", speaker.c_str(), text);
-        }
+            if (params.print_colors) {
+                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
+                    }

-        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
-            printf("\n");
-        }
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                    // clamp to size() - 1 (last valid index) so that p == 1 does not read past the
+                    // end of k_colors
+                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));

-        fflush(stdout);
+                    printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
+                }
+                printf("\n");
+            } else {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+
+                printf("[%s --> %s]  %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
+            }
+        }
    }
 }

@ -354,16 +340,20 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
+	if (text[0] == ' ')
+	  text = text + sizeof(char); //whisper_full_get_segment_text() returns a string with leading space, point to the next character.
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text    << "\"\n";
+	//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
+        fout << 10 * t0 << ", " 
+	     << 10 * t1 << ", \"" 
+	     << text    << "\"\n";
    }

    return true;
 }

+
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
@ -497,7 +487,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+    struct whisper_context * ctx = whisper_init(params.model.c_str());

    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
@ -522,14 +512,90 @@ int main(int argc, char ** argv) {

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];

-        std::vector<float> pcmf32;               // mono-channel F32 PCM
+        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
+        // WAV input
+        // Loads the input (a file, or stdin when the name is "-") and converts it to mono
+        // float PCM in pcmf32, plus per-channel float PCM in pcmf32s when diarizing.
+        // Returns from main() with a non-zero code when the input cannot be opened or its
+        // format is not supported.
+        {
+            drwav wav;
+            std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+            if (fname_inp == "-") {
+                {
+                    uint8_t buf[1024];
+                    while (true)
+                    {
+                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                        if (n == 0) {
+                            break;
+                        }
+                        wav_data.insert(wav_data.end(), buf, buf + n);
+                    }
+                }
+
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
+                    return 4;
+                }
+
+                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+            }
+            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
+                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
+                return 5;
+            }
+
+            if (wav.channels != 1 && wav.channels != 2) {
+                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
+                return 6;
+            }
+
+            // diarization de-interleaves two channels below, so mono input must be rejected
+            // regardless of the timestamp setting; otherwise pcm16 would be read out of bounds
+            if (params.diarize && wav.channels != 2) {
+                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
+                return 6;
+            }
+
+            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
+                return 8;
+            }
+
+            if (wav.bitsPerSample != 16) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
+                return 9;
+            }
+
+            // for stdin input the frame count is derived from the size of the buffered data
+            // instead of wav.totalPCMFrameCount
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+            // remember the channel count so that wav is not accessed after drwav_uninit()
+            const int n_channels = wav.channels;
+
+            std::vector<int16_t> pcm16;
+            pcm16.resize(n*n_channels);
+            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+            drwav_uninit(&wav);
+
+            // convert to mono, float
+            pcmf32.resize(n);
+            if (n_channels == 1) {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[i])/32768.0f;
+                }
+            } else {
+                // stereo: average the two interleaved channels
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+                }
+            }
+
+            if (params.diarize) {
+                // convert to stereo, float
+                // (n_channels == 2 is guaranteed here by the diarization check above)
+                pcmf32s.resize(2);
+
+                pcmf32s[0].resize(n);
+                pcmf32s[1].resize(n);
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+                }
+            }
         }

        // print system information
@ -563,8 +629,6 @@ int main(int argc, char ** argv) {
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
            wparams.print_realtime   = false;
            wparams.print_progress   = params.print_progress;
            wparams.print_timestamps = !params.no_timestamps;
@ -579,19 +643,11 @@ int main(int argc, char ** argv) {
            wparams.token_timestamps = params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-            wparams.split_on_word    = params.split_on_word;

            wparams.speed_up         = params.speed_up;

-            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
+            wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();

            whisper_print_user_data user_data = { &params, &pcmf32s };

@ -626,33 +682,34 @@ int main(int argc, char ** argv) {

            // output to text file
            if (params.output_txt) {
-                const auto fname_txt = fname_out + ".txt";
+                const auto fname_txt = fname_inp + ".txt";
                output_txt(ctx, fname_txt.c_str());
            }

            // output to VTT file
            if (params.output_vtt) {
-                const auto fname_vtt = fname_out + ".vtt";
+                const auto fname_vtt = fname_inp + ".vtt";
                output_vtt(ctx, fname_vtt.c_str());
            }

            // output to SRT file
            if (params.output_srt) {
-                const auto fname_srt = fname_out + ".srt";
+                const auto fname_srt = fname_inp + ".srt";
                output_srt(ctx, fname_srt.c_str(), params);
            }

            // output to WTS file
            if (params.output_wts) {
-                const auto fname_wts = fname_out + ".wts";
+                const auto fname_wts = fname_inp + ".wts";
                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }

-            // output to CSV file
+	    // output to CSV file
            if (params.output_csv) {
-                const auto fname_csv = fname_out + ".csv";
+                const auto fname_csv = fname_inp + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }
+
        }
    }

--- a/examples/whisper/whisper.cpp
+++ b/examples/whisper/whisper.cpp
--- a/examples/whisper/whisper.h
+++ b/examples/whisper/whisper.h
@ -1,7 +1,6 @@
 #ifndef WHISPER_H
 #define WHISPER_H

-#include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>

@ -41,7 +40,7 @@ extern "C" {
    //
    //     ...
    //
-    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+    //     struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
    //
    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
    //         fprintf(stderr, "failed to process audio\n");
@ -74,7 +73,6 @@ extern "C" {
        whisper_token tid; // forced timestamp token id

        float p;           // probability of the token
-        float plog;        // log probability of the token
        float pt;          // probability of the timestamp token
        float ptsum;       // sum of probabilities of all timestamp tokens

@ -86,20 +84,9 @@ extern "C" {
        float vlen;        // voice length of the token
    } whisper_token_data;

-    typedef struct whisper_model_loader {
-        void * context;
-
-        size_t (*read)(void * ctx, void * output, size_t read_size);
-        bool    (*eof)(void * ctx);
-        void  (*close)(void * ctx);
-    } whisper_model_loader;
-
-    // Various functions for loading a ggml whisper model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
+    // Allocates all memory needed for the model and loads the model from the given file.
+    // Returns NULL on failure.
+    WHISPER_API struct whisper_context * whisper_init(const char * path_model);

    // Frees all memory allocated by the model.
    WHISPER_API void whisper_free(struct whisper_context * ctx);
@ -113,16 +100,6 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);

-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. 
-    // The resulting spectrogram is stored inside the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context* ctx,
-        const float* samples,
-        int   n_samples,
-        int   n_threads);
-
-
    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
@ -147,7 +124,6 @@ extern "C" {
    // tokens + n_tokens is the provided context for the decoder.
    // n_past is the number of tokens to use from previous decoder calls.
    // Returns 0 on success
-    // TODO: add support for multiple decoders
    WHISPER_API int whisper_decode(
            struct whisper_context * ctx,
               const whisper_token * tokens,
@ -155,6 +131,14 @@ extern "C" {
                               int   n_past,
                               int   n_threads);

+    // Token sampling methods.
+    // These are provided for convenience and can be used after each call to whisper_decode().
+    // You can also implement your own sampling method using the whisper_get_probs() function.
+    // whisper_sample_best() returns the token with the highest probability
+    // whisper_sample_timestamp() returns the most probable timestamp token
+    WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
+    WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
+
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -164,7 +148,7 @@ extern "C" {
            struct whisper_context * ctx,
                        const char * text,
                     whisper_token * tokens,
-                               int   n_max_tokens);
+	                           int   n_max_tokens);

    // Largest language id (i.e. number of available languages - 1)
    WHISPER_API int whisper_lang_max_id();
@ -196,11 +180,8 @@ extern "C" {
    WHISPER_API int whisper_n_audio_ctx    (struct whisper_context * ctx);
    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);

-    // Token logits obtained from the last call to whisper_decode()
-    // The logits for the last token are stored in the last row
-    // Rows: n_tokens
-    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
+    // The probabilities for the next token
+    WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
@ -229,8 +210,8 @@ extern "C" {

    // Available sampling strategies
    enum whisper_sampling_strategy {
-        WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreefyDecoder
-        WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
+        WHISPER_SAMPLING_GREEDY,      // Always select the most probable token
+        WHISPER_SAMPLING_BEAM_SEARCH, // TODO: not implemented yet!
    };

    // Text segment callback
@ -243,16 +224,6 @@ extern "C" {
    // If it returns false, the computation is aborted
    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

-    // Logits filter callback
-    // Can be used to modify the logits before sampling
-    // If not NULL, called after applying temperature to logits
-    typedef void (*whisper_logits_filter_callback)(
-            struct whisper_context * ctx,
-          const whisper_token_data * tokens,
-                               int   n_tokens,
-                             float * logits,
-                              void * user_data);
-
    // Parameters for the whisper_full() function
    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
    // whisper_full_default_params()
@ -260,32 +231,30 @@ extern "C" {
        enum whisper_sampling_strategy strategy;

        int n_threads;
-        int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
+        int n_max_text_ctx;
        int offset_ms;          // start offset in ms
        int duration_ms;        // audio duration to process in ms

        bool translate;
-        bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
+        bool no_context;
        bool single_segment;    // force single segment output (useful for streaming)
-        bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
-        bool print_progress;    // print progress information
-        bool print_realtime;    // print results from within whisper.cpp (avoid it, use callback instead)
-        bool print_timestamps;  // print timestamps for each text segment when printing realtime
+        bool print_special;
+        bool print_progress;
+        bool print_realtime;
+        bool print_timestamps;

        // [EXPERIMENTAL] token-level timestamps
        bool  token_timestamps; // enable token-level timestamps
        float thold_pt;         // timestamp token probability threshold (~0.01)
        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
        int   max_len;          // max segment length in characters
-        bool  split_on_word;    // split on word rather than on token (when used with max_len)
        int   max_tokens;       // max tokens per segment (0 = no limit)

        // [EXPERIMENTAL] speed-up techniques
-        // note: these can significantly reduce the quality of the output
        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
        int  audio_ctx;         // overwrite the audio context size (0 = use default)

-        // tokens to provide to the whisper decoder as initial prompt
+        // tokens to provide to the whisper model as initial prompt
        // these are prepended to any existing text context from a previous call
        const whisper_token * prompt_tokens;
        int prompt_n_tokens;
@ -293,42 +262,21 @@ extern "C" {
        // for auto-detection, set to nullptr, "" or "auto"
        const char * language;

-        // common decoding parameters:
-        bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
-        bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-
-        float temperature;      // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
-        float max_initial_ts;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
-        float length_penalty;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
-
-        // fallback parameters
-        // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
-        float temperature_inc;
-        float entropy_thold;    // similar to OpenAI's "compression_ratio_threshold"
-        float logprob_thold;
-        float no_speech_thold;  // TODO: not implemented
-
        struct {
-            int best_of;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
+            int n_past;
        } greedy;

        struct {
-            int beam_size;  // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
-
-            float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
+            int n_past;
+            int beam_width;
+            int n_best;
        } beam_search;

-        // called for every newly generated text segment
        whisper_new_segment_callback new_segment_callback;
        void * new_segment_callback_user_data;

-        // called each time before the encoder starts
        whisper_encoder_begin_callback encoder_begin_callback;
        void * encoder_begin_callback_user_data;
-
-        // called by each decoder to filter obtained logits
-        whisper_logits_filter_callback logits_filter_callback;
-        void * logits_filter_callback_user_data;
    };

    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@ -355,9 +303,6 @@ extern "C" {
    // A segment can be a few words, a sentence, or even a paragraph.
    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);

-    // Language id associated with the current context
-    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
-
    // Get the start and end time of the specified segment.
    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
@ -379,13 +324,6 @@ extern "C" {
    // Get the probability of the specified token in the specified segment.
    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

-    ////////////////////////////////////////////////////////////////////////////
-
-    // Temporary helpers needed for exposing ggml interface
-
-    WHISPER_API int whisper_bench_memcpy(int n_threads);
-    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
-
 #ifdef __cplusplus
 }
 #endif
--- a/include/ggml/ggml.h
+++ b/include/ggml/ggml.h
@ -301,13 +301,6 @@ struct ggml_cgraph {
    int64_t perf_time_us;
 };

-// scratch buffer
-struct ggml_scratch {
-    size_t offs;
-    size_t size;
-    void * data;
-};
-
 struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
@ -334,8 +327,6 @@ void ggml_free(struct ggml_context * ctx);

 size_t ggml_used_mem(const struct ggml_context * ctx);

-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
-
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
@ -740,8 +731,6 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
-int ggml_cpu_has_sse3(void);
-int ggml_cpu_has_vsx(void);

 #ifdef  __cplusplus
 }
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -9,7 +9,6 @@ if (GGML_ALL_WARNINGS)
            -Wcast-qual                     \
            -Wstrict-prototypes             \
            -Wpointer-arith                 \
-            -Wno-unused-function            \
        ")
    else()
        # todo : windows
@ -18,101 +17,17 @@ endif()

 # compiler flags

-if (NOT MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
-endif()
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")

 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")

-if (NOT UNAME_S)
-    execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S)
-endif()
-if (NOT UNAME_P)
-    execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P)
-endif()
-if (NOT UNAME_M)
-    execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
-endif()
-message(STATUS "UNAME_S: ${UNAME_S}  UNAME_P: ${UNAME_P}  UNAME_M: ${UNAME_M}")
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-if (UNAME_S MATCHES "Darwin")
-    if (NOT UNAME_P MATCHES "arm")
-        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
-	if (SYSCTL_M MATCHES "1")
-            #set(UNAME_P "arm")
-            #set(UNAME_M "arm64")
-	    message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789")
-	endif()
-    endif()
-endif()
-
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
    message(STATUS "ARM detected")
    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
 else()
    message(STATUS "x86 detected")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
-    if (UNAME_S MATCHES "Darwin")
-        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "AVX1.0")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-	execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "AVX2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-	if (AVX1_M MATCHES "FMA")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-elseif (UNAME_S MATCHES "Linux")
-        message(STATUS "Linux detected")
-	execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "avx")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-	execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "avx2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-	execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
-	if (FMA_M MATCHES "fma")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-	execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
-	if (F16C_M MATCHES "f16c")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
-	execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
-	if (SSE3_M MATCHES "sse3")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
-        endif()
-	message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
-elseif (UNAME_S MATCHES "Haiku")
-	message(STATUS "Haiku detected")
-	execute_process(COMMAND sysinfo -cpu | grep "AVX " OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "avx")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-	execute_process(COMMAND sysinfo -cpu | grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "avx2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-	execute_process(COMMAND sysinfo -cpu | grep "FMA " OUTPUT_VARIABLE FMA_M)
-	if (FMA_M MATCHES "fma")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-	execute_process(COMMAND sysinfo -cpu | grep "F16C " OUTPUT_VARIABLE F16C_M)
-	if (F16C_M MATCHES "f16c")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
-	message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
-    else()
-        set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
-    endif()
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
 endif()


@ -147,11 +62,7 @@ target_include_directories(${TARGET} PUBLIC
    ../include/ggml
    )

-if (MSVC)
-    target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-else()
-    target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-endif()
+target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

 if (BUILD_SHARED_LIBS)
    target_link_libraries(${TARGET} PUBLIC
--- a/src/ggml.c
+++ b/src/ggml.c
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -65,14 +65,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 endif()

-#
-# test-mul-mat2
-
-set(TEST_TARGET test-mul-mat2)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-
 #
 # test0

@ -104,15 +96,3 @@ set(TEST_TARGET test3)
 add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-
-#
-# test-svd0 (arm)
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
-    set(TEST_TARGET test-svd0)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
-    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
-    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-endif()
-
--- a/tests/test-mul-mat1.c
+++ b/tests/test-mul-mat1.c
@ -13,20 +13,14 @@
 #include <Accelerate/Accelerate.h>

 const int M = 1280;
-const int N = 1536;
+const int N = 1500;
 const int K = 1280;

-uint64_t get_time_us() {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
 //
 // naive implementation
 //

-void mul_mat_f32_0(
+void mul_mat_vec_f32_0(
    const float * restrict src0, // M x K
    const float * restrict src1, // N x K (transposed)
    float * dst,
@ -42,7 +36,7 @@ void mul_mat_f32_0(
    }
 }

-void mul_mat_f16_0(
+void mul_mat_vec_f16_0(
    const __fp16 * src0,
    const __fp16 * src1,
           float * dst,
@ -108,7 +102,7 @@ void mul_mat_f16_0(
 }

 // blocking with block size 32
-void mul_mat_f16_1(
+void mul_mat_vec_f16_1(
    const __fp16 * src0,
    const __fp16 * src1,
           float * dst,
@ -180,7 +174,7 @@ void mul_mat_f16_1(

 }

-void mul_mat_f8_0(
+void mul_mat_vec_f8_0(
    const uint8_t * src0,
    const uint8_t * src1,
           float * dst,
@ -212,6 +206,12 @@ void mul_mat_f8_0(
    }
 }

+uint64_t get_time_us() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
 int main(int argc, const char ** argv) {
    float * src0 = (float *)malloc(sizeof(float)*M*K);
    float * src1 = (float *)malloc(sizeof(float)*N*K);
@ -258,7 +258,7 @@ int main(int argc, const char ** argv) {
        method = atoi(argv[1]);
    }

-    const int nIter = 1;
+    const int nIter = 10000;

    const clock_t start = clock();
    const uint64_t start_us = get_time_us();
@ -267,24 +267,24 @@ int main(int argc, const char ** argv) {
    double sum = 0.0f;
    for (int i = 0; i < nIter; i++) {
        if (method == 0) {
-            mul_mat_f32_0(src0, src1, dst, M, N, K);
+            mul_mat_vec_f32_0(src0, src1, dst, M, N, K);
        }

        if (method == 1) {
-            mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
+            mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
        }

        if (method == 2) {
-            mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
+            mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
        }

        if (method == 3) {
-            mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
+            mul_mat_vec_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
        }

        if (method == 4) {
            // Use BLAS sgemm from Accelerate framework
-            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0f, src0, K, src1, K, 0.0f, dst, N);
+            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, K, N, 1.0f, src0, N, src1, N, 0.0f, dst, N);
        }
    }

--- a/tests/test-mul-mat2.c
+++ b/tests/test-mul-mat2.c
@ -1,475 +0,0 @@
-// quantized matrix multiplication
-
-#include "ggml.h"
-
-#include <float.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#ifdef __ARM_NEON
-#include "arm_neon.h"
-#endif
-
-#ifndef MIN
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-const int M = 1280;
-const int N = 1536;
-const int K = 1280;
-
-const int QK = 64;
-#define QB 7
-
-//#define GGML_GQ_USE_FP16_SCALE
-
-#if defined(GGML_GQ_USE_FP16_SCALE)
-#define gq_scale_t ggml_fp16_t
-#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
-#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
-#else
-#define gq_scale_t float
-#define GGML_FP32_TO_GQ(x) (x)
-#define GGML_GQ_TO_FP32(x) (x)
-#endif
-
-#define gq_quant_t uint64_t
-#define gq_t_bits 64
-
-uint64_t get_time_us() {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
-//
-// naive implementation
-//
-
-void mul_mat_f32_naive(
-    const float * restrict src0, // M x K
-    const float * restrict src1, // N x K (transposed)
-    float * dst,
-    int m, int n, int k) {
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            float sum = 0;
-            for (int l = 0; l < k; l++) {
-                sum += src0[i*k + l] * src1[j*k + l];
-            }
-            dst[i*n + j] = sum;
-        }
-    }
-}
-
-//
-// method 1
-//
-
-void quantize_1(const float * src, void * dst, int n, int k) {
-    char * p0 = dst;
-
-    gq_quant_t pp[QB];
-
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < k/QK; i++) {
-            float min = FLT_MAX;
-            float max = -FLT_MAX;
-
-            // find min/max
-#ifdef __ARM_NEON
-            {
-                float32x4_t minv = vdupq_n_f32(FLT_MAX);
-                float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
-
-                for (int l = 0; l < QK; l += 4) {
-                    float32x4_t v = vld1q_f32(src + j*k + i*QK + l);
-                    minv = vminq_f32(minv, v);
-                    maxv = vmaxq_f32(maxv, v);
-                }
-
-                float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
-                float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
-
-                min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
-                max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
-
-                //printf("SIMD min/max: %f %f\n", min, max);
-            }
-#else
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j*k + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                //printf("NORM min/max: %f %f\n", min, max);
-            }
-#endif
-
-            const float d = (max - min) / ((1 << QB) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            memcpy(p0, &min, sizeof(float)); p0 += sizeof(float);
-            memcpy(p0, &d,   sizeof(float)); p0 += sizeof(float);
-
-            //printf("min/max/d/id: %f %f %f %f\n", min, max, d, id);
-
-            for (int s = 0; s < QK/gq_t_bits; ++s) {
-                memset(pp, 0, sizeof(pp));
-
-                for (int l = 0; l < gq_t_bits; l++) {
-                    const   float v = src[j*k + i*QK + s*gq_t_bits + l];
-                    const uint8_t q = (v - min)*id;
-
-                    for (int b = 0; b < QB; b++) {
-                        pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
-                    }
-                }
-
-                for (int b = 0; b < QB; b++) {
-                    memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t);
-                }
-            }
-        }
-    }
-}
-
-void mul_mat_gq_1(
-    const void * src0,
-    const void * src1,
-         float * dst,
-    int m, int n, int k) {
-    const int kp = k & ~(gq_t_bits - 1);
-
-    const char * restrict p0 = src0;
-    const char * restrict p1 = src1;
-
-    float s0[QB + 1];
-    float s1[QB + 1];
-
-    gq_quant_t m0[QB + 1];
-    gq_quant_t m1[QB + 1];
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            float sumf = 0.0;
-
-            const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
-            const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
-
-            for (int i = 0; i < kp/QK; i++) {
-                float min0, d0;
-                memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float);
-                memcpy(&d0,   pp0, sizeof(float)); pp0 += sizeof(float);
-
-                float min1, d1;
-                memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float);
-                memcpy(&d1,   pp1, sizeof(float)); pp1 += sizeof(float);
-
-                //printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1);
-
-#if 1
-                // >>> General case for any QB
-
-                s0[0] = min0;
-                s1[0] = min1;
-
-                for (int b = 0; b < QB; b++) {
-                    s0[b + 1] = d0*(1 << b);
-                    s1[b + 1] = d1*(1 << b);
-                }
-
-                m0[0] = -1ULL;
-                m1[0] = -1ULL;
-
-                for (int s = 0; s < QK/gq_t_bits; ++s) {
-                    for (int b = 0; b < QB; b++) {
-                        memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t);
-                        memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t);
-                    }
-
-                    for (int q0 = 0; q0 < QB + 1; q0++) {
-                        for (int q1 = 0; q1 < QB + 1; q1++) {
-                            sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]);
-                        }
-                    }
-                }
-#else
-#endif
-            }
-
-            dst[ir0*n + ir1] = sumf;
-        }
-    }
-}
-
-//
-// method 2
-//
-
-static inline int quantize_2_blocks_per_row(int k) {
-    return k/QK;
-}
-
-static inline int quantize_2_quants_per_block() {
-    return QK/gq_t_bits;
-}
-
-static inline int quantize_2_row_size(int k) {
-    const int nb = quantize_2_blocks_per_row(k);
-    const int nq = quantize_2_quants_per_block();
-
-    return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
-}
-
-void quantize_2_row(const float * restrict src, void * restrict dst, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_2_blocks_per_row(k);
-    const int nq = quantize_2_quants_per_block();
-
-    gq_scale_t * restrict pm = (gq_scale_t *) (dst);
-    gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
-    gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
-
-    gq_quant_t pp[QB];
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < QK; l++) {
-            const float v = src[i*QK + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << QB) - 1);
-        const float id = d ? 1.0/d : 0.0;
-
-        pm[i] = GGML_FP32_TO_GQ(min);
-        pd[i] = GGML_FP32_TO_GQ(d);
-
-        for (int s = 0; s < nq; ++s) {
-            memset(pp, 0, sizeof(pp));
-
-            for (int l = 0; l < gq_t_bits; l++) {
-                const   float v = src[i*QK + s*gq_t_bits + l];
-                const uint8_t q = (v - min)*id;
-
-                for (int b = 0; b < QB; b++) {
-                    pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
-                }
-            }
-
-            for (int b = 0; b < QB; b++) {
-                pb[i*nq*QB + s*QB + b] = pp[b];
-            }
-        }
-    }
-}
-
-// reimplementation of quantize_2 using quantize_2_row
-void quantize_2(const float * restrict src, char * restrict dst, int n, int k) {
-    assert(k % QK == 0);
-
-    for (int j = 0; j < n; j++) {
-        quantize_2_row(src + j*k, dst, k);
-        dst = (char *) dst + quantize_2_row_size(k);
-    }
-}
-
-void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    float sumf[(QB + 1)*(QB + 1)];
-    memset(sumf, 0, sizeof(sumf));
-
-    const int nb = quantize_2_blocks_per_row(n);
-    const int nq = quantize_2_quants_per_block();
-
-    const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
-    const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
-
-    const gq_scale_t * restrict pd0 = pm0 + nb;
-    const gq_scale_t * restrict pd1 = pm1 + nb;
-
-    const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
-    const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
-
-#if 1
-    float s0[QB + 1];
-    float s1[QB + 1];
-
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        s0[0] = m0;
-        s1[0] = m1;
-
-        for (int b = 0; b < QB; b++) {
-            s0[b + 1] = d0*(1 << b);
-            s1[b + 1] = d1*(1 << b);
-        }
-
-        for (int s = 0; s < nq; ++s) {
-            for (int q0 = 0; q0 < QB + 1; q0++) {
-                const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL;
-                for (int q1 = 0; q1 < QB + 1; q1++) {
-                    const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL;
-                    sumf[q0*(QB + 1) + q1] += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1);
-                }
-            }
-        }
-    }
-#else
-    // SIMD-ify with the assumptions:
-    // - nb is a multiple of 4
-    // - gq_scale_t is float
-    // - gq_quant_t is uint64_t
-    // - QB == 7
-    assert(nb % 4 == 0);
-
-#ifdef __ARM_NEON
-#else
-    // TODO
-#endif
-
-#endif
-
-    for (int q0 = 0; q0 < QB + 1; q0++) {
-        for (int q1 = 1; q1 < QB + 1; q1++) {
-            sumf[q0*(QB + 1)] += sumf[q0*(QB + 1) + q1];
-        }
-    }
-
-    *s = sumf[0];
-    for (int q0 = 1; q0 < QB + 1; q0++) {
-        *s += sumf[q0*(QB + 1)];
-    }
-}
-
-// use vec_dot_gq_2 to compute the dot product of two rows
-void mul_mat_gq_2(
-    const void * src0,
-    const void * src1, // transposed
-         float * dst,
-    int m, int n, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_2_blocks_per_row(k);
-    const int nq = quantize_2_quants_per_block();
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            vec_dot_gq_2(k, dst + ir1, src0, src1);
-            src1 = (const char *) src1 + quantize_2_row_size(k);
-        }
-        src0 = (const char *) src0 +   quantize_2_row_size(k);
-        src1 = (const char *) src1 - n*quantize_2_row_size(k);
-
-        dst = (float *) dst + n;
-    }
-}
-
-int main(int argc, const char ** argv) {
-    assert(sizeof(gq_quant_t)*8 == gq_t_bits);
-
-    float * src0 = (float *)malloc(sizeof(float)*M*K);
-    float * src1 = (float *)malloc(sizeof(float)*N*K);
-    float * dst  = (float *)malloc(sizeof(float)*M*N);
-
-    for (int i = 0; i < M*K; i++) {
-        src0[i] = rand() / (float)RAND_MAX;
-    }
-
-    for (int i = 0; i < N*K; i++) {
-        src1[i] = rand() / (float)RAND_MAX;
-    }
-
-    void * src0_gq = calloc(1, quantize_2_row_size(K)*M);
-    void * src1_gq = calloc(1, quantize_2_row_size(K)*N);
-
-    const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K;
-    const size_t sizegq  = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N;
-
-    printf("compression: %f\n", (float)sizegq/sizef16);
-
-    int method = 0;
-    if (argc > 1) {
-        method = atoi(argv[1]);
-    }
-
-    // convert fp32 -> gq
-    {
-        const uint64_t t_start = get_time_us();
-
-        if (method == 1) {
-            quantize_1(src0, src0_gq, M, K);
-            quantize_1(src1, src1_gq, N, K);
-        }
-
-        if (method == 2) {
-            quantize_2(src0, src0_gq, M, K);
-            quantize_2(src1, src1_gq, N, K);
-        }
-
-        const uint64_t t_end = get_time_us();
-        printf("convert time: %f ms / method = %d\n", (t_end - t_start) / 1000.0, method);
-    }
-
-    const int nIter = 1;
-
-    const clock_t start = clock();
-    const uint64_t start_us = get_time_us();
-
-    double iM = 1.0/M;
-    double sum = 0.0f;
-    for (int i = 0; i < nIter; i++) {
-        if (method == 0) {
-            mul_mat_f32_naive(src0, src1, dst, M, N, K);
-        }
-
-        if (method == 1) {
-            mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K);
-        }
-
-        if (method == 2) {
-            mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K);
-        }
-    }
-
-    for (int i = 0; i < N; i++) {
-        sum += dst[i]*iM;
-    }
-
-    {
-        const clock_t end = clock();
-        const uint64_t end_us = get_time_us();
-        printf("%s: elapsed ticks: %ld\n",  __func__, end - start);
-        printf("%s: elapsed us:    %d / %f ms\n",  __func__, (int)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter);
-    }
-
-    printf("%f\n", sum);
-
-    free(src0);
-    free(src1);
-    free(dst);
-
-    free(src0_gq);
-    free(src1_gq);
-
-    return 0;
-}
--- a/tests/test-svd0.c
+++ b/tests/test-svd0.c
@ -1,218 +0,0 @@
-// SVD dimensionality reduction
-
-#include <float.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#ifdef GGML_USE_ACCELERATE
-#include <Accelerate/Accelerate.h>
-#endif
-
-float frand() {
-    return (float) rand() / (float) RAND_MAX;
-}
-
-//int sgesvd_(char *__jobu, char *__jobvt, __CLPK_integer *__m,
-//        __CLPK_integer *__n, __CLPK_real *__a, __CLPK_integer *__lda,
-//        __CLPK_real *__s, __CLPK_real *__u, __CLPK_integer *__ldu,
-//        __CLPK_real *__vt, __CLPK_integer *__ldvt, __CLPK_real *__work,
-//        __CLPK_integer *__lwork,
-//        __CLPK_integer *__info)
-
-int main(int argc, const char ** argv) {
-    int m = 10;
-    int n = 5;
-
-    float * A  = (float *) malloc(n * m * sizeof(float));
-    float * A0 = (float *) malloc(n * m * sizeof(float));
-
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < m; ++j) {
-            A[i * m + j] = (float) (10.0f*(i + 1) + 1.0f * frand());
-            //A[i * m + j] = (float) (10.0f*(i%2 + 1) + 0.1f * frand());
-            //if (i == 2) {
-            //    A[i * m + j] += 20*frand();
-            //}
-            if ((i == 1 || i == 3) && j > m/2) {
-                A[i * m + j] = -A[i * m + j];
-            }
-        }
-    }
-
-    // average vector
-    //float * M = (float *) malloc(m * sizeof(float));
-
-    //{
-    //    for (int j = 0; j < m; ++j) {
-    //        M[j] = 0.0f;
-    //    }
-    //    for (int i = 0; i < n; ++i) {
-    //        for (int j = 0; j < m; ++j) {
-    //            M[j] += A[i * m + j];
-    //        }
-    //    }
-    //    for (int j = 0; j < m; ++j) {
-    //        M[j] /= (float) n;
-    //    }
-    //}
-
-    //// subtract average vector
-    //for (int i = 0; i < n; ++i) {
-    //    for (int j = 0; j < m; ++j) {
-    //        A[i * m + j] -= M[j];
-    //    }
-    //}
-
-    memcpy(A0, A, n * m * sizeof(float));
-
-    // print A
-    printf("A:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", A[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // SVD
-    // A = U * S * V^T
-
-    float * U = (float *) malloc(n * m * sizeof(float));
-    float * S = (float *) malloc(n * sizeof(float));
-    float * V = (float *) malloc(n * n * sizeof(float));
-
-    int lda = m;
-    int ldu = m;
-    int ldvt = n;
-
-    float work_size;
-    int lwork = -1;
-    int info = 0;
-
-    sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info);
-
-    lwork = (int) work_size;
-
-    printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork);
-
-    float * work = (float *) malloc(lwork * sizeof(float));
-
-    sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info);
-
-    // print U
-    printf("U:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", U[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // normalize S
-    {
-        double sum = 0.0;
-        for (int i = 0; i < n; ++i) {
-            sum += S[i];
-        }
-        sum *= sqrt((double) m);
-        for (int i = 0; i < n; ++i) {
-            S[i] /= sum;
-        }
-    }
-
-    // print S
-    printf("S:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("- %d = %9.5f\n", i, S[i]);
-    }
-    printf("\n");
-
-    // print V
-    printf("V:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < n; ++j) {
-            printf("%9.5f ", V[i * n + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // print A
-    printf("A:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", A[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // compute singular vectors in U
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < m; ++j) {
-            U[i * m + j] *= S[i];
-        }
-    }
-
-    // normalize U
-    for (int i = 0; i < n; ++i) {
-        double sum = 0.0;
-        for (int j = 0; j < m; ++j) {
-            sum += U[i * m + j] * U[i * m + j];
-        }
-        sum = sqrt(sum);
-        for (int j = 0; j < m; ++j) {
-            U[i * m + j] /= sum*sqrt((double) m);
-        }
-    }
-
-    // print U
-    printf("U:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", U[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-
-    // project A0 onto U
-    float * A1 = (float *) malloc(n * n * sizeof(float));
-
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < n; ++j) {
-            A1[i * n + j] = 0.0f;
-            for (int k = 0; k < m; ++k) {
-                A1[i * n + j] += A0[i * m + k] * U[j * m + k];
-            }
-        }
-    }
-
-    // print A1
-    printf("A1:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < n; ++j) {
-            printf("%9.5f ", A1[i * n + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    return 0;
-}
--- a/tests/test2.c
+++ b/tests/test2.c
@ -96,8 +96,8 @@ int main(int argc, const char ** argv) {
        enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);

        assert(res == GGML_OPT_OK);
-        assert(is_close(ggml_get_f32_1d(t0, 0),  5.0f, 1e-2f));
-        assert(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-2f));
+        assert(is_close(ggml_get_f32_1d(t0, 0),  5.0f, 1e-3f));
+        assert(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-3f));
    }

    {
Author	SHA1	Message	Date
Georgi Gerganov	1d38a69d7c	t5 : initial load in ggml	3 years ago
Georgi Gerganov	a0f92eff2d	t5 : initial ggml conversion of the model	3 years ago
Georgi Gerganov	ed683187cb	t5 : add example for text-to-text transfer transformer inference	3 years ago