Initial release

3 years ago · b0a11594ae
commit b0a11594ae
10 changed files with 16208 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 sync.sh
 main
 *.o
--- a/109
+++ b/109
@ -0,0 +1,109 @@
 # Link the final executable from the two object files.
 main: ggml.o main.o
 	g++ -o $@ $^
 # Compile the C tensor library with AVX, AVX2, FMA and F16C code generation enabled.
 ggml.o: ggml.c ggml.h
 	gcc -O3 -mavx -mavx2 -mfma -mf16c -c $<
 # Compile the C++ driver program.
 main.o: main.cpp ggml.h
 	g++ -O3 -std=c++11 -c $<
 # clean up the directory
 # (declared phony so that a stray file named "clean" cannot disable the target)
 .PHONY: clean
 clean:
 	rm -f *.o main
 # run the program
 # (declared phony so that a stray file named "run" cannot disable the target)
 .PHONY: run
 run: main
 	./main
 # Download three sample recordings from upload.wikimedia.org into "./samples"
 # (saved as gb0.ogg, gb1.ogg and hp0.ogg), then convert each one with ffmpeg
 # to a 16 kHz, mono, 16-bit PCM WAV file with the same base name.
 # Needs wget and ffmpeg on the PATH. Existing files are overwritten
 # (wget -O, ffmpeg -y), so every invocation downloads and converts again.
 .PHONY: samples
 samples:
 	@echo "Downloading samples..."
 	mkdir -p samples
 	@wget --quiet --show-progress -O samples/gb0.ogg https://upload.wikimedia.org/wikipedia/commons/2/22/George_W._Bush%27s_weekly_radio_address_%28November_1%2C_2008%29.oga
 	@wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
 	@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
 	@echo "Converting to 16-bit WAV ..."
 	@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
 	@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
 	@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
 # Download the tiny.en model into ./models (skipped when the file already
 # exists) and run ./main on every WAV file found in ./samples.
 # The model name is taken from the target name ($@) everywhere, so the
 # messages can no longer disagree with the model that is actually used
 # (the per-file banner used to say "base.en").
 .PHONY: tiny.en
 tiny.en: main
 	@echo "Downloading $@ (75 MB just once)"
 	mkdir -p models
 	@if [ ! -f models/ggml-$@.bin ]; then \
 		wget --quiet --show-progress -O models/ggml-$@.bin https://ggml.ggerganov.com/ggml-model-whisper-$@.bin ; \
 	fi
 	@echo "==============================================="
 	@echo "Running $@ on all samples in ./samples ..."
 	@echo "==============================================="
 	@echo ""
 	@for f in samples/*.wav; do \
 		echo "----------------------------------------------" ; \
 		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
 		echo "----------------------------------------------" ; \
 		echo "" ; \
 		./main -m models/ggml-$@.bin -f $$f ; \
 		echo "" ; \
 	done
 # Fetch the base.en model into ./models on first use, then run ./main on
 # every WAV file found in ./samples. $@ expands to the target name "base.en".
 .PHONY: base.en
 base.en: main
 	@echo "Downloading $@ (142 MB just once)"
 	mkdir -p models
 	@if [ ! -f models/ggml-$@.bin ]; then \
 		wget --quiet --show-progress -O models/ggml-$@.bin https://ggml.ggerganov.com/ggml-model-whisper-$@.bin ; \
 	fi
 	@echo "==============================================="
 	@echo "Running $@ on all samples in ./samples ..."
 	@echo "==============================================="
 	@echo ""
 	@for f in samples/*.wav; do \
 		echo "----------------------------------------------" ; \
 		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
 		echo "----------------------------------------------" ; \
 		echo "" ; \
 		./main -m models/ggml-$@.bin -f $$f ; \
 		echo "" ; \
 	done
 # Download the small.en model into ./models (skipped when the file already
 # exists) and run ./main on every WAV file found in ./samples.
 # The model name is taken from the target name ($@) everywhere, so the
 # messages can no longer disagree with the model that is actually used
 # (the per-file banner used to say "base.en").
 .PHONY: small.en
 small.en: main
 	@echo "Downloading $@ (466 MB just once)"
 	mkdir -p models
 	@if [ ! -f models/ggml-$@.bin ]; then \
 		wget --quiet --show-progress -O models/ggml-$@.bin https://ggml.ggerganov.com/ggml-model-whisper-$@.bin ; \
 	fi
 	@echo "==============================================="
 	@echo "Running $@ on all samples in ./samples ..."
 	@echo "==============================================="
 	@echo ""
 	@for f in samples/*.wav; do \
 		echo "----------------------------------------------" ; \
 		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
 		echo "----------------------------------------------" ; \
 		echo "" ; \
 		./main -m models/ggml-$@.bin -f $$f ; \
 		echo "" ; \
 	done
 # Download the medium.en model into ./models (skipped when the file already
 # exists) and run ./main on every WAV file found in ./samples.
 # The model name is taken from the target name ($@) everywhere, so the
 # messages can no longer disagree with the model that is actually used
 # (the per-file banner used to say "base.en").
 .PHONY: medium.en
 medium.en: main
 	@echo "Downloading $@ (1.5 GB just once)"
 	mkdir -p models
 	@if [ ! -f models/ggml-$@.bin ]; then \
 		wget --quiet --show-progress -O models/ggml-$@.bin https://ggml.ggerganov.com/ggml-model-whisper-$@.bin ; \
 	fi
 	@echo "==============================================="
 	@echo "Running $@ on all samples in ./samples ..."
 	@echo "==============================================="
 	@echo ""
 	@for f in samples/*.wav; do \
 		echo "----------------------------------------------" ; \
 		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
 		echo "----------------------------------------------" ; \
 		echo "" ; \
 		./main -m models/ggml-$@.bin -f $$f ; \
 		echo "" ; \
 	done
--- a/convert-pt-to-ggml.py
+++ b/convert-pt-to-ggml.py
@ -0,0 +1,328 @@
 # Convert Whisper transformer model from PyTorch to ggml format
 #
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
 #
 # You need to clone the original repo in ~/path/to/repo/whisper/
 #
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
 #
 # It is used to load various assets needed by the algorithm:
 #
 #  - tokenizer
 #  - mel filters
 #
 # Also, you need to have the original models in ~/.cache/whisper/
 # See the original repo for more details.
 #
 # This script loads the specified model and whisper assets and saves them in ggml format.
 # The output is a single binary file containing the following information:
 #
 #  - hparams
 #  - mel filters
 #  - tokenizer vocab
 #  - model variables
 #
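 # For each variable, write the following:
 #
 #  - Number of dimensions (int)
 #  - Name length (int)
 #  - Data type (int): 0 = float32, 1 = float16
 #  - Dimensions (int[n_dims]), written in reverse order
 #  - Name (char[name_length])
 #  - Data (float32 or float16 values, one per tensor element)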
 #
 import io
 import os
 import sys
 import struct
 import json
 import code
 import torch
 import numpy as np
 from transformers import GPTJForCausalLM
 from transformers import GPT2TokenizerFast
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 #
 # Maps a short language code to the lowercase English name of the language.
 # build_tokenizer() iterates over the keys in insertion order to create one
 # "<|code|>" special token per language, so the order of the entries is
 # significant - presumably it has to match the upstream Whisper table.
 # Do not sort or reorder the entries.
 LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
 }
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
 def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
    """
    Load a GPT-2 style tokenizer from the Whisper repository checkout and
    register the Whisper special tokens on it.

    path_to_whisper_repo: root of a local clone of the openai/whisper repo
    name: sub-directory of "whisper/assets" holding the tokenizer files

    Returns the tokenizer with the extra special tokens added.
    """
    # set before the tokenizer is created
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    assets_dir = os.path.join(path_to_whisper_repo, "whisper/assets", name)
    tok = GPT2TokenizerFast.from_pretrained(assets_dir)

    # one "<|xx|>" token per language, in the order of the LANGUAGES table
    language_tokens = ["<|%s|>" % code for code in LANGUAGES]
    control_tokens = [
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nocaptions|>",
        "<|notimestamps|>",
    ]
    extra_tokens = ["<|startoftranscript|>"] + language_tokens + control_tokens

    tok.add_special_tokens({"additional_special_tokens": extra_tokens})
    return tok
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
    """
    Build the reversible byte -> unicode character table used by the GPT-2 BPE.

    Returns a dict with one entry for each of the 256 byte values. Bytes in the
    ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the same
    code point. Every other byte (whitespace, control characters, ...) is mapped
    to a character starting at code point 256, assigned in increasing byte
    order, so that the BPE code never sees whitespace/control characters.
    """
    identity_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in identity_bytes}
    shifted = 0
    for byte in range(256):
        if byte in table:
            continue
        # remaining bytes get the next free code point above the byte range
        table[byte] = chr(256 + shifted)
        shifted += 1
    return table
 # Command line: model.pt path-to-whisper-repo dir-output [use-f32]
 # The optional 4th argument is only tested for presence further below.
 if len(sys.argv) < 4:
    print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
    sys.exit(1)
 fname_inp   = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out     = sys.argv[3]
 # try to load PyTorch binary data
 try:
    # read through a context manager so the file handle is closed right away
    with open(fname_inp, "rb") as f_inp:
        model_bytes = f_inp.read()
    with io.BytesIO(model_bytes) as fp:
        checkpoint = torch.load(fp, map_location="cpu")
 except Exception as err:
    # "except Exception" (instead of a bare "except") lets Ctrl-C and
    # sys.exit() propagate; the reason is printed to make failures diagnosable
    print("Error: failed to load PyTorch model file: %s" % fname_inp)
    print("  reason: %s" % err)
    sys.exit(1)
 # model hyper-parameters and weights as stored in the checkpoint
 hparams = checkpoint["dims"]
 print("hparams:", hparams)
 list_vars = checkpoint["model_state_dict"]
 # load mel filters: the .npz asset holds one array per mel-bin count
 n_mels = hparams["n_mels"]
 mel_filters_path = os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")
 with np.load(mel_filters_path) as mel_file:
    filters = torch.from_numpy(mel_file["mel_{}".format(n_mels)])
 # a vocabulary of exactly 51865 entries selects the multilingual tokenizer
 multilingual = hparams["n_vocab"] == 51865
 tokenizer_name = "multilingual" if multilingual else "gpt2"
 tokenizer = build_tokenizer(dir_whisper, tokenizer_name)
 dir_tokenizer = tokenizer.name_or_path
 # the output goes into dir_out; the file name is chosen further below
 fname_out = dir_out + "/ggml-model.bin"
 with open(dir_tokenizer + "/vocab.json", "r") as vocab_file:
    tokens = json.load(vocab_file)
 # use 16-bit floats unless any extra command line argument is given,
 # in which case everything is written as 32-bit floats to a separate file
 use_f16 = not (len(sys.argv) > 4)
 if not use_f16:
    fname_out = dir_out + "/ggml-model-f32.bin"
 # The output file stays open until all variables have been written.
 fout = open(fname_out, "wb")
 fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
 # hyper-parameters, one int each, in this fixed order
 for hparam_name in (
        "n_vocab",
        "n_audio_ctx",
        "n_audio_state",
        "n_audio_head",
        "n_audio_layer",
        "n_text_ctx",
        "n_text_state",
        "n_text_head",
        "n_text_layer",
        "n_mels",
 ):
    fout.write(struct.pack("i", hparams[hparam_name]))
 fout.write(struct.pack("i", use_f16))
 # write mel filters: both dimensions, then the values row by row as floats
 n_filter_rows = filters.shape[0]
 n_filter_cols = filters.shape[1]
 fout.write(struct.pack("i", n_filter_rows))
 fout.write(struct.pack("i", n_filter_cols))
 for row in range(n_filter_rows):
    for col in range(n_filter_cols):
        fout.write(struct.pack("f", filters[row][col]))
 # vocab.json keys use the printable stand-in characters from
 # bytes_to_unicode(); invert the table to recover the raw bytes
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v:k for k, v in byte_encoder.items()}
 # write the vocabulary: entry count, then length-prefixed UTF-8 text per token
 fout.write(struct.pack("i", len(tokens)))
 for key in tokens:
    raw = bytearray(byte_decoder[ch] for ch in key)
    text = raw.decode('utf-8', errors='replace').encode('utf-8')
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
 # Tensors that are always stored as float32, even in f16 mode
 # (in addition to every tensor with fewer than 2 dimensions).
 f32_only_names = (
    "encoder.conv1.bias",
    "encoder.conv2.bias",
    "encoder.positional_embedding",
    "decoder.positional_embedding",
 )
 # Write one record per model variable:
 #   n_dims, name length, ftype (3 ints), the dimensions in reverse order,
 #   the UTF-8 name, then the raw tensor data.
 for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)
    # reshape conv bias from [n] to [n, 1]
    if name in ("encoder.conv1.bias", "encoder.conv2.bias"):
        data = data.reshape(data.shape[0], 1)
        print("  Reshaped variable: " + name + " to shape: ", data.shape)
    n_dims = len(data.shape)
    # looks like the whisper models are in f16 by default
    # so we need to convert the small tensors to f32 until we fully support f16 in ggml
    # ftype == 0 -> float32, ftype == 1 -> float16
    if use_f16 and not (n_dims < 2 or name in f32_only_names):
        ftype = 1
        # make sure the payload really is float16, so that it always matches
        # the ftype written in the header (no-op for f16 checkpoints)
        if data.dtype != np.float16:
            data = data.astype(np.float16)
    else:
        if use_f16:
            print("  Converting to float32")
        data = data.astype(np.float32)
        ftype = 0
    # header
    # (named name_bytes rather than "str" to avoid shadowing the builtin)
    name_bytes = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(name_bytes)
    # data
    data.tofile(fout)
 fout.close()
 print("Done. Output file: " + fname_out)
 print("")
--- a/dr_wav.h
+++ b/dr_wav.h
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -0,0 +1,527 @@
 #pragma once
 #ifdef  __cplusplus
 extern "C" {
 #endif
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
 // compile-time limits
 // GGML_MAX_DIMS  - length of the ne[] / nb[] arrays in struct ggml_tensor
 // GGML_MAX_NODES - length of the nodes[] / grads[] / leafs[] arrays in struct ggml_cgraph
 // GGML_MAX_PARAMS, GGML_MAX_CONTEXTS - not referenced in this header
 //   NOTE(review): presumably used by the implementation in ggml.c - confirm
 #define GGML_MAX_DIMS     4
 #define GGML_MAX_NODES    4096
 #define GGML_MAX_PARAMS   16
 #define GGML_MAX_CONTEXTS 16
 // 16-bit floating point storage type
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
 typedef __fp16 ggml_fp16_t;
 #else
 // no built-in type is used here: the value is kept in a plain 16-bit integer
 // and has to go through the conversion functions below
 typedef uint16_t ggml_fp16_t;
 #endif
 // conversion between the 16-bit storage type and float
 float ggml_fp16_to_fp32(ggml_fp16_t x);
 ggml_fp16_t ggml_fp32_to_fp16(float x);
 // opaque types: only declared here, used through pointers by the API below
 struct ggml_object;
 struct ggml_context;
 // element type of a tensor (stored in ggml_tensor.type)
 enum ggml_type {
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
    GGML_TYPE_COUNT, // last enumerator: equals the number of types listed above
 };
 // operation identifier (stored in ggml_tensor.op)
 // NOTE(review): most values have a like-named function declared further down
 // (e.g. GGML_OP_ADD / ggml_add); presumably each function creates a tensor
 // tagged with the matching value - confirm in ggml.c
 enum ggml_op {
    GGML_OP_NONE = 0,
    GGML_OP_DUP,
    GGML_OP_ADD,
    GGML_OP_SUB,
    GGML_OP_MUL,
    GGML_OP_DIV,
    GGML_OP_SQR,
    GGML_OP_SQRT,
    GGML_OP_SUM,
    GGML_OP_MEAN,
    GGML_OP_REPEAT,
    GGML_OP_ABS,
    GGML_OP_SGN,
    GGML_OP_NEG,
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
    GGML_OP_NORM, // normalize
    GGML_OP_MUL_MAT,
    GGML_OP_SCALE,
    GGML_OP_CPY,
    GGML_OP_RESHAPE,
    GGML_OP_VIEW,
    GGML_OP_PERMUTE,
    GGML_OP_TRANSPOSE,
    GGML_OP_GET_ROWS,
    GGML_OP_DIAG_MASK_INF,
    GGML_OP_SOFT_MAX,
    GGML_OP_ROPE,
    GGML_OP_CONV_1D_1S,
    GGML_OP_CONV_1D_2S,
    GGML_OP_COUNT, // last enumerator: equals the number of operations listed above
 };
 // n-dimensional tensor
 // holds the element type, the shape (ne) and byte strides (nb) for up to
 // GGML_MAX_DIMS dimensions, plus the bookkeeping needed to make the tensor
 // a node of a computation graph
 struct ggml_tensor {
    enum ggml_type type; // element type
    int    n_dims;
    int    ne[GGML_MAX_DIMS]; // number of elements
    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
                              // nb[0] = sizeof(type)
                              // nb[1] = nb[0]   * ne[0] + padding
                              // nb[i] = nb[i-1] * ne[i-1]
    // compute data
    enum ggml_op op;
    bool is_param; // NOTE(review): presumably set by ggml_set_param() - confirm
    // NOTE(review): grad presumably holds the gradient and src0/src1 the
    // operands of op - confirm in ggml.c
    struct ggml_tensor * grad;
    struct ggml_tensor * src0;
    struct ggml_tensor * src1;
    // thread scheduling
    int n_tasks;
    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;
    void * data;
    char pad[8]; // NOTE(review): padding bytes; the reason is not visible in this header
 };
 // computation graph
 // fixed-capacity arrays of tensor pointers (GGML_MAX_NODES entries each);
 // returned by value from ggml_build_forward() / ggml_build_backward()
 struct ggml_cgraph {
    int n_nodes;
    int n_leafs;
    int n_threads;
    // NOTE(review): presumably scratch space used while computing the graph - confirm
    size_t work_size;
    struct ggml_tensor * work;
    struct ggml_tensor * nodes[GGML_MAX_NODES];
    struct ggml_tensor * grads[GGML_MAX_NODES];
    struct ggml_tensor * leafs[GGML_MAX_NODES];
    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;
 };
 // parameters for ggml_init(): describe the memory pool of the new context
 struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
 };
 //
 // timing helpers
 // (going by the names: wall-clock time in milliseconds / microseconds and a cycle counter)
 //
 int64_t ggml_time_ms(void);
 int64_t ggml_time_us(void);
 int64_t ggml_cycles(void);
 int64_t ggml_cycles_per_ms(void);
 // debug output for a single object / for the objects of a context
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);
 // size queries
 // NOTE(review): presumably element count, total size in bytes, and size in
 // bytes of one element (per type / per tensor) - confirm in ggml.c
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);
 size_t ggml_type_size   (enum ggml_type type);
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 // context lifetime: create a context from the given memory pool description,
 // release it again, and query how much of the pool is in use
 struct ggml_context * ggml_init(struct ggml_init_params params);
 void ggml_free(struct ggml_context * ctx);
 size_t ggml_used_mem(const struct ggml_context * ctx);
 //
 // tensor creation
 //
 // generic form: ne points to n_dims element counts
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
        const int *ne);
 // convenience forms for 1 to 4 dimensions (ne0 .. ne3 are the element counts)
 struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0);
 struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1);
 struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1,
        int    ne2);
 struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1,
        int    ne2,
        int    ne3);
 // NOTE(review): presumably creates a tensor holding the single given value - confirm
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 // NOTE(review): going by the names, dup creates a new tensor shaped like src
 // and view creates a tensor sharing the data of src - confirm in ggml.c
 struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
 struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 //
 // tensor data access
 //
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 // read / write the element at flat index i as a float
 // note: the setter takes a const tensor pointer although its name says it writes
 float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
 void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 // pointer to the tensor data, untyped / as float
 void * ggml_get_data    (const struct ggml_tensor * tensor);
 float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 //
 // operations on tensors with backpropagation
 //
 // every function takes the context plus one or two operand tensors and
 // returns the result tensor
 // NOTE(review): presumably the result is allocated from ctx and only
 // evaluated by ggml_graph_compute() - confirm in ggml.c
 //
 struct ggml_tensor * ggml_dup(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // binary arithmetic on a and b
 struct ggml_tensor * ggml_add(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_mul(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_div(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 // square / square root of a
 struct ggml_tensor * ggml_sqr(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 struct ggml_tensor * ggml_sqrt(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // return scalar
 // TODO: compute sum along rows
 struct ggml_tensor * ggml_sum(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // mean along rows
 struct ggml_tensor * ggml_mean(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // if a is the same shape as b, and a is not parameter, return a
 // otherwise, return a new tensor: repeat(a) to fit in b
 struct ggml_tensor * ggml_repeat(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 // unary functions of a: absolute value, sign, negation, step, ReLU
 struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 struct ggml_tensor * ggml_step(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // TODO: double-check this computation is correct
 struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
 struct ggml_tensor * ggml_mul_mat(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 //
 // operations on tensors without backpropagation
 //
 // "view(x)" in the comments below means the returned tensor refers to the
 // data of x instead of owning a copy
 //
 // in-place, returns view(a)
 // NOTE(review): b presumably holds the scale factor - confirm
 struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 // a -> b, return view(b)
 struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 // return view(a)
 // the new shape is given directly as ne0 x ne1
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1);
 // return view(a)
 // the new shape is given directly as ne0 x ne1 x ne2
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1,
        int                   ne2);
 // 1-d view of ne0 elements into a
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        size_t                offset);
 // 2-d view of ne0 x ne1 elements into a; offset is in bytes as well
 struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1,
        size_t                nb1, // row stride in bytes
        size_t                offset);
 // reorder the dimensions of a according to axis0 .. axis3
 // NOTE(review): the direction of the axis mapping is not visible here - confirm in ggml.c
 struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   axis0,
        int                   axis1,
        int                   axis2,
        int                   axis3);
 // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
 struct ggml_tensor * ggml_transpose(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // NOTE(review): presumably selects the rows of a whose indices are listed in b - confirm
 struct ggml_tensor * ggml_get_rows(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 // set elements above the diagonal to -INF
 // in-place, returns view(a)
 struct ggml_tensor * ggml_diag_mask_inf(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past);
 // in-place, returns view(a)
 struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
 // rotary position embedding
 // in-place, returns view(a)
 // if mode == 1, skip n_past elements
 // TODO: avoid creating a new tensor every time
 struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        int                   n_dims,
        int                   mode);
 // 1-d convolution; the _1s / _2s suffix presumably stands for stride 1 / stride 2
 // padding = 1
 // TODO: we don't support extra parameters for now
 //       that's why we are hard-coding the stride, padding, and dilation
 //       not great ..
 struct ggml_tensor * ggml_conv_1d_1s(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_conv_1d_2s(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 //
 // automatic differentiation
 //
 // NOTE(review): presumably marks the tensor as a trainable parameter
 // (see ggml_tensor.is_param) - confirm in ggml.c
 void ggml_set_param(
        struct ggml_context * ctx,
        struct ggml_tensor * tensor);
 // graph construction
 // ggml_build_forward returns a new graph by value for the given tensor,
 // ggml_build_forward_expand works on an existing graph
 // ggml_build_backward derives a graph from the forward graph gf
 // NOTE(review): the meaning of the "keep" flag is not visible here - confirm
 void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
 struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 // evaluate the graph / reset it
 // NOTE(review): reset presumably clears the gradients - confirm
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
 void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 // print info and performance information for the graph
 void ggml_graph_print(const struct ggml_cgraph * cgraph);
 // dump the graph into a file using the dot format
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 //
 // optimization
 //
 // optimization methods
 enum ggml_opt_type {
    GGML_OPT_ADAM,
    GGML_OPT_LBFGS,
 };
 // linesearch methods
 // note: GGML_LINESEARCH_DEFAULT has the same value as
 // GGML_LINESEARCH_BACKTRACKING_WOLFE (both are 1), i.e. it is an alias for it
 enum ggml_linesearch {
    GGML_LINESEARCH_DEFAULT = 1,
    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
 };
 // optimization return values
 // GGML_OPT_* values count up from 0; the GGML_LINESEARCH_* values are
 // negative, counting up from -128 (-128, -127, -126, -125, -124)
 enum ggml_opt_result {
    GGML_OPT_OK = 0,
    GGML_OPT_DID_NOT_CONVERGE,
    GGML_OPT_NO_CONTEXT,
    GGML_OPT_INVALID_WOLFE,
    GGML_OPT_FAIL,
    GGML_LINESEARCH_FAIL = -128,
    GGML_LINESEARCH_MINIMUM_STEP,
    GGML_LINESEARCH_MAXIMUM_STEP,
    GGML_LINESEARCH_MAXIMUM_ITERATIONS,
    GGML_LINESEARCH_INVALID_PARAMETERS,
 };
 // optimization parameters
 //
 //   see ggml.c (ggml_opt_default_params) for default values
 //
 struct ggml_opt_params {
    enum ggml_opt_type type; // NOTE(review): presumably selects which union member below is used - confirm
    int n_threads;
    // delta-based convergence test
    //
    //   if past == 0 - disabled
    //   if past > 0:
    //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
    //
    int past;
    float delta;
    // maximum number of iterations without improvement
    //
    //   if 0 - disabled
    //   if > 0:
    //     assume convergence if no cost improvement in this number of iterations
    //
    int max_no_improvement;
    // NOTE(review): presumably debug switches that print the graphs - confirm
    bool print_forward_graph;
    bool print_backward_graph;
    // method-specific settings; the two structs share storage
    union {
        // ADAM parameters
        struct {
            int n_iter;
            float alpha; // learning rate
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float eps_f; // epsilon for convergence test
            float eps_g; // epsilon for convergence test
        } adam;
        // LBFGS parameters
        struct {
            int m; // number of corrections to approximate the inv. Hessian
            int n_iter;
            int max_linesearch;
            float eps;      // convergence tolerance
            float ftol;     // line search tolerance
            float wolfe;
            float min_step;
            float max_step;
            enum ggml_linesearch linesearch;
        } lbfgs;
    };
 };
 // returns the default parameters for the given optimization method
 // (the values themselves are defined in ggml.c)
 struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 // optimize the function defined by the tensor f
 // params is passed by value; the return value is one of enum ggml_opt_result
 enum ggml_opt_result ggml_opt(
        struct ggml_context * ctx,
        struct ggml_opt_params params,
        struct ggml_tensor * f);
 #ifdef  __cplusplus
 }
 #endif
--- a/main.cpp
+++ b/main.cpp
--- a/models/.gitignore
+++ b/models/.gitignore
@ -0,0 +1 @@
 *.bin
--- a/samples/.gitignore
+++ b/samples/.gitignore
@ -0,0 +1 @@
 *
--- a/samples/jfk.wav
+++ b/samples/jfk.wav