Compare commits

..

3 Commits
master ... t5

1
.gitignore vendored

@ -9,4 +9,3 @@ compile_commands.json
.DS_Store
src/arm_neon.h
tests/arm_neon.h

@ -47,7 +47,6 @@ endif()
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
# dependencies

@ -2,9 +2,6 @@
Tensor library for machine learning
***Note that this project is under development and not ready for production use. \
Some of the development is currently happening in the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repo***
## Features
- Written in C
@ -16,15 +13,9 @@ Some of the development is currently happening in the [whisper.cpp](https://gith
- No third-party dependencies
- Zero memory allocations during runtime
## Roadmap
- [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
- [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j)
- [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper)
- [ ] Support 4-bit integer quantization https://github.com/ggerganov/ggml/pull/27
- [ ] Example of FLAN-T5 inference https://github.com/ggerganov/ggml/pull/12
- [ ] Example of LLaMA inference
- [ ] Example of RWKV inference
***Note that this project is under development and not ready for production use.
Most of the development is currently happening in the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repo,
so if you are interested in this project, make sure to follow what is happening there.***
## Whisper inference (example)

@ -4,3 +4,4 @@ target_include_directories(ggml_utils PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(gpt-2)
add_subdirectory(gpt-j)
add_subdirectory(whisper)
add_subdirectory(t5)

@ -347,7 +347,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
// - embd_w: the predicted probabilities of the next token
//
bool gpt2_eval(
const gpt2_model & model,
@ -496,6 +496,7 @@ bool gpt2_eval(
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
#if 0
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
@ -503,6 +504,15 @@ bool gpt2_eval(
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
#else
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
#endif
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
@ -627,7 +637,7 @@ bool gpt2_eval(
inpL = ggml_mul_mat(ctx0, model.wte, inpL);
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
@ -641,7 +651,7 @@ bool gpt2_eval(
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result just for the last token
// return result for just the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
@ -698,7 +708,7 @@ int main(int argc, char ** argv) {
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
std::vector<float> logits;
std::vector<float> embd_w;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
@ -714,14 +724,14 @@ int main(int argc, char ** argv) {
// determine the required inference memory per token:
size_t mem_per_token = 0;
gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, embd_w, mem_per_token);
for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
if (!gpt2_eval(model, params.n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("Failed to predict\n");
return 1;
}
@ -745,7 +755,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_sample_us = ggml_time_us();
id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
id = gpt_sample_top_k_top_p(vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, rng);
t_sample_us += ggml_time_us() - t_start_sample_us;
}

@ -214,11 +214,8 @@ make -j4 gpt-j
```
To run the `gpt-j` tool, you need the 12GB `ggml-model.bin` file which contains the GPT-J model in
[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, the binary file
is downloaded from my repository on Hugging Face using the [download-ggml-model.sh](download-ggml-model.sh) script.
You can also, download the file manually from this link:
https://huggingface.co/datasets/ggerganov/ggml/tree/main
[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, I download the binary file
directly from one of my servers, using the [download-ggml-model.sh](download-ggml-model.sh) script.
---

@ -355,7 +355,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
// - embd_w: the predicted probabilities of the next token
//
// The GPT-J model requires about 16MB of memory per input token.
//
@ -559,7 +559,7 @@ bool gptj_eval(
}
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
@ -630,7 +630,7 @@ int main(int argc, char ** argv) {
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
std::vector<float> logits;
std::vector<float> embd_w;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
@ -644,14 +644,14 @@ int main(int argc, char ** argv) {
// determine the required inference memory per token:
size_t mem_per_token = 0;
gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, embd_w, mem_per_token);
for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
if (!gptj_eval(model, params.n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("Failed to predict\n");
return 1;
}
@ -675,7 +675,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_sample_us = ggml_time_us();
id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
id = gpt_sample_top_k_top_p(vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, rng);
t_sample_us += ggml_time_us() - t_start_sample_us;
}

@ -0,0 +1,6 @@
#
# t5
set(TEST_TARGET t5)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml ggml_utils)

@ -0,0 +1,3 @@
# t5
ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py

@ -0,0 +1,119 @@
import io
import sys
import torch
import json
import struct
import numpy
import code # tmp
#from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
if len(sys.argv) < 3:
print("Usage: convert-flan-t5-pt-to-ggml.py path-to-pt-model dir-output [use-f32]\n")
sys.exit(1)
dir_inp = sys.argv[1]
dir_out = sys.argv[2]
fname_inp = dir_inp + "/pytorch_model.bin"
fname_out = dir_out + "/ggml-t5-model.bin"
fname_config = dir_inp + "/config.json"
# use 16-bit or 32-bit floats
use_f16 = True
if len(sys.argv) > 3:
use_f16 = False
fname_out = dir_out + "/ggml-t5-model-f32.bin"
# load torch model
try:
model_bytes = open(fname_inp, "rb").read()
with io.BytesIO(model_bytes) as fp:
checkpoint = torch.load(fp, map_location="cpu")
except:
print("Error: failed to load PyTorch model file: %s" % fname_inp)
sys.exit(1)
# load config (json)
config = json.load(open(fname_config, "r"))
# list all keys
for k in checkpoint.keys():
print(k)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# list methods of tokenizer
for m in dir(tokenizer):
print(m)
print(config)
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", config["vocab_size"]))
fout.write(struct.pack("i", config["d_ff"]))
fout.write(struct.pack("i", config["d_kv"]))
fout.write(struct.pack("i", config["d_model"]))
fout.write(struct.pack("i", config["n_positions"]))
fout.write(struct.pack("i", config["num_heads"]))
fout.write(struct.pack("i", config["num_layers"]))
fout.write(struct.pack("i", use_f16))
# sort tokenizer.vocab by value
tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1])
fout.write(struct.pack("i", len(tokens)))
print("tokens: %d" % len(tokens))
for key in tokens:
# TODO: this probably is wrong, but it should work for english at least
token = key[0].replace("", " ")
text = bytearray(token, "utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
# tokenize "hello world"
#print(tokenizer.encode("Hello hello world.Hello-Hello"))
#print(tokenizer("добър ден", return_tensors="pt"))
# dump weights
for k in checkpoint.keys():
data = checkpoint[k].squeeze().numpy()
name = k
n_dims = len(data.shape)
print(name, n_dims, data.shape)
ftype = 1;
if use_f16:
if n_dims < 2:
print(" Converting to float32")
ftype = 0
else:
print(" Converting to float16")
data = data.astype(numpy.float16)
ftype = 1
else:
ftype = 0
# header
str = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str);
# data
data.tofile(fout)
fout.close()
print("Done. Output file: " + fname_out)
print("")
#code.interact(local=locals())

@ -0,0 +1,740 @@
#include "ggml.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
// available t5 models
enum e_model {
MODEL_UNKNOWN,
MODEL_SMALL,
MODEL_BASE,
MODEL_LARGE,
MODEL_XL,
MODEL_XXL,
};
static const size_t MB = 4*1024*1024;
static const std::map<e_model, size_t> MEM_REQ_MODEL = {
{ MODEL_SMALL, 74ull*MB },
{ MODEL_BASE, 142ull*MB },
{ MODEL_LARGE, 466ull*MB },
{ MODEL_XL, 1464ull*MB },
{ MODEL_XXL, 2952ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_MEMORY = {
{ MODEL_SMALL, 12ull*MB },
{ MODEL_BASE, 24ull*MB },
{ MODEL_LARGE, 70ull*MB },
{ MODEL_XL, 184ull*MB },
{ MODEL_XXL, 306ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
{ MODEL_SMALL, 80ull*MB },
{ MODEL_BASE, 128ull*MB },
{ MODEL_LARGE, 300ull*MB },
{ MODEL_XL, 680ull*MB },
{ MODEL_XXL, 1100ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
{ MODEL_SMALL, 104ull*MB },
{ MODEL_BASE, 138ull*MB },
{ MODEL_LARGE, 208ull*MB },
{ MODEL_XL, 280ull*MB },
{ MODEL_XXL, 354ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_DECODE = {
{ MODEL_SMALL, 200ull*MB },
{ MODEL_BASE, 202ull*MB },
{ MODEL_LARGE, 204ull*MB },
{ MODEL_XL, 206ull*MB },
{ MODEL_XXL, 208ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_DECODE_LAYER = {
{ MODEL_SMALL, 32ull*MB },
{ MODEL_BASE, 44ull*MB },
{ MODEL_LARGE, 64ull*MB },
{ MODEL_XL, 84ull*MB },
{ MODEL_XXL, 110ull*MB },
};
struct t5_vocab {
using id = int32_t;
using token = std::string;
int n_vocab = 32128;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
// default hparams (FLAN-T5 Small)
struct t5_hparams {
int32_t n_vocab = 32128;
int32_t d_ff = 1024;
int32_t d_kv = 64;
int32_t d_model = 512;
int32_t n_positions = 512;
int32_t n_head = 6;
int32_t n_layer = 8;
int32_t f16 = 1;
};
struct t5_layer_encoder {
// encoder.block.*.layer.0.SelfAttention
struct ggml_tensor * attn_q;
struct ggml_tensor * attn_k;
struct ggml_tensor * attn_v;
struct ggml_tensor * attn_o;
// encoder.blocks.*.layer.0.layer_norm
struct ggml_tensor * ln_0;
// encoder.blocks.*.layer.1.DenseReluDense
struct ggml_tensor * wi_0;
struct ggml_tensor * wi_1;
struct ggml_tensor * wo;
// encoder.blocks.*.layer.1.layer_norm
struct ggml_tensor * ln_1;
};
struct t5_layer_decoder {
// decoder.block.*.layer.0.SelfAttention
struct ggml_tensor * attn_q;
struct ggml_tensor * attn_k;
struct ggml_tensor * attn_v;
struct ggml_tensor * attn_o;
// decoder.blocks.*.layer.0.layer_norm
struct ggml_tensor * ln_0;
// decoder.blocks.*.layer.1.EncDecAttention
struct ggml_tensor * cross_attn_q;
struct ggml_tensor * cross_attn_k;
struct ggml_tensor * cross_attn_v;
struct ggml_tensor * cross_attn_o;
// decoder.blocks.*.layer.1.layer_norm
struct ggml_tensor * ln_1;
// decoder.blocks.*.layer.1.DenseReluDense
struct ggml_tensor * wi_0;
struct ggml_tensor * wi_1;
struct ggml_tensor * wo;
// decoder.blocks.*.layer.1.layer_norm
struct ggml_tensor * ln_2;
};
struct t5_model {
e_model type = MODEL_UNKNOWN;
t5_hparams hparams;
// shared
struct ggml_tensor * shared;
// encoder.embed_tokens
struct ggml_tensor * e_et;
// encoder.final_layer_norm
struct ggml_tensor * e_ln;
// encoder.block.0.layer.0.SelfAttention.relative_attention_bias
struct ggml_tensor * e_rab;
// decoder.embed_tokens
struct ggml_tensor * d_et;
// decoder.final_layer_norm
struct ggml_tensor * d_ln;
// decoder.block.0.layer.0.SelfAttention.relative_attention_bias
struct ggml_tensor * d_rab;
// lm_head
struct ggml_tensor * lm_head;
std::vector<t5_layer_encoder> layers_encoder;
std::vector<t5_layer_decoder> layers_decoder;
// context
struct ggml_context * ctx;
struct ggml_context * ctx_mem;
// tensors
int n_loaded;
std::map<std::string, struct ggml_tensor *> tensors;
};
struct t5_context {
int64_t t_load_us = 0;
int64_t t_sample_us = 0;
int64_t t_encode_us = 0;
int64_t t_decode_us = 0;
int64_t t_start_us = 0;
std::vector<uint8_t> buf_model;
std::vector<uint8_t> buf_memory;
std::vector<uint8_t> buf_compute;
std::vector<uint8_t> buf_compute_layer;
t5_model model;
t5_vocab vocab;
std::vector<float> probs;
std::vector<float> logits;
};
template<typename T>
static void read_safe(std::ifstream& fin, T& dest) {
fin.read((char*)& dest, sizeof(T));
}
static bool t5_model_load(const std::string & fname, t5_context & wctx) {
fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
auto & model = wctx.model;
auto & vocab = wctx.vocab;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
read_safe(fin, magic);
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
}
//load hparams
{
auto & hparams = model.hparams;
read_safe(fin, hparams.n_vocab);
read_safe(fin, hparams.d_ff);
read_safe(fin, hparams.d_kv);
read_safe(fin, hparams.d_model);
read_safe(fin, hparams.n_positions);
read_safe(fin, hparams.n_head);
read_safe(fin, hparams.n_layer);
read_safe(fin, hparams.f16);
assert(hparams.n_text_state == hparams.n_audio_state);
if (hparams.n_layer == 8) {
model.type = e_model::MODEL_SMALL;
}
if (hparams.n_layer == 12) {
model.type = e_model::MODEL_BASE;
}
if (hparams.n_layer == 24 && hparams.n_head == 16) {
model.type = e_model::MODEL_LARGE;
}
if (hparams.n_layer == 24 && hparams.n_head == 32) {
model.type = e_model::MODEL_XL;
}
if (hparams.n_layer == 24 && hparams.n_head == 64) {
model.type = e_model::MODEL_XXL;
}
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: d_ff = %d\n", __func__, hparams.d_ff);
fprintf(stderr, "%s: d_kv = %d\n", __func__, hparams.d_kv);
fprintf(stderr, "%s: d_model = %d\n", __func__, hparams.d_model);
fprintf(stderr, "%s: n_positions = %d\n", __func__, hparams.n_positions);
fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type));
wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
}
// load vocab
{
int32_t n_vocab = 0;
read_safe(fin, n_vocab);
//if (n_vocab != model.hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
// return false;
//}
std::string word;
std::vector<char> tmp;
tmp.reserve(128);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
read_safe(fin, len);
if (len > 0) {
tmp.resize(len);
fin.read(&tmp[0], tmp.size()); // read to buffer
word.assign(&tmp[0], tmp.size());
} else {
// seems like we have an empty-string token in multi-language models (i = 50256)
//fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
word = "";
}
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
//printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
}
vocab.n_vocab = model.hparams.n_vocab;
if (n_vocab < model.hparams.n_vocab) {
fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
word = "[_extra_token_" + std::to_string(i) + "]";
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
wctx.logits.reserve(vocab.n_vocab*model.hparams.d_model);
wctx.probs.reserve(vocab.n_vocab*model.hparams.d_model);
}
{
// this is the total memory required to run the inference
const size_t mem_required =
wctx.buf_model.size() +
wctx.buf_memory.size() +
wctx.buf_compute.size() +
wctx.buf_compute_layer.size();
fprintf(stderr, "%s: mem_required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
}
// for the big tensors, we have the option to store the data in 16-bit floats
// in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_vocab = hparams.n_vocab;
const int d_ff = hparams.d_ff;
const int d_kv = hparams.d_kv;
const int d_model = hparams.d_model;
const int n_head = hparams.n_head;
const int n_layer = hparams.n_layer;
ctx_size += n_vocab*d_model*ggml_type_size(wtype); // shared;
ctx_size += n_vocab*d_model*ggml_type_size(wtype); // lm_head;
// encoder
{
ctx_size += n_vocab*d_model*ggml_type_size(wtype); // e_et;
ctx_size += d_model*ggml_type_size(GGML_TYPE_F32); // e_ln
ctx_size += 32*n_head*ggml_type_size(wtype); // e_rab
}
// decoder
{
ctx_size += n_vocab*d_model*ggml_type_size(wtype); // d_et;
ctx_size += d_model*ggml_type_size(GGML_TYPE_F32); // d_ln
ctx_size += 32*n_head*ggml_type_size(wtype); // d_rab
}
// encoder layers
{
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_q
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_k
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_v
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_o
ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_0
ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_0
ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_1
ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wo
ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_1
}
// decoder layers
{
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_q
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_k
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_v
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // attn_o
ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_0
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_q
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_k
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_v
ctx_size += n_layer*(d_kv*n_head*d_model*ggml_type_size(wtype)); // cross_attn_o
ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_1
ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_0
ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wi_1
ctx_size += n_layer*(d_ff*d_model*ggml_type_size(wtype)); // wo
ctx_size += n_layer*(d_model*ggml_type_size(GGML_TYPE_F32)); // ln_2
}
ctx_size += (15 + 9*n_layer + 14*n_layer)*256; // object overhead
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = wctx.buf_model.size();
params.mem_buffer = wctx.buf_model.data();
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// prepare memory for the weights
{
auto & ctx = model.ctx;
const auto & hparams = model.hparams;
const int n_vocab = hparams.n_vocab;
const int d_ff = hparams.d_ff;
const int d_kv = hparams.d_kv;
const int d_model = hparams.d_model;
const int n_head = hparams.n_head;
const int n_layer = hparams.n_layer;
model.layers_encoder.resize(n_layer);
model.layers_decoder.resize(n_layer);
// global
{
model.shared = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
model.tensors["shared.weight"] = model.shared;
model.tensors["lm_head.weight"] = model.lm_head;
}
// encoder
{
model.e_et = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
model.e_ln = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
model.e_rab = ggml_new_tensor_2d(ctx, wtype, n_head, 32);
// map by name
model.tensors["encoder.embed_tokens.weight"] = model.e_et;
model.tensors["encoder.final_layer_norm.weight"] = model.e_ln;
model.tensors["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = model.e_rab;
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers_encoder[i];
layer.attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
layer.ln_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
layer.wi_0 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
layer.wi_1 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
layer.wo = ggml_new_tensor_2d(ctx, wtype, d_ff, d_model);
layer.ln_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
// map by name
model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.q.weight"] = layer.attn_q;
model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.k.weight"] = layer.attn_k;
model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.v.weight"] = layer.attn_v;
model.tensors["encoder.block." + std::to_string(i) + ".layer.0.SelfAttention.o.weight"] = layer.attn_o;
model.tensors["encoder.block." + std::to_string(i) + ".layer.0.layer_norm.weight"] = layer.ln_0;
model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wi_0.weight"] = layer.wi_0;
model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wi_1.weight"] = layer.wi_1;
model.tensors["encoder.block." + std::to_string(i) + ".layer.1.DenseReluDense.wo.weight"] = layer.wo;
model.tensors["encoder.block." + std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1;
}
}
// decoder
{
model.d_et = ggml_new_tensor_2d(ctx, wtype, d_model, n_vocab);
model.d_ln = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
model.d_rab = ggml_new_tensor_2d(ctx, wtype, n_head, 32);
// map by name
model.tensors["decoder.embed_tokens.weight"] = model.d_et;
model.tensors["decoder.final_layer_norm.weight"] = model.d_ln;
model.tensors["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = model.d_rab;
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers_decoder[i];
layer.attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
layer.ln_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
layer.cross_attn_q = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.cross_attn_k = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.cross_attn_v = ggml_new_tensor_2d(ctx, wtype, d_model, d_kv*n_head);
layer.cross_attn_o = ggml_new_tensor_2d(ctx, wtype, d_kv*n_head, d_model);
layer.ln_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
layer.wi_0 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
layer.wi_1 = ggml_new_tensor_2d(ctx, wtype, d_model, d_ff);
layer.wo = ggml_new_tensor_2d(ctx, wtype, d_ff, d_model);
layer.ln_2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d_model);
// map by name
model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.q.weight"] = layer.attn_q;
model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.k.weight"] = layer.attn_k;
model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.v.weight"] = layer.attn_v;
model.tensors["decoder.block." + std::to_string(i) + ".layer.0.SelfAttention.o.weight"] = layer.attn_o;
model.tensors["decoder.block." + std::to_string(i) + ".layer.0.layer_norm.weight"] = layer.ln_0;
model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.q.weight"] = layer.cross_attn_q;
model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.k.weight"] = layer.cross_attn_k;
model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.v.weight"] = layer.cross_attn_v;
model.tensors["decoder.block." + std::to_string(i) + ".layer.1.EncDecAttention.o.weight"] = layer.cross_attn_o;
model.tensors["decoder.block." + std::to_string(i) + ".layer.1.layer_norm.weight"] = layer.ln_1;
model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wi_0.weight"] = layer.wi_0;
model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wi_1.weight"] = layer.wi_1;
model.tensors["decoder.block." + std::to_string(i) + ".layer.2.DenseReluDense.wo.weight"] = layer.wo;
model.tensors["decoder.block." + std::to_string(i) + ".layer.2.layer_norm.weight"] = layer.ln_2;
}
}
}
// create the ggml memory context
{
struct ggml_init_params params;
params.mem_size = wctx.buf_memory.size();
params.mem_buffer = wctx.buf_memory.data();
model.ctx_mem = ggml_init(params);
if (!model.ctx_mem) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// key + value memory
//{
// auto & ctx = model.ctx_mem;
// const auto & hparams = model.hparams;
// const int n_text_state = hparams.n_text_state;
// const int n_text_layer = hparams.n_text_layer;
// const int n_text_ctx = hparams.n_text_ctx;
// // key/value memory for the self-attention layer
// {
// const int n_mem = n_text_layer*n_text_ctx;
// const int n_elements = n_text_state*n_mem;
// model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
// model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
// }
// // key/value memory for the cross-attention layer
// {
// const int n_audio_ctx = hparams.n_audio_ctx;
// const int n_mem = n_text_layer*n_audio_ctx;
// const int n_elements = n_text_state*n_mem;
// model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
// model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
// }
// const size_t memory_size =
// ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
// ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
// fprintf(stderr, "%s: memory size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
//}
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
read_safe(fin, n_dims);
read_safe(fin, length);
read_safe(fin, ftype);
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[3] = { 1, 1, 1 };
for (int i = 0; i < n_dims; ++i) {
read_safe(fin, ne[i]);
nelements *= ne[i];
}
std::string name;
std::vector<char> tmp(length); // create a buffer
fin.read(&tmp[0], tmp.size()); // read to buffer
name.assign(&tmp[0], tmp.size());
if (model.tensors.find(name) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
return false;
}
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
if (nelements*bpe != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
model.n_loaded++;
}
fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
return false;
}
}
fin.close();
return true;
}
struct t5_context * t5_init(const char * path_model) {
ggml_time_init();
t5_context * ctx = new t5_context;
const int64_t t_start_us = ggml_time_us();
ctx->t_start_us = t_start_us;
if (!t5_model_load(path_model, *ctx)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
delete ctx;
return nullptr;
}
ctx->t_load_us = ggml_time_us() - t_start_us;
return ctx;
}
void t5_free(struct t5_context * ctx) {
if (ctx) {
if (ctx->model.ctx) {
ggml_free(ctx->model.ctx);
}
if (ctx->model.ctx_mem) {
ggml_free(ctx->model.ctx_mem);
}
delete ctx;
}
}
int main(int argc, char ** argv) {
if (argc < 2) {
fprintf(stderr, "usage: %s <model>\n", argv[0]);
return -1;
}
const char * path_model = argv[1];
t5_context * ctx = t5_init(path_model);
if (!ctx) {
fprintf(stderr, "%s: failed to initialize T5 context\n", __func__);
return -1;
}
fprintf(stderr, "%s: model loaded in %7.2f ms\n", __func__, ctx->t_load_us/1000.0);
t5_free(ctx);
return 0;
}

@ -261,11 +261,8 @@ gpt_vocab::id gpt_sample_top_k_top_p(
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
for (int i = 0; i < n_logits; i++) {
logits_id.push_back(std::make_pair(logits[i], i));
}
// find the top K tokens
@ -278,51 +275,59 @@ gpt_vocab::id gpt_sample_top_k_top_p(
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
{
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += logits_id[i].first;
if (cumsum >= top_p) {
logits_id.resize(i+1);
break;
}
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
// normalize again
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
}
//printf("\n");
//for (int i = 0; i < (int) probs.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//for (int i = 0; i < (int)logits_id.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
//}
//exit(0);
// sample from the obtained distribution
std::vector<double> probs;
probs.reserve(logits_id.size());
for (int i = 0; i < (int) logits_id.size(); i++) {
probs.push_back(logits_id[i].first);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);

@ -1,7 +1,7 @@
#
# whisper
add_library(whisper-cpp
add_library(whisper-cpp SHARED
whisper.cpp
)
@ -10,6 +10,6 @@ target_link_libraries(whisper-cpp PRIVATE
)
set(TEST_TARGET whisper)
add_executable(${TEST_TARGET} main.cpp common.cpp)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE whisper-cpp)
target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)

@ -1,162 +0,0 @@
#include "common.h"
// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <cmath>
#include <regex>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, "");
}
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
std::string result = s;
size_t pos = 0;
while ((pos = result.find(from, pos)) != std::string::npos) {
result.replace(pos, from.length(), to);
pos += to.length();
}
return result;
}
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
if (fname == "-") {
{
uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return false;
}
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
return false;
}
if (stereo && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
return false;
}
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
return false;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
return false;
}
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (stereo) {
// convert to stereo, float
pcmf32s.resize(2);
pcmf32s[0].resize(n);
pcmf32s[1].resize(n);
for (uint64_t i = 0; i < n; i++) {
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
}
}
return true;
}
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
const float rc = 1.0f / (2.0f * M_PI * cutoff);
const float dt = 1.0f / sample_rate;
const float alpha = dt / (rc + dt);
float y = data[0];
for (size_t i = 1; i < data.size(); i++) {
y = alpha * (y + data[i] - data[i - 1]);
data[i] = y;
}
}
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
const int n_samples = pcmf32.size();
const int n_samples_last = (sample_rate * last_ms) / 1000;
if (n_samples_last >= n_samples) {
// not enough samples - assume no speech
return false;
}
if (freq_thold > 0.0f) {
high_pass_filter(pcmf32, freq_thold, sample_rate);
}
float energy_all = 0.0f;
float energy_last = 0.0f;
for (int i = 0; i < n_samples; i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= n_samples - n_samples_last) {
energy_last += fabsf(pcmf32[i]);
}
}
energy_all /= n_samples;
energy_last /= n_samples_last;
if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
}
if (energy_last > vad_thold*energy_all) {
return false;
}
return true;
}

@ -1,40 +0,0 @@
#pragma once
// needs to match WHISPER_SAMPLE_RATE
#define COMMON_SAMPLE_RATE 16000
#include <vector>
#include <string>
std::string trim(const std::string & s);
std::string replace(
const std::string & s,
const std::string & from,
const std::string & to);
// Read WAV audio file and store the PCM data into pcmf32
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
const std::string & fname,
std::vector<float> & pcmf32,
std::vector<std::vector<float>> & pcmf32s,
bool stereo);
// Apply a high-pass frequency filter to PCM audio
// Suppresses frequencies below cutoff Hz
void high_pass_filter(
std::vector<float> & data,
float cutoff,
float sample_rate);
// Basic voice activity detection (VAD) using audio energy adaptive threshold
bool vad_simple(
std::vector<float> & pcmf32,
int sample_rate,
int last_ms,
float vad_thold,
float freq_thold,
bool verbose);

@ -1,7 +1,10 @@
#include "common.h"
#include "whisper.h"
// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <cmath>
#include <fstream>
#include <cstdio>
@ -50,24 +53,18 @@ void replace_all(std::string & s, const std::string & search, const std::string
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t best_of = 5;
int32_t beam_size = -1;
int32_t max_len = 0;
float word_thold = 0.01f;
float entropy_thold = 2.40f;
float logprob_thold = -1.00f;
float word_thold = 0.01f;
bool speed_up = false;
bool translate = false;
bool diarize = false;
bool split_on_word = false;
bool no_fallback = false;
bool output_txt = false;
bool output_vtt = false;
bool output_srt = false;
@ -83,7 +80,6 @@ struct whisper_params {
std::string model = "models/ggml-base.en.bin";
std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_out = {};
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -92,11 +88,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-"){
params.fname_inp.push_back(arg);
continue;
}
if (arg[0] != '-') {
params.fname_inp.push_back(arg);
continue;
@ -113,22 +104,15 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
@ -152,38 +136,31 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, "\n");
}
@ -199,81 +176,90 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
const int n_segments = whisper_full_n_segments(ctx);
std::string speaker = "";
int64_t t0;
int64_t t1;
// print the last n_new segments
const int s0 = n_segments - n_new;
if (s0 == 0) {
printf("\n");
}
for (int i = s0; i < n_segments; i++) {
if (!params.no_timestamps || params.diarize) {
t0 = whisper_full_get_segment_t0(ctx, i);
t1 = whisper_full_get_segment_t1(ctx, i);
}
if (params.no_timestamps) {
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
if (!params.no_timestamps) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
if (params.diarize && pcmf32s.size() == 2) {
const int64_t n_samples = pcmf32s[0].size();
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
const int64_t is0 = timestamp_to_sample(t0, n_samples);
const int64_t is1 = timestamp_to_sample(t1, n_samples);
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
}
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s", text);
}
fflush(stdout);
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
double energy0 = 0.0f;
double energy1 = 0.0f;
std::string speaker;
for (int64_t j = is0; j < is1; j++) {
energy0 += fabs(pcmf32s[0][j]);
energy1 += fabs(pcmf32s[1][j]);
}
if (params.diarize && pcmf32s.size() == 2) {
const int64_t n_samples = pcmf32s[0].size();
if (energy0 > 1.1*energy1) {
speaker = "(speaker 0)";
} else if (energy1 > 1.1*energy0) {
speaker = "(speaker 1)";
} else {
speaker = "(speaker ?)";
}
const int64_t is0 = timestamp_to_sample(t0, n_samples);
const int64_t is1 = timestamp_to_sample(t1, n_samples);
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
}
double energy0 = 0.0f;
double energy1 = 0.0f;
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
for (int64_t j = is0; j < is1; j++) {
energy0 += fabs(pcmf32s[0][j]);
energy1 += fabs(pcmf32s[1][j]);
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
if (energy0 > 1.1*energy1) {
speaker = "(speaker 0)";
} else if (energy1 > 1.1*energy0) {
speaker = "(speaker 1)";
} else {
speaker = "(speaker ?)";
}
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
}
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s%s", speaker.c_str(), text);
}
if (params.print_colors) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
// with timestamps or speakers: each segment on new line
if (!params.no_timestamps || params.diarize) {
printf("\n");
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
fflush(stdout);
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
}
printf("\n");
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("[%s --> %s] %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
}
}
}
}
@ -354,16 +340,20 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
if (text[0] == ' ')
text = text + sizeof(char); //whisper_full_get_segment_text() returns a string with leading space, point to the next character.
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text << "\"\n";
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
fout << 10 * t0 << ", "
<< 10 * t1 << ", \""
<< text << "\"\n";
}
return true;
}
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
@ -497,7 +487,7 @@ int main(int argc, char ** argv) {
// whisper init
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
struct whisper_context * ctx = whisper_init(params.model.c_str());
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
@ -522,14 +512,90 @@ int main(int argc, char ** argv) {
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
continue;
// WAV input
{
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
if (fname_inp == "-") {
{
uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return 4;
}
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
return 5;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
return 6;
}
if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
return 6;
}
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
return 8;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
return 9;
}
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (params.diarize) {
// convert to stereo, float
pcmf32s.resize(2);
pcmf32s[0].resize(n);
pcmf32s[1].resize(n);
for (uint64_t i = 0; i < n; i++) {
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
}
}
}
// print system information
@ -563,8 +629,6 @@ int main(int argc, char ** argv) {
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
wparams.print_realtime = false;
wparams.print_progress = params.print_progress;
wparams.print_timestamps = !params.no_timestamps;
@ -579,19 +643,11 @@ int main(int argc, char ** argv) {
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
whisper_print_user_data user_data = { &params, &pcmf32s };
@ -626,33 +682,34 @@ int main(int argc, char ** argv) {
// output to text file
if (params.output_txt) {
const auto fname_txt = fname_out + ".txt";
const auto fname_txt = fname_inp + ".txt";
output_txt(ctx, fname_txt.c_str());
}
// output to VTT file
if (params.output_vtt) {
const auto fname_vtt = fname_out + ".vtt";
const auto fname_vtt = fname_inp + ".vtt";
output_vtt(ctx, fname_vtt.c_str());
}
// output to SRT file
if (params.output_srt) {
const auto fname_srt = fname_out + ".srt";
const auto fname_srt = fname_inp + ".srt";
output_srt(ctx, fname_srt.c_str(), params);
}
// output to WTS file
if (params.output_wts) {
const auto fname_wts = fname_out + ".wts";
const auto fname_wts = fname_inp + ".wts";
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
}
// output to CSV file
// output to CSV file
if (params.output_csv) {
const auto fname_csv = fname_out + ".csv";
const auto fname_csv = fname_inp + ".csv";
output_csv(ctx, fname_csv.c_str());
}
}
}

File diff suppressed because it is too large Load Diff

@ -1,7 +1,6 @@
#ifndef WHISPER_H
#define WHISPER_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
@ -41,7 +40,7 @@ extern "C" {
//
// ...
//
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
// struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
@ -74,7 +73,6 @@ extern "C" {
whisper_token tid; // forced timestamp token id
float p; // probability of the token
float plog; // log probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
@ -86,20 +84,9 @@ extern "C" {
float vlen; // voice length of the token
} whisper_token_data;
typedef struct whisper_model_loader {
void * context;
size_t (*read)(void * ctx, void * output, size_t read_size);
bool (*eof)(void * ctx);
void (*close)(void * ctx);
} whisper_model_loader;
// Various functions for loading a ggml whisper model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
// Allocates all memory needed for the model and loads the model from the given file.
// Returns NULL on failure.
WHISPER_API struct whisper_context * whisper_init(const char * path_model);
// Frees all memory allocated by the model.
WHISPER_API void whisper_free(struct whisper_context * ctx);
@ -113,16 +100,6 @@ extern "C" {
int n_samples,
int n_threads);
// Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
// The resulting spectrogram is stored inside the provided whisper context.
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
struct whisper_context* ctx,
const float* samples,
int n_samples,
int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
// n_mel must be 80
@ -147,7 +124,6 @@ extern "C" {
// tokens + n_tokens is the provided context for the decoder.
// n_past is the number of tokens to use from previous decoder calls.
// Returns 0 on success
// TODO: add support for multiple decoders
WHISPER_API int whisper_decode(
struct whisper_context * ctx,
const whisper_token * tokens,
@ -155,6 +131,14 @@ extern "C" {
int n_past,
int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
// You can also implement your own sampling method using the whisper_get_probs() function.
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
@ -164,7 +148,7 @@ extern "C" {
struct whisper_context * ctx,
const char * text,
whisper_token * tokens,
int n_max_tokens);
int n_max_tokens);
// Largest language id (i.e. number of available languages - 1)
WHISPER_API int whisper_lang_max_id();
@ -196,11 +180,8 @@ extern "C" {
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
// Token logits obtained from the last call to whisper_decode()
// The logits for the last token are stored in the last row
// Rows: n_tokens
// Cols: n_vocab
WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
// The probabilities for the next token
WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
@ -229,8 +210,8 @@ extern "C" {
// Available sampling strategies
enum whisper_sampling_strategy {
WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreefyDecoder
WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
WHISPER_SAMPLING_GREEDY, // Always select the most probable token
WHISPER_SAMPLING_BEAM_SEARCH, // TODO: not implemented yet!
};
// Text segment callback
@ -243,16 +224,6 @@ extern "C" {
// If it returns false, the computation is aborted
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
// Logits filter callback
// Can be used to modify the logits before sampling
// If not NULL, called after applying temperature to logits
typedef void (*whisper_logits_filter_callback)(
struct whisper_context * ctx,
const whisper_token_data * tokens,
int n_tokens,
float * logits,
void * user_data);
// Parameters for the whisper_full() function
// If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params()
@ -260,32 +231,30 @@ extern "C" {
enum whisper_sampling_strategy strategy;
int n_threads;
int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
int n_max_text_ctx;
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
bool translate;
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
bool no_context;
bool single_segment; // force single segment output (useful for streaming)
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
bool print_progress; // print progress information
bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
bool print_timestamps; // print timestamps for each text segment when printing realtime
bool print_special;
bool print_progress;
bool print_realtime;
bool print_timestamps;
// [EXPERIMENTAL] token-level timestamps
bool token_timestamps; // enable token-level timestamps
float thold_pt; // timestamp token probability threshold (~0.01)
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
int max_len; // max segment length in characters
bool split_on_word; // split on word rather than on token (when used with max_len)
int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default)
// tokens to provide to the whisper decoder as initial prompt
// tokens to provide the whisper model as initial prompt
// these are prepended to any existing text context from a previous call
const whisper_token * prompt_tokens;
int prompt_n_tokens;
@ -293,42 +262,21 @@ extern "C" {
// for auto-detection, set to nullptr, "" or "auto"
const char * language;
// common decoding parameters:
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
// fallback parameters
// ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
float temperature_inc;
float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
float logprob_thold;
float no_speech_thold; // TODO: not implemented
struct {
int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
int n_past;
} greedy;
struct {
int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
int n_past;
int beam_width;
int n_best;
} beam_search;
// called for every newly generated text segment
whisper_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
// called each time before the encoder starts
whisper_encoder_begin_callback encoder_begin_callback;
void * encoder_begin_callback_user_data;
// called by each decoder to filter obtained logits
whisper_logits_filter_callback logits_filter_callback;
void * logits_filter_callback_user_data;
};
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@ -355,9 +303,6 @@ extern "C" {
// A segment can be a few words, a sentence, or even a paragraph.
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
// Language id associated with the current context
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
// Get the start and end time of the specified segment.
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
@ -379,13 +324,6 @@ extern "C" {
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
////////////////////////////////////////////////////////////////////////////
// Temporary helpers needed for exposing ggml interface
WHISPER_API int whisper_bench_memcpy(int n_threads);
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
#ifdef __cplusplus
}
#endif

@ -301,13 +301,6 @@ struct ggml_cgraph {
int64_t perf_time_us;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
@ -334,8 +327,6 @@ void ggml_free(struct ggml_context * ctx);
size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
@ -740,8 +731,6 @@ int ggml_cpu_has_f16c(void);
int ggml_cpu_has_fp16_va(void);
int ggml_cpu_has_wasm_simd(void);
int ggml_cpu_has_blas(void);
int ggml_cpu_has_sse3(void);
int ggml_cpu_has_vsx(void);
#ifdef __cplusplus
}

@ -9,7 +9,6 @@ if (GGML_ALL_WARNINGS)
-Wcast-qual \
-Wstrict-prototypes \
-Wpointer-arith \
-Wno-unused-function \
")
else()
# todo : windows
@ -18,101 +17,17 @@ endif()
# compiler flags
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (NOT UNAME_S)
execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S)
endif()
if (NOT UNAME_P)
execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P)
endif()
if (NOT UNAME_M)
execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
endif()
message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
if (UNAME_S MATCHES "Darwin")
if (NOT UNAME_P MATCHES "arm")
execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
if (SYSCTL_M MATCHES "1")
#set(UNAME_P "arm")
#set(UNAME_M "arm64")
message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789")
endif()
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
else()
message(STATUS "x86 detected")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
if (UNAME_S MATCHES "Darwin")
execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "AVX1.0")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
if (AVX1_M MATCHES "FMA")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
elseif (UNAME_S MATCHES "Linux")
message(STATUS "Linux detected")
execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "avx2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
if (FMA_M MATCHES "fma")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
if (F16C_M MATCHES "f16c")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
if (SSE3_M MATCHES "sse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
endif()
message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
elseif (UNAME_S MATCHES "Haiku")
message(STATUS "Haiku detected")
execute_process(COMMAND sysinfo -cpu | grep "AVX " OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
execute_process(COMMAND sysinfo -cpu | grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "avx2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
execute_process(COMMAND sysinfo -cpu | grep "FMA " OUTPUT_VARIABLE FMA_M)
if (FMA_M MATCHES "fma")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
execute_process(COMMAND sysinfo -cpu | grep "F16C " OUTPUT_VARIABLE F16C_M)
if (F16C_M MATCHES "f16c")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
endif()
@ -147,11 +62,7 @@ target_include_directories(${TARGET} PUBLIC
../include/ggml
)
if (MSVC)
target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
else()
target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif()
target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if (BUILD_SHARED_LIBS)
target_link_libraries(${TARGET} PUBLIC

File diff suppressed because it is too large Load Diff

@ -65,14 +65,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
endif()
#
# test-mul-mat2
set(TEST_TARGET test-mul-mat2)
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
#
# test0
@ -104,15 +96,3 @@ set(TEST_TARGET test3)
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
#
# test-svd0 (arm)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
set(TEST_TARGET test-svd0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
endif()

@ -13,20 +13,14 @@
#include <Accelerate/Accelerate.h>
const int M = 1280;
const int N = 1536;
const int N = 1500;
const int K = 1280;
uint64_t get_time_us() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
//
// naive implementation
//
void mul_mat_f32_0(
void mul_mat_vec_f32_0(
const float * restrict src0, // M x K
const float * restrict src1, // N x K (transposed)
float * dst,
@ -42,7 +36,7 @@ void mul_mat_f32_0(
}
}
void mul_mat_f16_0(
void mul_mat_vec_f16_0(
const __fp16 * src0,
const __fp16 * src1,
float * dst,
@ -108,7 +102,7 @@ void mul_mat_f16_0(
}
// blocking with block size 32
void mul_mat_f16_1(
void mul_mat_vec_f16_1(
const __fp16 * src0,
const __fp16 * src1,
float * dst,
@ -180,7 +174,7 @@ void mul_mat_f16_1(
}
void mul_mat_f8_0(
void mul_mat_vec_f8_0(
const uint8_t * src0,
const uint8_t * src1,
float * dst,
@ -212,6 +206,12 @@ void mul_mat_f8_0(
}
}
uint64_t get_time_us() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
int main(int argc, const char ** argv) {
float * src0 = (float *)malloc(sizeof(float)*M*K);
float * src1 = (float *)malloc(sizeof(float)*N*K);
@ -258,7 +258,7 @@ int main(int argc, const char ** argv) {
method = atoi(argv[1]);
}
const int nIter = 1;
const int nIter = 10000;
const clock_t start = clock();
const uint64_t start_us = get_time_us();
@ -267,24 +267,24 @@ int main(int argc, const char ** argv) {
double sum = 0.0f;
for (int i = 0; i < nIter; i++) {
if (method == 0) {
mul_mat_f32_0(src0, src1, dst, M, N, K);
mul_mat_vec_f32_0(src0, src1, dst, M, N, K);
}
if (method == 1) {
mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
}
if (method == 2) {
mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
}
if (method == 3) {
mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
mul_mat_vec_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
}
if (method == 4) {
// Use BLAS sgemm from Accelerate framework
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0f, src0, K, src1, K, 0.0f, dst, N);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, K, N, 1.0f, src0, N, src1, N, 0.0f, dst, N);
}
}

@ -1,475 +0,0 @@
// quantized matrix multiplication
#include "ggml.h"
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <sys/time.h>
#ifdef __ARM_NEON
#include "arm_neon.h"
#endif
#ifndef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
const int M = 1280;
const int N = 1536;
const int K = 1280;
const int QK = 64;
#define QB 7
//#define GGML_GQ_USE_FP16_SCALE
#if defined(GGML_GQ_USE_FP16_SCALE)
#define gq_scale_t ggml_fp16_t
#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
#else
#define gq_scale_t float
#define GGML_FP32_TO_GQ(x) (x)
#define GGML_GQ_TO_FP32(x) (x)
#endif
#define gq_quant_t uint64_t
#define gq_t_bits 64
uint64_t get_time_us() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
//
// naive implementation
//
void mul_mat_f32_naive(
const float * restrict src0, // M x K
const float * restrict src1, // N x K (transposed)
float * dst,
int m, int n, int k) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
float sum = 0;
for (int l = 0; l < k; l++) {
sum += src0[i*k + l] * src1[j*k + l];
}
dst[i*n + j] = sum;
}
}
}
//
// method 1
//
void quantize_1(const float * src, void * dst, int n, int k) {
char * p0 = dst;
gq_quant_t pp[QB];
for (int j = 0; j < n; j++) {
for (int i = 0; i < k/QK; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
// find min/max
#ifdef __ARM_NEON
{
float32x4_t minv = vdupq_n_f32(FLT_MAX);
float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
for (int l = 0; l < QK; l += 4) {
float32x4_t v = vld1q_f32(src + j*k + i*QK + l);
minv = vminq_f32(minv, v);
maxv = vmaxq_f32(maxv, v);
}
float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
//printf("SIMD min/max: %f %f\n", min, max);
}
#else
{
for (int l = 0; l < QK; l++) {
const float v = src[j*k + i*QK + l];
if (v < min) min = v;
if (v > max) max = v;
}
//printf("NORM min/max: %f %f\n", min, max);
}
#endif
const float d = (max - min) / ((1 << QB) - 1);
const float id = d ? 1.0/d : 0.0;
memcpy(p0, &min, sizeof(float)); p0 += sizeof(float);
memcpy(p0, &d, sizeof(float)); p0 += sizeof(float);
//printf("min/max/d/id: %f %f %f %f\n", min, max, d, id);
for (int s = 0; s < QK/gq_t_bits; ++s) {
memset(pp, 0, sizeof(pp));
for (int l = 0; l < gq_t_bits; l++) {
const float v = src[j*k + i*QK + s*gq_t_bits + l];
const uint8_t q = (v - min)*id;
for (int b = 0; b < QB; b++) {
pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
}
}
for (int b = 0; b < QB; b++) {
memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t);
}
}
}
}
}
void mul_mat_gq_1(
const void * src0,
const void * src1,
float * dst,
int m, int n, int k) {
const int kp = k & ~(gq_t_bits - 1);
const char * restrict p0 = src0;
const char * restrict p1 = src1;
float s0[QB + 1];
float s1[QB + 1];
gq_quant_t m0[QB + 1];
gq_quant_t m1[QB + 1];
for (int ir0 = 0; ir0 < m; ir0++) {
for (int ir1 = 0; ir1 < n; ir1++) {
float sumf = 0.0;
const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
for (int i = 0; i < kp/QK; i++) {
float min0, d0;
memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float);
memcpy(&d0, pp0, sizeof(float)); pp0 += sizeof(float);
float min1, d1;
memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float);
memcpy(&d1, pp1, sizeof(float)); pp1 += sizeof(float);
//printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1);
#if 1
// >>> General case for any QB
s0[0] = min0;
s1[0] = min1;
for (int b = 0; b < QB; b++) {
s0[b + 1] = d0*(1 << b);
s1[b + 1] = d1*(1 << b);
}
m0[0] = -1ULL;
m1[0] = -1ULL;
for (int s = 0; s < QK/gq_t_bits; ++s) {
for (int b = 0; b < QB; b++) {
memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t);
memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t);
}
for (int q0 = 0; q0 < QB + 1; q0++) {
for (int q1 = 0; q1 < QB + 1; q1++) {
sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]);
}
}
}
#else
#endif
}
dst[ir0*n + ir1] = sumf;
}
}
}
//
// method 2
//
static inline int quantize_2_blocks_per_row(int k) {
return k/QK;
}
static inline int quantize_2_quants_per_block() {
return QK/gq_t_bits;
}
static inline int quantize_2_row_size(int k) {
const int nb = quantize_2_blocks_per_row(k);
const int nq = quantize_2_quants_per_block();
return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
}
void quantize_2_row(const float * restrict src, void * restrict dst, int k) {
assert(k % QK == 0);
const int nb = quantize_2_blocks_per_row(k);
const int nq = quantize_2_quants_per_block();
gq_scale_t * restrict pm = (gq_scale_t *) (dst);
gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
gq_quant_t pp[QB];
for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
for (int l = 0; l < QK; l++) {
const float v = src[i*QK + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << QB) - 1);
const float id = d ? 1.0/d : 0.0;
pm[i] = GGML_FP32_TO_GQ(min);
pd[i] = GGML_FP32_TO_GQ(d);
for (int s = 0; s < nq; ++s) {
memset(pp, 0, sizeof(pp));
for (int l = 0; l < gq_t_bits; l++) {
const float v = src[i*QK + s*gq_t_bits + l];
const uint8_t q = (v - min)*id;
for (int b = 0; b < QB; b++) {
pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
}
}
for (int b = 0; b < QB; b++) {
pb[i*nq*QB + s*QB + b] = pp[b];
}
}
}
}
// reimplementation of quantize_2 using quantize_2_row
void quantize_2(const float * restrict src, char * restrict dst, int n, int k) {
assert(k % QK == 0);
for (int j = 0; j < n; j++) {
quantize_2_row(src + j*k, dst, k);
dst = (char *) dst + quantize_2_row_size(k);
}
}
void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
float sumf[(QB + 1)*(QB + 1)];
memset(sumf, 0, sizeof(sumf));
const int nb = quantize_2_blocks_per_row(n);
const int nq = quantize_2_quants_per_block();
const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
const gq_scale_t * restrict pd0 = pm0 + nb;
const gq_scale_t * restrict pd1 = pm1 + nb;
const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
#if 1
float s0[QB + 1];
float s1[QB + 1];
for (int i = 0; i < nb; i++) {
const float m0 = GGML_GQ_TO_FP32(pm0[i]);
const float d0 = GGML_GQ_TO_FP32(pd0[i]);
const float m1 = GGML_GQ_TO_FP32(pm1[i]);
const float d1 = GGML_GQ_TO_FP32(pd1[i]);
s0[0] = m0;
s1[0] = m1;
for (int b = 0; b < QB; b++) {
s0[b + 1] = d0*(1 << b);
s1[b + 1] = d1*(1 << b);
}
for (int s = 0; s < nq; ++s) {
for (int q0 = 0; q0 < QB + 1; q0++) {
const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL;
for (int q1 = 0; q1 < QB + 1; q1++) {
const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL;
sumf[q0*(QB + 1) + q1] += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1);
}
}
}
}
#else
// SIMD-ify with the assumptions:
// - nb is a multiple of 4
// - gq_scale_t is float
// - gq_quant_t is uint64_t
// - QB == 7
assert(nb % 4 == 0);
#ifdef __ARM_NEON
#else
// TODO
#endif
#endif
for (int q0 = 0; q0 < QB + 1; q0++) {
for (int q1 = 1; q1 < QB + 1; q1++) {
sumf[q0*(QB + 1)] += sumf[q0*(QB + 1) + q1];
}
}
*s = sumf[0];
for (int q0 = 1; q0 < QB + 1; q0++) {
*s += sumf[q0*(QB + 1)];
}
}
// use vec_dot_gq_2 to compute the dot product of two rows
void mul_mat_gq_2(
const void * src0,
const void * src1, // transposed
float * dst,
int m, int n, int k) {
assert(k % QK == 0);
const int nb = quantize_2_blocks_per_row(k);
const int nq = quantize_2_quants_per_block();
for (int ir0 = 0; ir0 < m; ir0++) {
for (int ir1 = 0; ir1 < n; ir1++) {
vec_dot_gq_2(k, dst + ir1, src0, src1);
src1 = (const char *) src1 + quantize_2_row_size(k);
}
src0 = (const char *) src0 + quantize_2_row_size(k);
src1 = (const char *) src1 - n*quantize_2_row_size(k);
dst = (float *) dst + n;
}
}
int main(int argc, const char ** argv) {
assert(sizeof(gq_quant_t)*8 == gq_t_bits);
float * src0 = (float *)malloc(sizeof(float)*M*K);
float * src1 = (float *)malloc(sizeof(float)*N*K);
float * dst = (float *)malloc(sizeof(float)*M*N);
for (int i = 0; i < M*K; i++) {
src0[i] = rand() / (float)RAND_MAX;
}
for (int i = 0; i < N*K; i++) {
src1[i] = rand() / (float)RAND_MAX;
}
void * src0_gq = calloc(1, quantize_2_row_size(K)*M);
void * src1_gq = calloc(1, quantize_2_row_size(K)*N);
const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K;
const size_t sizegq = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N;
printf("compression: %f\n", (float)sizegq/sizef16);
int method = 0;
if (argc > 1) {
method = atoi(argv[1]);
}
// convert fp32 -> gq
{
const uint64_t t_start = get_time_us();
if (method == 1) {
quantize_1(src0, src0_gq, M, K);
quantize_1(src1, src1_gq, N, K);
}
if (method == 2) {
quantize_2(src0, src0_gq, M, K);
quantize_2(src1, src1_gq, N, K);
}
const uint64_t t_end = get_time_us();
printf("convert time: %f ms / method = %d\n", (t_end - t_start) / 1000.0, method);
}
const int nIter = 1;
const clock_t start = clock();
const uint64_t start_us = get_time_us();
double iM = 1.0/M;
double sum = 0.0f;
for (int i = 0; i < nIter; i++) {
if (method == 0) {
mul_mat_f32_naive(src0, src1, dst, M, N, K);
}
if (method == 1) {
mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K);
}
if (method == 2) {
mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K);
}
}
for (int i = 0; i < N; i++) {
sum += dst[i]*iM;
}
{
const clock_t end = clock();
const uint64_t end_us = get_time_us();
printf("%s: elapsed ticks: %ld\n", __func__, end - start);
printf("%s: elapsed us: %d / %f ms\n", __func__, (int)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter);
}
printf("%f\n", sum);
free(src0);
free(src1);
free(dst);
free(src0_gq);
free(src1_gq);
return 0;
}

@ -1,218 +0,0 @@
// SVD dimensionality reduction
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <sys/time.h>
#ifdef GGML_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif
float frand() {
return (float) rand() / (float) RAND_MAX;
}
//int sgesvd_(char *__jobu, char *__jobvt, __CLPK_integer *__m,
// __CLPK_integer *__n, __CLPK_real *__a, __CLPK_integer *__lda,
// __CLPK_real *__s, __CLPK_real *__u, __CLPK_integer *__ldu,
// __CLPK_real *__vt, __CLPK_integer *__ldvt, __CLPK_real *__work,
// __CLPK_integer *__lwork,
// __CLPK_integer *__info)
int main(int argc, const char ** argv) {
int m = 10;
int n = 5;
float * A = (float *) malloc(n * m * sizeof(float));
float * A0 = (float *) malloc(n * m * sizeof(float));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
A[i * m + j] = (float) (10.0f*(i + 1) + 1.0f * frand());
//A[i * m + j] = (float) (10.0f*(i%2 + 1) + 0.1f * frand());
//if (i == 2) {
// A[i * m + j] += 20*frand();
//}
if ((i == 1 || i == 3) && j > m/2) {
A[i * m + j] = -A[i * m + j];
}
}
}
// average vector
//float * M = (float *) malloc(m * sizeof(float));
//{
// for (int j = 0; j < m; ++j) {
// M[j] = 0.0f;
// }
// for (int i = 0; i < n; ++i) {
// for (int j = 0; j < m; ++j) {
// M[j] += A[i * m + j];
// }
// }
// for (int j = 0; j < m; ++j) {
// M[j] /= (float) n;
// }
//}
//// subtract average vector
//for (int i = 0; i < n; ++i) {
// for (int j = 0; j < m; ++j) {
// A[i * m + j] -= M[j];
// }
//}
memcpy(A0, A, n * m * sizeof(float));
// print A
printf("A:\n");
for (int i = 0; i < n; ++i) {
printf("col %d : ", i);
for (int j = 0; j < m; ++j) {
printf("%9.5f ", A[i * m + j]);
}
printf("\n");
}
printf("\n");
// SVD
// A = U * S * V^T
float * U = (float *) malloc(n * m * sizeof(float));
float * S = (float *) malloc(n * sizeof(float));
float * V = (float *) malloc(n * n * sizeof(float));
int lda = m;
int ldu = m;
int ldvt = n;
float work_size;
int lwork = -1;
int info = 0;
sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info);
lwork = (int) work_size;
printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork);
float * work = (float *) malloc(lwork * sizeof(float));
sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info);
// print U
printf("U:\n");
for (int i = 0; i < n; ++i) {
printf("col %d : ", i);
for (int j = 0; j < m; ++j) {
printf("%9.5f ", U[i * m + j]);
}
printf("\n");
}
printf("\n");
// normalize S
{
double sum = 0.0;
for (int i = 0; i < n; ++i) {
sum += S[i];
}
sum *= sqrt((double) m);
for (int i = 0; i < n; ++i) {
S[i] /= sum;
}
}
// print S
printf("S:\n");
for (int i = 0; i < n; ++i) {
printf("- %d = %9.5f\n", i, S[i]);
}
printf("\n");
// print V
printf("V:\n");
for (int i = 0; i < n; ++i) {
printf("col %d : ", i);
for (int j = 0; j < n; ++j) {
printf("%9.5f ", V[i * n + j]);
}
printf("\n");
}
printf("\n");
// print A
printf("A:\n");
for (int i = 0; i < n; ++i) {
printf("col %d : ", i);
for (int j = 0; j < m; ++j) {
printf("%9.5f ", A[i * m + j]);
}
printf("\n");
}
printf("\n");
// compute singular vectors in U
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
U[i * m + j] *= S[i];
}
}
// normalize U
for (int i = 0; i < n; ++i) {
double sum = 0.0;
for (int j = 0; j < m; ++j) {
sum += U[i * m + j] * U[i * m + j];
}
sum = sqrt(sum);
for (int j = 0; j < m; ++j) {
U[i * m + j] /= sum*sqrt((double) m);
}
}
// print U
printf("U:\n");
for (int i = 0; i < n; ++i) {
printf("col %d : ", i);
for (int j = 0; j < m; ++j) {
printf("%9.5f ", U[i * m + j]);
}
printf("\n");
}
printf("\n");
// project A0 onto U
float * A1 = (float *) malloc(n * n * sizeof(float));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
A1[i * n + j] = 0.0f;
for (int k = 0; k < m; ++k) {
A1[i * n + j] += A0[i * m + k] * U[j * m + k];
}
}
}
// print A1
printf("A1:\n");
for (int i = 0; i < n; ++i) {
printf("col %d : ", i);
for (int j = 0; j < n; ++j) {
printf("%9.5f ", A1[i * n + j]);
}
printf("\n");
}
printf("\n");
return 0;
}

@ -96,8 +96,8 @@ int main(int argc, const char ** argv) {
enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);
assert(res == GGML_OPT_OK);
assert(is_close(ggml_get_f32_1d(t0, 0), 5.0f, 1e-2f));
assert(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-2f));
assert(is_close(ggml_get_f32_1d(t0, 0), 5.0f, 1e-3f));
assert(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-3f));
}
{

Loading…
Cancel
Save