|
|
@ -424,6 +424,9 @@ struct whisper_context {
|
|
|
|
int64_t t_last;
|
|
|
|
int64_t t_last;
|
|
|
|
whisper_token tid_last;
|
|
|
|
whisper_token tid_last;
|
|
|
|
std::vector<float> energy; // PCM signal energy
|
|
|
|
std::vector<float> energy; // PCM signal energy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// [EXPERIMENTAL] speed-up techniques
|
|
|
|
|
|
|
|
int32_t exp_n_audio_ctx; // 0 - use default
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// load the model from a ggml file
|
|
|
|
// load the model from a ggml file
|
|
|
@ -515,15 +518,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
|
|
wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
|
|
|
|
wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
|
|
|
|
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
|
|
|
|
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
|
|
|
|
wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
|
|
|
|
wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
|
|
|
|
|
|
|
|
|
|
|
|
// this is the total memory required to run the inference
|
|
|
|
|
|
|
|
const size_t mem_required =
|
|
|
|
|
|
|
|
wctx.buf_model->size() +
|
|
|
|
|
|
|
|
wctx.buf_memory.size() +
|
|
|
|
|
|
|
|
wctx.buf_compute.size() +
|
|
|
|
|
|
|
|
wctx.buf_compute_layer.size();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// load mel filters
|
|
|
|
// load mel filters
|
|
|
@ -596,11 +590,21 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
// this is the total memory required to run the inference
|
|
|
|
|
|
|
|
const size_t mem_required =
|
|
|
|
|
|
|
|
wctx.buf_model->size() +
|
|
|
|
|
|
|
|
wctx.buf_memory.size() +
|
|
|
|
|
|
|
|
wctx.buf_compute.size() +
|
|
|
|
|
|
|
|
wctx.buf_compute_layer.size();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: mem_required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// for the big tensors, we have the option to store the data in 16-bit floats
|
|
|
|
// for the big tensors, we have the option to store the data in 16-bit floats
|
|
|
|
// in order to save memory and also to speed up the computation
|
|
|
|
// in order to save memory and also to speed up the computation
|
|
|
|
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
|
|
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t ctx_size = 0;
|
|
|
|
size_t ctx_size = 0;
|
|
|
|
size_t ctx_mem_size = 0;
|
|
|
|
size_t ctx_mem_size = 0;
|
|
|
|
|
|
|
|
|
|
|
@ -719,7 +723,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
|
|
|
|
|
|
|
|
|
|
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
|
|
|
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
|
|
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// create the ggml context
|
|
|
|
// create the ggml context
|
|
|
@ -980,7 +984,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
|
|
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
|
|
|
|
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
|
|
|
|
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
|
|
|
|
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: memory size = %8.2f MB\n", __func__, memory_size/1024.0/1024.0);
|
|
|
|
fprintf(stderr, "%s: memory size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// load weights
|
|
|
|
// load weights
|
|
|
@ -1039,12 +1043,12 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
|
|
|
|
|
|
|
|
|
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
|
|
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
|
|
|
|
|
|
|
|
|
|
|
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
|
|
//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
|
|
total_size += ggml_nbytes(tensor);
|
|
|
|
total_size += ggml_nbytes(tensor);
|
|
|
|
model.n_loaded++;
|
|
|
|
model.n_loaded++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
|
|
|
fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
|
|
|
|
|
|
|
|
|
|
|
|
if (model.n_loaded == 0) {
|
|
|
|
if (model.n_loaded == 0) {
|
|
|
|
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
|
|
|
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
|
|
@ -1076,13 +1080,11 @@ static bool whisper_encode(
|
|
|
|
const auto & mel_inp = wctx.mel;
|
|
|
|
const auto & mel_inp = wctx.mel;
|
|
|
|
const auto & hparams = model.hparams;
|
|
|
|
const auto & hparams = model.hparams;
|
|
|
|
|
|
|
|
|
|
|
|
const int n_ctx = hparams.n_audio_ctx;
|
|
|
|
const int n_ctx = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
|
|
|
|
const int n_state = hparams.n_audio_state;
|
|
|
|
const int n_state = hparams.n_audio_state;
|
|
|
|
const int n_head = hparams.n_audio_head;
|
|
|
|
const int n_head = hparams.n_audio_head;
|
|
|
|
const int n_layer = hparams.n_audio_layer;
|
|
|
|
const int n_layer = hparams.n_audio_layer;
|
|
|
|
|
|
|
|
|
|
|
|
const int N = n_ctx;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int n_mels = hparams.n_mels;
|
|
|
|
const int n_mels = hparams.n_mels;
|
|
|
|
assert(mel_inp.n_mel == n_mels);
|
|
|
|
assert(mel_inp.n_mel == n_mels);
|
|
|
|
|
|
|
|
|
|
|
@ -1132,7 +1134,30 @@ static bool whisper_encode(
|
|
|
|
cur = ggml_gelu(ctx0, cur);
|
|
|
|
cur = ggml_gelu(ctx0, cur);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
|
|
|
|
// ===================================================================
|
|
|
|
|
|
|
|
// NOTE: experimenting with partial evaluation of the encoder (ignore)
|
|
|
|
|
|
|
|
//static int iter = -1;
|
|
|
|
|
|
|
|
//const int n_iter = 1500/n_ctx;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//iter = (iter + 1) % n_iter;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//if (iter == 0) {
|
|
|
|
|
|
|
|
// memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
|
|
|
|
|
|
|
|
// memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
|
|
|
|
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int iter = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
|
|
|
|
|
|
|
|
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
|
|
|
|
|
|
|
|
// ===================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// original:
|
|
|
|
|
|
|
|
//cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * inpL = cur;
|
|
|
|
struct ggml_tensor * inpL = cur;
|
|
|
|
|
|
|
|
|
|
|
@ -1198,14 +1223,14 @@ static bool whisper_encode(
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
Qcur,
|
|
|
|
Qcur,
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
|
|
|
|
0, 2, 1, 3);
|
|
|
|
0, 2, 1, 3);
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * K =
|
|
|
|
struct ggml_tensor * K =
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
Kcur,
|
|
|
|
Kcur,
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
|
|
|
|
0, 2, 1, 3);
|
|
|
|
0, 2, 1, 3);
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * V =
|
|
|
|
struct ggml_tensor * V =
|
|
|
@ -1213,9 +1238,9 @@ static bool whisper_encode(
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_reshape_3d(ctxL,
|
|
|
|
ggml_reshape_3d(ctxL,
|
|
|
|
Vcur,
|
|
|
|
Vcur,
|
|
|
|
n_state/n_head, n_head, N),
|
|
|
|
n_state/n_head, n_head, n_ctx),
|
|
|
|
1, 2, 0, 3),
|
|
|
|
1, 2, 0, 3),
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, N, n_state/n_head, n_head)
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_ctx, n_state/n_head, n_head)
|
|
|
|
);
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
|
|
|
|
struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
|
|
|
@ -1224,14 +1249,14 @@ static bool whisper_encode(
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
Qcur,
|
|
|
|
Qcur,
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
|
|
|
|
0, 2, 1, 3);
|
|
|
|
0, 2, 1, 3);
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * K =
|
|
|
|
struct ggml_tensor * K =
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
ggml_cpy(ctxL,
|
|
|
|
Kcur,
|
|
|
|
Kcur,
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
|
|
|
|
0, 2, 1, 3);
|
|
|
|
0, 2, 1, 3);
|
|
|
|
|
|
|
|
|
|
|
|
// K * Q
|
|
|
|
// K * Q
|
|
|
@ -1249,7 +1274,7 @@ static bool whisper_encode(
|
|
|
|
// ggml_permute(ctxL,
|
|
|
|
// ggml_permute(ctxL,
|
|
|
|
// ggml_cpy(ctxL,
|
|
|
|
// ggml_cpy(ctxL,
|
|
|
|
// Vcur,
|
|
|
|
// Vcur,
|
|
|
|
// ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
|
|
|
|
// ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
|
|
|
|
// 1, 2, 0, 3);
|
|
|
|
// 1, 2, 0, 3);
|
|
|
|
|
|
|
|
|
|
|
|
//struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
|
|
|
|
//struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
|
|
|
@ -1259,9 +1284,9 @@ static bool whisper_encode(
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_permute(ctxL,
|
|
|
|
ggml_reshape_3d(ctxL,
|
|
|
|
ggml_reshape_3d(ctxL,
|
|
|
|
Vcur,
|
|
|
|
Vcur,
|
|
|
|
n_state/n_head, n_head, N),
|
|
|
|
n_state/n_head, n_head, n_ctx),
|
|
|
|
0, 2, 1, 3),
|
|
|
|
0, 2, 1, 3),
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, N, n_head)
|
|
|
|
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_ctx, n_head)
|
|
|
|
);
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
|
|
|
|
struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
|
|
|
@ -1271,7 +1296,7 @@ static bool whisper_encode(
|
|
|
|
|
|
|
|
|
|
|
|
cur = ggml_cpy(ctxL,
|
|
|
|
cur = ggml_cpy(ctxL,
|
|
|
|
KQV_merged,
|
|
|
|
KQV_merged,
|
|
|
|
ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
|
|
|
|
ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, n_ctx));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// projection
|
|
|
|
// projection
|
|
|
@ -1425,6 +1450,8 @@ static bool whisper_encode(
|
|
|
|
Vcross),
|
|
|
|
Vcross),
|
|
|
|
Vcross);
|
|
|
|
Vcross);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
|
|
|
|
|
|
|
|
//struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
|
|
|
|
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*n_ctx));
|
|
|
|
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*n_ctx));
|
|
|
|
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*n_ctx));
|
|
|
|
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*n_ctx));
|
|
|
|
|
|
|
|
|
|
|
@ -1474,7 +1501,7 @@ static bool whisper_decode(
|
|
|
|
const int n_layer = hparams.n_text_layer;
|
|
|
|
const int n_layer = hparams.n_text_layer;
|
|
|
|
|
|
|
|
|
|
|
|
const int N = n_tokens;
|
|
|
|
const int N = n_tokens;
|
|
|
|
const int M = hparams.n_audio_ctx;
|
|
|
|
const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_init_params params = {
|
|
|
|
struct ggml_init_params params = {
|
|
|
|
.mem_size = wctx.buf_compute.size(),
|
|
|
|
.mem_size = wctx.buf_compute.size(),
|
|
|
@ -1819,7 +1846,9 @@ static bool whisper_decode(
|
|
|
|
// the most basic sampling scheme - select the top token
|
|
|
|
// the most basic sampling scheme - select the top token
|
|
|
|
static whisper_token_data whisper_sample_best(
|
|
|
|
static whisper_token_data whisper_sample_best(
|
|
|
|
const whisper_vocab & vocab,
|
|
|
|
const whisper_vocab & vocab,
|
|
|
|
const float * probs) {
|
|
|
|
const float * probs,
|
|
|
|
|
|
|
|
bool force_timestamp,
|
|
|
|
|
|
|
|
bool is_initial) {
|
|
|
|
whisper_token_data result = {
|
|
|
|
whisper_token_data result = {
|
|
|
|
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
|
|
|
|
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
|
|
|
|
};
|
|
|
|
};
|
|
|
@ -1842,7 +1871,18 @@ static whisper_token_data whisper_sample_best(
|
|
|
|
max_tx = std::max(max_tx, probs_id[i].first);
|
|
|
|
max_tx = std::max(max_tx, probs_id[i].first);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = vocab.token_beg; i < n_logits; i++) {
|
|
|
|
const auto i0 = is_initial ? vocab.token_beg + 101 : vocab.token_beg;
|
|
|
|
|
|
|
|
const auto i1 = is_initial ? vocab.token_beg + 101 : n_logits;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// the initial timestamp cannot be larger than 100
|
|
|
|
|
|
|
|
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429
|
|
|
|
|
|
|
|
if (is_initial) {
|
|
|
|
|
|
|
|
for (int i = i0; i < n_logits; ++ i) {
|
|
|
|
|
|
|
|
probs_id[i].first = -INFINITY;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = vocab.token_beg; i < i1; i++) {
|
|
|
|
sum_ts += probs_id[i].first;
|
|
|
|
sum_ts += probs_id[i].first;
|
|
|
|
if (probs_id[i].first > max_ts) {
|
|
|
|
if (probs_id[i].first > max_ts) {
|
|
|
|
max_ts = probs_id[i].first;
|
|
|
|
max_ts = probs_id[i].first;
|
|
|
@ -1852,7 +1892,7 @@ static whisper_token_data whisper_sample_best(
|
|
|
|
|
|
|
|
|
|
|
|
// if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a
|
|
|
|
// if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a
|
|
|
|
// timestamp token
|
|
|
|
// timestamp token
|
|
|
|
if (sum_ts > max_tx) {
|
|
|
|
if (sum_ts > max_tx || force_timestamp) {
|
|
|
|
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438
|
|
|
|
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438
|
|
|
|
for (int i = 0; i < vocab.token_beg; i++) {
|
|
|
|
for (int i = 0; i < vocab.token_beg; i++) {
|
|
|
|
probs_id[i].first = -INFINITY;
|
|
|
|
probs_id[i].first = -INFINITY;
|
|
|
@ -1894,39 +1934,6 @@ static whisper_token_data whisper_sample_best(
|
|
|
|
return result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// samples only from the timestamps tokens
|
|
|
|
|
|
|
|
static whisper_vocab::id whisper_sample_timestamp(
|
|
|
|
|
|
|
|
const whisper_vocab & vocab,
|
|
|
|
|
|
|
|
const float * probs) {
|
|
|
|
|
|
|
|
int n_logits = vocab.id_to_token.size();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::pair<double, whisper_vocab::id>> probs_id;
|
|
|
|
|
|
|
|
probs_id.reserve(n_logits);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = vocab.token_beg + 1; i < n_logits; i++) {
|
|
|
|
|
|
|
|
probs_id.push_back(std::make_pair(probs[i], i));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int top_k = 10;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// find the top K tokens
|
|
|
|
|
|
|
|
std::partial_sort(
|
|
|
|
|
|
|
|
probs_id.begin(),
|
|
|
|
|
|
|
|
probs_id.begin() + top_k, probs_id.end(),
|
|
|
|
|
|
|
|
[](const std::pair<double, whisper_vocab::id> & a, const std::pair<double, whisper_vocab::id> & b) {
|
|
|
|
|
|
|
|
return a.first > b.first;
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
probs_id.resize(top_k);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//printf("\n");
|
|
|
|
|
|
|
|
//for (int i = 0; i < (int) probs_id.size(); i++) {
|
|
|
|
|
|
|
|
// printf("%d: '%s' %f, %d\n", i, vocab.id_to_token.at(probs_id[i].second).c_str(), probs_id[i].first, probs_id[i].second);
|
|
|
|
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return probs_id[0].second;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 500 -> 00:05.000
|
|
|
|
// 500 -> 00:05.000
|
|
|
|
// 6000 -> 01:00.000
|
|
|
|
// 6000 -> 01:00.000
|
|
|
|
static std::string to_timestamp(int64_t t, bool comma = false) {
|
|
|
|
static std::string to_timestamp(int64_t t, bool comma = false) {
|
|
|
@ -2031,6 +2038,7 @@ static bool log_mel_spectrogram(
|
|
|
|
const int n_mel,
|
|
|
|
const int n_mel,
|
|
|
|
const int n_threads,
|
|
|
|
const int n_threads,
|
|
|
|
const whisper_filters & filters,
|
|
|
|
const whisper_filters & filters,
|
|
|
|
|
|
|
|
const bool speed_up,
|
|
|
|
whisper_mel & mel) {
|
|
|
|
whisper_mel & mel) {
|
|
|
|
|
|
|
|
|
|
|
|
// Hanning window
|
|
|
|
// Hanning window
|
|
|
@ -2044,7 +2052,7 @@ static bool log_mel_spectrogram(
|
|
|
|
mel.n_len = (n_samples)/fft_step;
|
|
|
|
mel.n_len = (n_samples)/fft_step;
|
|
|
|
mel.data.resize(mel.n_mel*mel.n_len);
|
|
|
|
mel.data.resize(mel.n_mel*mel.n_len);
|
|
|
|
|
|
|
|
|
|
|
|
const int n_fft = 1 + fft_size/2;
|
|
|
|
const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);
|
|
|
|
|
|
|
|
|
|
|
|
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
|
|
|
|
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
|
|
|
|
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
|
|
|
|
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
|
|
|
@ -2091,6 +2099,13 @@ static bool log_mel_spectrogram(
|
|
|
|
//}
|
|
|
|
//}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (speed_up) {
|
|
|
|
|
|
|
|
// scale down in the frequency domain results in a speed up in the time domain
|
|
|
|
|
|
|
|
for (int j = 0; j < n_fft; j++) {
|
|
|
|
|
|
|
|
fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// mel spectrogram
|
|
|
|
// mel spectrogram
|
|
|
|
for (int j = 0; j < mel.n_mel; j++) {
|
|
|
|
for (int j = 0; j < mel.n_mel; j++) {
|
|
|
|
double sum = 0.0;
|
|
|
|
double sum = 0.0;
|
|
|
@ -2161,6 +2176,12 @@ struct whisper_context * whisper_init(const char * path_model) {
|
|
|
|
|
|
|
|
|
|
|
|
void whisper_free(struct whisper_context * ctx) {
|
|
|
|
void whisper_free(struct whisper_context * ctx) {
|
|
|
|
if (ctx) {
|
|
|
|
if (ctx) {
|
|
|
|
|
|
|
|
if (ctx->model.ctx) {
|
|
|
|
|
|
|
|
ggml_free(ctx->model.ctx);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ctx->model.ctx_mem) {
|
|
|
|
|
|
|
|
ggml_free(ctx->model.ctx_mem);
|
|
|
|
|
|
|
|
}
|
|
|
|
if (ctx->buf_model) {
|
|
|
|
if (ctx->buf_model) {
|
|
|
|
delete ctx->buf_model;
|
|
|
|
delete ctx->buf_model;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -2171,7 +2192,21 @@ void whisper_free(struct whisper_context * ctx) {
|
|
|
|
int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
|
|
|
int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
|
|
|
const int64_t t_start_us = ggml_time_us();
|
|
|
|
const int64_t t_start_us = ggml_time_us();
|
|
|
|
|
|
|
|
|
|
|
|
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, ctx->mel)) {
|
|
|
|
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) {
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ctx->t_mel_us = ggml_time_us() - t_start_us;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
|
|
|
|
|
|
|
|
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
|
|
|
|
|
|
|
const int64_t t_start_us = ggml_time_us();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) {
|
|
|
|
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
|
|
|
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -2229,19 +2264,17 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
|
|
|
|
struct whisper_token_data whisper_sample_best(struct whisper_context * ctx) {
|
|
|
|
struct whisper_token_data whisper_sample_best(struct whisper_context * ctx) {
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
|
|
|
|
|
|
|
|
// TODO: simplify
|
|
|
|
const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), false, false);
|
|
|
|
auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
|
|
|
|
|
|
|
|
|
|
return res;
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
whisper_token whisper_sample_timestamp(struct whisper_context * ctx) {
|
|
|
|
struct whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial) {
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
|
|
|
|
|
|
|
|
// TODO: simplify
|
|
|
|
const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), true, is_initial);
|
|
|
|
auto res = whisper_sample_timestamp(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
|
|
|
|
|
|
|
|
|
@ -2305,11 +2338,11 @@ whisper_token whisper_token_beg(struct whisper_context * ctx) {
|
|
|
|
return ctx->vocab.token_beg;
|
|
|
|
return ctx->vocab.token_beg;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
whisper_token whisper_token_translate() {
|
|
|
|
whisper_token whisper_token_translate(void) {
|
|
|
|
return whisper_vocab::token_translate;
|
|
|
|
return whisper_vocab::token_translate;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
whisper_token whisper_token_transcribe() {
|
|
|
|
whisper_token whisper_token_transcribe(void) {
|
|
|
|
return whisper_vocab::token_transcribe;
|
|
|
|
return whisper_vocab::token_transcribe;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -2325,6 +2358,27 @@ void whisper_print_timings(struct whisper_context * ctx) {
|
|
|
|
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
|
|
|
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Clears the accumulated sample / encode / decode time counters stored in the
// context. Other fields of the context are left untouched.
void whisper_reset_timings(struct whisper_context * ctx) {
    struct whisper_context & state = *ctx;

    state.t_decode_us = 0;
    state.t_encode_us = 0;
    state.t_sample_us = 0;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const char * whisper_print_system_info(void) {
|
|
|
|
|
|
|
|
static std::string s;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
s = "";
|
|
|
|
|
|
|
|
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
|
|
|
|
|
|
|
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
|
|
|
|
|
|
|
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
|
|
|
|
|
|
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
|
|
|
|
|
|
|
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
|
|
|
|
|
|
|
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
|
|
|
|
|
|
|
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return s.c_str();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
|
|
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
|
|
|
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
|
|
@ -2343,7 +2397,8 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
|
|
|
|
|
|
|
|
|
/*.translate =*/ false,
|
|
|
|
/*.translate =*/ false,
|
|
|
|
/*.no_context =*/ false,
|
|
|
|
/*.no_context =*/ false,
|
|
|
|
/*.print_special_tokens =*/ false,
|
|
|
|
/*.single_segment =*/ false,
|
|
|
|
|
|
|
|
/*.print_special =*/ false,
|
|
|
|
/*.print_progress =*/ true,
|
|
|
|
/*.print_progress =*/ true,
|
|
|
|
/*.print_realtime =*/ false,
|
|
|
|
/*.print_realtime =*/ false,
|
|
|
|
/*.print_timestamps =*/ true,
|
|
|
|
/*.print_timestamps =*/ true,
|
|
|
@ -2352,6 +2407,13 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
|
/*.thold_pt =*/ 0.01f,
|
|
|
|
/*.thold_pt =*/ 0.01f,
|
|
|
|
/*.thold_ptsum =*/ 0.01f,
|
|
|
|
/*.thold_ptsum =*/ 0.01f,
|
|
|
|
/*.max_len =*/ 0,
|
|
|
|
/*.max_len =*/ 0,
|
|
|
|
|
|
|
|
/*.max_tokens =*/ 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*.speed_up =*/ false,
|
|
|
|
|
|
|
|
/*.audio_ctx =*/ 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*.prompt_tokens =*/ nullptr,
|
|
|
|
|
|
|
|
/*.prompt_n_tokens =*/ 0,
|
|
|
|
|
|
|
|
|
|
|
|
/*.language =*/ "en",
|
|
|
|
/*.language =*/ "en",
|
|
|
|
|
|
|
|
|
|
|
@ -2367,6 +2429,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
|
|
|
|
|
|
|
|
|
/*.new_segment_callback =*/ nullptr,
|
|
|
|
/*.new_segment_callback =*/ nullptr,
|
|
|
|
/*.new_segment_callback_user_data =*/ nullptr,
|
|
|
|
/*.new_segment_callback_user_data =*/ nullptr,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*.encoder_begin_callback =*/ nullptr,
|
|
|
|
|
|
|
|
/*.encoder_begin_callback_user_data =*/ nullptr,
|
|
|
|
};
|
|
|
|
};
|
|
|
|
} break;
|
|
|
|
} break;
|
|
|
|
case WHISPER_SAMPLING_BEAM_SEARCH:
|
|
|
|
case WHISPER_SAMPLING_BEAM_SEARCH:
|
|
|
@ -2381,7 +2446,8 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
|
|
|
|
|
|
|
|
|
/*.translate =*/ false,
|
|
|
|
/*.translate =*/ false,
|
|
|
|
/*.no_context =*/ false,
|
|
|
|
/*.no_context =*/ false,
|
|
|
|
/*.print_special_tokens =*/ false,
|
|
|
|
/*.single_segment =*/ false,
|
|
|
|
|
|
|
|
/*.print_special =*/ false,
|
|
|
|
/*.print_progress =*/ true,
|
|
|
|
/*.print_progress =*/ true,
|
|
|
|
/*.print_realtime =*/ false,
|
|
|
|
/*.print_realtime =*/ false,
|
|
|
|
/*.print_timestamps =*/ true,
|
|
|
|
/*.print_timestamps =*/ true,
|
|
|
@ -2390,6 +2456,13 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
|
/*.thold_pt =*/ 0.01f,
|
|
|
|
/*.thold_pt =*/ 0.01f,
|
|
|
|
/*.thold_ptsum =*/ 0.01f,
|
|
|
|
/*.thold_ptsum =*/ 0.01f,
|
|
|
|
/*.max_len =*/ 0,
|
|
|
|
/*.max_len =*/ 0,
|
|
|
|
|
|
|
|
/*.max_tokens =*/ 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*.speed_up =*/ false,
|
|
|
|
|
|
|
|
/*.audio_ctx =*/ 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*.prompt_tokens =*/ nullptr,
|
|
|
|
|
|
|
|
/*.prompt_n_tokens =*/ 0,
|
|
|
|
|
|
|
|
|
|
|
|
/*.language =*/ "en",
|
|
|
|
/*.language =*/ "en",
|
|
|
|
|
|
|
|
|
|
|
@ -2405,6 +2478,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
|
|
|
|
|
|
|
|
|
/*.new_segment_callback =*/ nullptr,
|
|
|
|
/*.new_segment_callback =*/ nullptr,
|
|
|
|
/*.new_segment_callback_user_data =*/ nullptr,
|
|
|
|
/*.new_segment_callback_user_data =*/ nullptr,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*.encoder_begin_callback =*/ nullptr,
|
|
|
|
|
|
|
|
/*.encoder_begin_callback_user_data =*/ nullptr,
|
|
|
|
};
|
|
|
|
};
|
|
|
|
} break;
|
|
|
|
} break;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -2485,10 +2561,17 @@ int whisper_full(
|
|
|
|
result_all.clear();
|
|
|
|
result_all.clear();
|
|
|
|
|
|
|
|
|
|
|
|
// compute log mel spectrogram
|
|
|
|
// compute log mel spectrogram
|
|
|
|
|
|
|
|
if (params.speed_up) {
|
|
|
|
|
|
|
|
if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) {
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
|
|
|
|
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
|
|
|
|
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
|
|
|
|
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (params.token_timestamps) {
|
|
|
|
if (params.token_timestamps) {
|
|
|
|
ctx->t_beg = 0;
|
|
|
|
ctx->t_beg = 0;
|
|
|
@ -2513,6 +2596,18 @@ int whisper_full(
|
|
|
|
prompt_past.clear();
|
|
|
|
prompt_past.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// prepend the prompt tokens to the prompt_past
|
|
|
|
|
|
|
|
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
|
|
|
|
|
|
|
|
// parse tokens from the pointer
|
|
|
|
|
|
|
|
for (int i = 0; i < params.prompt_n_tokens; i++) {
|
|
|
|
|
|
|
|
prompt_past.push_back(params.prompt_tokens[i]);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// overwrite audio_ctx
|
|
|
|
|
|
|
|
ctx->exp_n_audio_ctx = params.audio_ctx;
|
|
|
|
|
|
|
|
|
|
|
|
// these tokens determine the task that will be performed
|
|
|
|
// these tokens determine the task that will be performed
|
|
|
|
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
|
|
|
|
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
|
|
|
|
if (whisper_is_multilingual(ctx)) {
|
|
|
|
if (whisper_is_multilingual(ctx)) {
|
|
|
@ -2548,6 +2643,13 @@ int whisper_full(
|
|
|
|
break;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (params.encoder_begin_callback) {
|
|
|
|
|
|
|
|
if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) {
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// encode audio features starting at offset seek
|
|
|
|
// encode audio features starting at offset seek
|
|
|
|
if (whisper_encode(ctx, seek, params.n_threads) != 0) {
|
|
|
|
if (whisper_encode(ctx, seek, params.n_threads) != 0) {
|
|
|
|
fprintf(stderr, "%s: failed to encode\n", __func__);
|
|
|
|
fprintf(stderr, "%s: failed to encode\n", __func__);
|
|
|
@ -2570,7 +2672,6 @@ int whisper_full(
|
|
|
|
|
|
|
|
|
|
|
|
prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
|
|
|
|
prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
|
|
|
|
|
|
|
|
|
|
|
|
bool done = false;
|
|
|
|
|
|
|
|
int seek_delta = 100*WHISPER_CHUNK_SIZE;
|
|
|
|
int seek_delta = 100*WHISPER_CHUNK_SIZE;
|
|
|
|
|
|
|
|
|
|
|
|
// print the prompt
|
|
|
|
// print the prompt
|
|
|
@ -2584,7 +2685,9 @@ int whisper_full(
|
|
|
|
int result_len = 0;
|
|
|
|
int result_len = 0;
|
|
|
|
tokens_cur.clear();
|
|
|
|
tokens_cur.clear();
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < whisper_n_text_ctx(ctx)/2 - 4; ++i) {
|
|
|
|
bool failed = false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
|
|
|
|
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
|
|
|
|
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
|
|
|
|
fprintf(stderr, "%s: failed to decode\n", __func__);
|
|
|
|
fprintf(stderr, "%s: failed to decode\n", __func__);
|
|
|
|
return 8;
|
|
|
|
return 8;
|
|
|
@ -2601,15 +2704,19 @@ int whisper_full(
|
|
|
|
// feel free to experiment!
|
|
|
|
// feel free to experiment!
|
|
|
|
//
|
|
|
|
//
|
|
|
|
{
|
|
|
|
{
|
|
|
|
auto token = whisper_sample_best(ctx);
|
|
|
|
const auto token = (i == 0) ? whisper_sample_timestamp(ctx, true) : whisper_sample_best(ctx);
|
|
|
|
|
|
|
|
|
|
|
|
if (i == 0) {
|
|
|
|
|
|
|
|
token.tid = whisper_token_beg(ctx);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// timestamp token - update sliding window
|
|
|
|
// timestamp token - update sliding window
|
|
|
|
if (token.id > whisper_token_beg(ctx)) {
|
|
|
|
if (token.id > whisper_token_beg(ctx)) {
|
|
|
|
seek_delta = 2*(token.id - whisper_token_beg(ctx));
|
|
|
|
const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// do not allow to go back in time
|
|
|
|
|
|
|
|
if (seek_delta != 100*WHISPER_CHUNK_SIZE &&
|
|
|
|
|
|
|
|
seek_delta > seek_delta_new && result_len < i) {
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
seek_delta = seek_delta_new;
|
|
|
|
result_len = i + 1;
|
|
|
|
result_len = i + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -2619,19 +2726,25 @@ int whisper_full(
|
|
|
|
|
|
|
|
|
|
|
|
//{
|
|
|
|
//{
|
|
|
|
// const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token[token.tid] : "[?]";
|
|
|
|
// const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token[token.tid] : "[?]";
|
|
|
|
// printf("%s: %10s %6.3f '%s'\n", __func__, tt.c_str(), token.pt, ctx->vocab.id_to_token[token.id].c_str());
|
|
|
|
// printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
|
|
|
|
//}
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
|
|
// end of text token
|
|
|
|
// end of text token
|
|
|
|
if (token.id == whisper_token_eot(ctx)) {
|
|
|
|
if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
|
|
|
|
if (result_len == 0) {
|
|
|
|
if (result_len == 0) {
|
|
|
|
if (seek + seek_delta + 100 >= seek_end) {
|
|
|
|
if (seek + seek_delta + 100 >= seek_end) {
|
|
|
|
result_len = i + 1;
|
|
|
|
result_len = i + 1;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// TODO: figure out how to resolve this
|
|
|
|
failed = true;
|
|
|
|
fprintf(stderr, "\n%s: failed to generate timestamp token - this should not happen\n\n", __func__);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (params.single_segment) {
|
|
|
|
|
|
|
|
result_len = i + 1;
|
|
|
|
|
|
|
|
seek_delta = 100*WHISPER_CHUNK_SIZE;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -2642,11 +2755,21 @@ int whisper_full(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (done) {
|
|
|
|
// sometimes, the decoding can get stuck in a repetition loop
|
|
|
|
|
|
|
|
// this is a simple strategy to avoid such cases - we simply flag the decoding as failed and advance
|
|
|
|
|
|
|
|
// the sliding window by 1 second
|
|
|
|
|
|
|
|
if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
|
|
|
|
|
|
|
|
failed = true;
|
|
|
|
break;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (failed) {
|
|
|
|
|
|
|
|
fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__);
|
|
|
|
|
|
|
|
seek += 100;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// shrink down to result_len
|
|
|
|
// shrink down to result_len
|
|
|
|
tokens_cur.resize(result_len);
|
|
|
|
tokens_cur.resize(result_len);
|
|
|
|
|
|
|
|
|
|
|
@ -2666,23 +2789,26 @@ int whisper_full(
|
|
|
|
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
|
|
|
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
|
|
|
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
|
|
|
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
|
|
|
|
|
|
|
|
|
|
|
if (params.print_special_tokens == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
|
|
|
|
if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
|
|
|
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tokens_cur[i].id > whisper_token_beg(ctx)) {
|
|
|
|
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
|
|
|
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
|
|
|
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
|
|
|
if (!text.empty()) {
|
|
|
|
if (!text.empty()) {
|
|
|
|
|
|
|
|
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
|
|
|
|
|
|
|
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
|
|
|
|
|
|
|
|
|
|
|
if (params.print_realtime) {
|
|
|
|
if (params.print_realtime) {
|
|
|
|
if (params.print_timestamps) {
|
|
|
|
if (params.print_timestamps) {
|
|
|
|
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
|
|
|
|
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
printf("%s", text.c_str());
|
|
|
|
printf("%s", text.c_str());
|
|
|
|
fflush(stdout);
|
|
|
|
fflush(stdout);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
result_all.push_back({ t0, t1, text, {} });
|
|
|
|
result_all.push_back({ tt0, tt1, text, {} });
|
|
|
|
for (int j = i0; j <= i; j++) {
|
|
|
|
for (int j = i0; j <= i; j++) {
|
|
|
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
|
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -2714,16 +2840,19 @@ int whisper_full(
|
|
|
|
if (!text.empty()) {
|
|
|
|
if (!text.empty()) {
|
|
|
|
const auto t1 = seek + seek_delta;
|
|
|
|
const auto t1 = seek + seek_delta;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
|
|
|
|
|
|
|
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
|
|
|
|
|
|
|
|
|
|
|
if (params.print_realtime) {
|
|
|
|
if (params.print_realtime) {
|
|
|
|
if (params.print_timestamps) {
|
|
|
|
if (params.print_timestamps) {
|
|
|
|
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
|
|
|
|
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
printf("%s", text.c_str());
|
|
|
|
printf("%s", text.c_str());
|
|
|
|
fflush(stdout);
|
|
|
|
fflush(stdout);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
result_all.push_back({ t0, t1, text, {} });
|
|
|
|
result_all.push_back({ tt0, tt1, text, {} });
|
|
|
|
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
|
|
|
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
|
|
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
|
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -2755,7 +2884,7 @@ int whisper_full_parallel(
|
|
|
|
struct whisper_full_params params,
|
|
|
|
struct whisper_full_params params,
|
|
|
|
const float * samples,
|
|
|
|
const float * samples,
|
|
|
|
int n_samples,
|
|
|
|
int n_samples,
|
|
|
|
const int n_processors) {
|
|
|
|
int n_processors) {
|
|
|
|
if (n_processors == 1) {
|
|
|
|
if (n_processors == 1) {
|
|
|
|
return whisper_full(ctx, params, samples, n_samples);
|
|
|
|
return whisper_full(ctx, params, samples, n_samples);
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -2813,10 +2942,6 @@ int whisper_full_parallel(
|
|
|
|
model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
|
|
|
model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
|
|
|
model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
|
|
|
model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const size_t memory_size =
|
|
|
|
|
|
|
|
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
|
|
|
|
|
|
|
|
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -2936,20 +3061,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
|
|
|
|
return ctx->result_all[i_segment].tokens[i_token].p;
|
|
|
|
return ctx->result_all[i_segment].tokens[i_token].p;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const char * whisper_print_system_info() {
|
|
|
|
|
|
|
|
static std::string s;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
s = "";
|
|
|
|
|
|
|
|
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
|
|
|
|
|
|
|
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
|
|
|
|
|
|
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
|
|
|
|
|
|
|
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
|
|
|
|
|
|
|
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
|
|
|
|
|
|
|
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return s.c_str();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// =================================================================================================
|
|
|
|
// =================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
//
|
|
|
@ -3036,9 +3147,6 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
|
|
const int64_t t0 = segment.t0;
|
|
|
|
const int64_t t0 = segment.t0;
|
|
|
|
const int64_t t1 = segment.t1;
|
|
|
|
const int64_t t1 = segment.t1;
|
|
|
|
|
|
|
|
|
|
|
|
const int s0 = timestamp_to_sample(t0, n_samples);
|
|
|
|
|
|
|
|
const int s1 = timestamp_to_sample(t1, n_samples);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int n = tokens.size();
|
|
|
|
const int n = tokens.size();
|
|
|
|
|
|
|
|
|
|
|
|
if (n == 0) {
|
|
|
|
if (n == 0) {
|
|
|
|