whisper : add beam-search support

3 years ago · 5548a1986f
parent 5e97f80fc5
commit 5548a1986f
3 changed files with 187 additions and 18 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -652,6 +652,9 @@ int main(int argc, char ** argv) {
            wparams.speed_up          = params.speed_up;
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;
            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
--- a/whisper.cpp
+++ b/whisper.cpp
@ -433,7 +433,8 @@ struct whisper_sequence {
    // the accumulated transcription in the current interation (used to truncate the tokens array)
    int result_len;
-    double sum_logprobs; // the sum of the log probabilities of the tokens
+    double sum_logprobs_all; // the sum of the log probabilities of the tokens
    double sum_logprobs;     // the sum of the log probabilities of the tokens (first result_len tokens)
    double avg_logprobs;     // the average log probability of the tokens
    double entropy;          // the entropy of the tokens
    double score;            // likelihood rank score
@ -1650,7 +1651,7 @@ static bool whisper_decode(
    const int N = n_tokens;
    const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
-    WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
+    //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
    struct ggml_init_params params;
    params.mem_size   = wctx.buf_compute.size();
@ -3100,6 +3101,74 @@ static whisper_token_data whisper_sample_token(
    return result;
 }
 static std::vector<whisper_token_data> whisper_sample_token_topk(
            whisper_context & ctx,
      const whisper_decoder & decoder,
                        int   k) {
    const auto & vocab = ctx.vocab;
    const auto & probs    = decoder.probs;
    const auto & logits   = decoder.logits;
    const auto & logprobs = decoder.logprobs;
    const int n_logits = vocab.n_vocab;
    auto & logits_id = ctx.logits_id;
    logits_id.clear();
    for (int i = 0; i < n_logits; ++i) {
        logits_id.push_back({ logits[i], i });
    }
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + k, logits_id.end(),
            [](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
                return a.first > b.first;
            });
    std::vector<whisper_token_data> result(k);
    {
        double sum_ts = 0.0;
        double max_ts = 0.0;
        for (int i = vocab.token_beg; i < n_logits; i++) {
            if (probs[i] == -INFINITY) {
                continue;
            }
            sum_ts += probs[i];
            if (max_ts < probs[i]) {
                max_ts = probs[i];
                result[0].tid = i;
            }
        }
        result[0].pt = max_ts/(sum_ts + 1e-10);
        result[0].ptsum = sum_ts;
    }
    for (int i = 0; i < k; ++i) {
        result[i].id    = logits_id[i].second;
        result[i].p     = probs[result[i].id];
        result[i].plog  = logprobs[result[i].id];
        result[i].tid   = result[0].tid;
        result[i].pt    = result[0].pt;
        result[i].ptsum = result[0].ptsum;
        result[i].t0    = -1;
        result[i].t1    = -1;
        result[i].vlen  = 0.0f;
        if (result[i].id >= vocab.token_beg) {
            result[i].tid = result[i].id;
            result[i].pt  = result[i].p;
        }
    }
    return result;
 }
 // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L178-L192
 static void whisper_sequence_score(
        const struct whisper_full_params & params,
@ -3120,7 +3189,7 @@ static void whisper_sequence_score(
    double penalty = sequence.result_len;
    if (params.length_penalty > 0.0f) {
-        penalty = pow((5.0 + penalty) / 6.0, params.length_penalty);
+        penalty = pow((5.0 + penalty)/6.0, params.length_penalty);
    }
    sequence.score = result/penalty;
@ -3141,7 +3210,8 @@ static void whisper_sequence_score(
        for (const auto & kv : token_counts) {
            const auto p = kv.second/(double)cnt;
            entropy -= p*log(p);
-            //printf("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
+
            //WHISPER_PRINT_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
        }
        sequence.entropy = entropy;
@ -3293,6 +3363,25 @@ int whisper_full(
    std::vector<whisper_token> prompt;
    prompt.reserve(whisper_n_text_ctx(ctx));
    // beam-search helpers
    struct kv_buf {
        std::vector<uint8_t> k;
        std::vector<uint8_t> v;
    };
    std::vector<kv_buf> kv_bufs;
    struct beam_candidate {
        int decoder_idx;
        int seek_delta;
        bool has_ts;
        whisper_sequence sequence;
    };
    std::vector<beam_candidate> beam_candidates;
    // main loop
    while (true) {
        const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
@ -3361,9 +3450,10 @@ int whisper_full(
                decoder.sequence.tokens.clear();
                decoder.sequence.result_len       = 0;
                decoder.sequence.sum_logprobs_all = 0.0;
                decoder.sequence.sum_logprobs     = -INFINITY;
                decoder.sequence.avg_logprobs     = -INFINITY;
-                decoder.sequence.entropy      = 0.0f;
+                decoder.sequence.entropy          = 0.0;
                decoder.sequence.score            = -INFINITY;
                decoder.n_past     = 0;
@ -3412,7 +3502,8 @@ int whisper_full(
                    for (int j = 1; j < n_decoders_cur; ++j) {
                        auto & decoder = ctx->decoders[j];
-                        memcpy(decoder.kv_self.buf.data(), ctx->decoders[0].kv_self.buf.data(), decoder.kv_self.buf.size());
+                        memcpy(decoder.kv_self.k->data, ctx->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
                        memcpy(decoder.kv_self.v->data, ctx->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
                        decoder.n_past += prompt.size();
@ -3428,6 +3519,25 @@ int whisper_full(
            for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
                const int64_t t_start_sample_us = ggml_time_us();
                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
                    kv_bufs.resize(n_decoders_cur);
                    for (int j = 0; j < n_decoders_cur; ++j) {
                        auto & decoder = ctx->decoders[j];
                        if (decoder.completed || decoder.failed) {
                            continue;
                        }
                        kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
                        kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
                        memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
                        memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
                    }
                    beam_candidates.clear();
                }
                for (int j = 0; j < n_decoders_cur; ++j) {
                    auto & decoder = ctx->decoders[j];
@ -3443,12 +3553,65 @@ int whisper_full(
                                } else {
                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, decoder, false));
                                }
                                decoder.sequence.sum_logprobs_all += decoder.sequence.tokens.back().plog;
                            } break;
                        case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
                            {
-                                // TODO: ..
+                                const auto tokens_new = whisper_sample_token_topk(*ctx, decoder, params.beam_search.beam_size);
                                for (const auto & token : tokens_new) {
                                    beam_candidates.push_back({ j, decoder.seek_delta, decoder.has_ts, decoder.sequence });
                                    beam_candidates.back().sequence.tokens.push_back(token);
                                    beam_candidates.back().sequence.sum_logprobs_all += token.plog;
                                    //WHISPER_PRINT_DEBUG("%s: beam candidate: %s (%f, %f)\n", __func__, ctx->vocab.id_to_token.at(token.id).c_str(), token.plog, beam_candidates.back().sequence.sum_logprobs_all);
                                }
                            } break;
                    };
                }
                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
                    std::sort(
                            beam_candidates.begin(),
                            beam_candidates.end(),
                            [](const beam_candidate & a, const beam_candidate & b) {
                        return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
                    });
                    int cur_c = 0;
                    for (int j = 0; j < n_decoders_cur; ++j) {
                        auto & decoder = ctx->decoders[j];
                        if (decoder.completed || decoder.failed) {
                            continue;
                        }
                        auto & cur = beam_candidates[cur_c++];
                        while (beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
                            ++cur_c;
                        }
                        decoder.sequence   = cur.sequence;
                        decoder.seek_delta = cur.seek_delta;
                        decoder.has_ts     = cur.has_ts;
                        memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
                        memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
                        WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                                __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
                    }
                }
                for (int j = 0; j < n_decoders_cur; ++j) {
                    auto & decoder = ctx->decoders[j];
                    if (decoder.completed || decoder.failed) {
                        continue;
                    }
                    auto & has_ts     = decoder.has_ts;
                    auto & failed     = decoder.failed;
@ -3659,6 +3822,7 @@ int whisper_full(
                    } else {
                        text += whisper_token_to_str(ctx, tokens_cur[i].id);
                    }
                    if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
                        const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
                        if (!text.empty()) {
@ -4077,6 +4241,8 @@ static void whisper_exp_compute_token_level_timestamps(
                p1--;
            }
            //printf("p0=%d p1=%d t0=%lld t1=%lld\n", p0, p1, tokens[p0].t0, tokens[p1].t1);
            if (p1 > p0) {
                double psum = 0.0;
                for (int j = p0; j <= p1; j++) {
@ -4205,11 +4371,11 @@ static void whisper_exp_compute_token_level_timestamps(
    // debug info
    //for (int j = 0; j < n; ++j) {
    //    const auto & token = tokens[j];
-    //    const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
+    //    const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(&ctx, token.tid) : "[?]";
    //    printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
-    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(ctx, token.id));
+    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(&ctx, token.id));
-    //    if (tokens[j].id >= whisper_token_eot(ctx)) {
+    //    if (tokens[j].id >= whisper_token_eot(&ctx)) {
    //        continue;
    //    }
    //}
--- a/whisper.h
+++ b/whisper.h
@ -290,7 +290,7 @@ extern "C" {
        struct {
            int beam_size;
-            float patience;
+            float patience; // TODO: not implemented
        } beam_search;
        whisper_new_segment_callback new_segment_callback;