Add process time to whisper_print_timings

2 years ago · 3abcfc6fc3
parent bde7ee3751
commit 3abcfc6fc3
1 changed files with 76 additions and 39 deletions
--- a/whisper.cpp
+++ b/whisper.cpp
@ -467,12 +467,18 @@ struct whisper_decoder {
 };

 struct whisper_context {
-    int64_t t_load_us   = 0;
-    int64_t t_mel_us    = 0;
-    int64_t t_sample_us = 0;
-    int64_t t_encode_us = 0;
-    int64_t t_decode_us = 0;
-    int64_t t_start_us  = 0;
+    int64_t t_real_load_us      = 0;
+    int64_t t_process_load_us   = 0;
+    int64_t t_real_mel_us       = 0;
+    int64_t t_process_mel_us    = 0;
+    int64_t t_real_sample_us    = 0;
+    int64_t t_process_sample_us = 0;
+    int64_t t_real_encode_us    = 0;
+    int64_t t_process_encode_us = 0;
+    int64_t t_real_decode_us    = 0;
+    int64_t t_process_decode_us = 0;
+    int64_t t_real_start_us     = 0;
+    int64_t t_process_start_us  = 0;

    ggml_type wtype; // weight type (FP32 or FP16)

@ -597,9 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
 static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
    fprintf(stderr, "%s: loading model\n", __func__);

-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();

-    wctx.t_start_us = t_start_us;
+    wctx.t_real_start_us = t_real_start_us;
+    wctx.t_process_start_us = t_process_start_us;

    auto & model = wctx.model;
    auto & vocab = wctx.vocab;
@ -1208,7 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

    wctx.rng = std::mt19937(0);

-    wctx.t_load_us = ggml_time_us() - t_start_us;
+    wctx.t_real_load_us = ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_load_us = ggml_process_time_us() - t_process_start_us;

    return true;
 }
@ -1226,7 +1235,8 @@ static bool whisper_encode(
        whisper_context & wctx,
              const int   mel_offset,
              const int   n_threads) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();

    const auto & model   = wctx.model;
    const auto & mel_inp = wctx.mel;
@ -1619,7 +1629,8 @@ static bool whisper_encode(

    ggml_free(ctx0);

-    wctx.t_encode_us += ggml_time_us() - t_start_us;
+    wctx.t_real_encode_us += ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_encode_us += ggml_process_time_us() - t_process_start_us;

    return true;
 }
@ -1641,7 +1652,8 @@ static bool whisper_decode(
              const int   n_tokens,
              const int   n_past,
              const int   n_threads) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();

    const auto & model   = wctx.model;
    const auto & hparams = model.hparams;
@ -1992,7 +2004,8 @@ static bool whisper_decode(

    ggml_free(ctx0);

-    wctx.t_decode_us += ggml_time_us() - t_start_us;
+    wctx.t_real_decode_us += ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_decode_us += ggml_process_time_us() - t_process_start_us;

    return true;
 }
@ -2107,7 +2120,8 @@ static bool log_mel_spectrogram(
  const whisper_filters & filters,
             const bool   speed_up,
            whisper_mel & mel) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();

    // Hanning window
    std::vector<float> hann;
@ -2216,7 +2230,8 @@ static bool log_mel_spectrogram(
        mel.data[i] = (mel.data[i] + 4.0)/4.0;
    }

-    wctx.t_mel_us += ggml_time_us() - t_start_us;
+    wctx.t_real_mel_us = ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_mel_us = ggml_process_time_us() - t_process_start_us;

    return true;
 }
@ -2642,21 +2657,29 @@ whisper_token whisper_token_transcribe(void) {
 }

 void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_real_time_us();
+    const int64_t t_real_end_us = ggml_real_time_us();
+    const int64_t t_process_end_us = ggml_process_time_us();

    fprintf(stderr, "\n");
-    fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
-    fprintf(stderr, "%s:      mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
-    fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
-    fprintf(stderr, "%s:   encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
-    fprintf(stderr, "%s:   decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
-    fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    fprintf(stderr, "%s:     load time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_load_us/1000.0f, ctx->t_real_load_us/1000.0f);
+    fprintf(stderr, "%s:      mel time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_mel_us/1000.0f, ctx->t_real_mel_us/1000.0f);
+    fprintf(stderr, "%s:   sample time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_sample_us/1000.0f, ctx->t_real_sample_us/1000.0f);
+    fprintf(stderr, "%s:   encode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__,
+        ctx->t_process_encode_us/1000.0f, ctx->t_process_encode_us/1000.0f/ctx->model.hparams.n_audio_layer,
+        ctx->t_real_encode_us/1000.0f, ctx->t_real_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
+    fprintf(stderr, "%s:   decode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__,
+        ctx->t_process_decode_us/1000.0f, ctx->t_process_decode_us/1000.0f/ctx->model.hparams.n_text_layer,
+        ctx->t_real_decode_us/1000.0f, ctx->t_real_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
+    fprintf(stderr, "%s:    total time = %8.2f ms (real %8.2f ms)\n", __func__, (t_process_end_us - ctx->t_process_start_us)/1000.f, (t_real_end_us - ctx->t_real_start_us)/1000.0f);
 }

 void whisper_reset_timings(struct whisper_context * ctx) {
-    ctx->t_sample_us = 0;
-    ctx->t_encode_us = 0;
-    ctx->t_decode_us = 0;
+    ctx->t_real_sample_us = 0;
+    ctx->t_process_sample_us = 0;
+    ctx->t_real_encode_us = 0;
+    ctx->t_process_encode_us = 0;
+    ctx->t_real_decode_us = 0;
+    ctx->t_process_decode_us = 0;
 }

 const char * whisper_print_system_info(void) {
@ -3455,7 +3478,8 @@ int whisper_full(
                }

                {
-                    const int64_t t_start_sample_us = ggml_time_us();
+                    const int64_t t_real_start_sample_us = ggml_real_time_us();
+                    const int64_t t_process_start_sample_us = ggml_process_time_us();

                    whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur);

@ -3474,12 +3498,14 @@ int whisper_full(
                        memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
                    }

-                    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                    ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us;
+                    ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us;
                }
            }

            for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
-                const int64_t t_start_sample_us = ggml_time_us();
+                const int64_t t_real_start_sample_us = ggml_real_time_us();
+                const int64_t t_process_start_sample_us = ggml_process_time_us();

                // store the KV caches of all decoders when doing beam-search
                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
@ -3672,7 +3698,8 @@ int whisper_full(
                    }
                }

-                ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us;
+                ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us;

                // obtain logits for the next token
                for (int j = 0; j < n_decoders_cur; ++j) {
@ -3693,13 +3720,15 @@ int whisper_full(
                    }

                    {
-                        const int64_t t_start_sample_us = ggml_time_us();
+                        const int64_t t_real_start_sample_us = ggml_real_time_us();
+                        const int64_t t_process_start_sample_us = ggml_process_time_us();

                        whisper_process_logits(*ctx, params, decoder, t_cur);

                        ++decoder.kv_self.n;

-                        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                        ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us;
+                        ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us;
                    }
                }
            }
@ -3992,10 +4021,14 @@ int whisper_full_parallel(
            }
        }

-        ctx->t_mel_us    += ctxs[i].t_mel_us;
-        ctx->t_sample_us += ctxs[i].t_sample_us;
-        ctx->t_encode_us += ctxs[i].t_encode_us;
-        ctx->t_decode_us += ctxs[i].t_decode_us;
+        ctx->t_real_mel_us       += ctxs[i].t_real_mel_us;
+        ctx->t_process_mel_us    += ctxs[i].t_process_mel_us;
+        ctx->t_real_sample_us    += ctxs[i].t_real_sample_us;
+        ctx->t_process_sample_us += ctxs[i].t_process_sample_us;
+        ctx->t_real_encode_us    += ctxs[i].t_real_encode_us;
+        ctx->t_process_encode_us += ctxs[i].t_process_encode_us;
+        ctx->t_real_decode_us    += ctxs[i].t_real_decode_us;
+        ctx->t_process_decode_us += ctxs[i].t_process_decode_us;

        kv_cache_free(ctx->kv_cross);

@ -4005,10 +4038,14 @@ int whisper_full_parallel(
    }

    // average the timings
-    ctx->t_mel_us    /= n_processors;
-    ctx->t_sample_us /= n_processors;
-    ctx->t_encode_us /= n_processors;
-    ctx->t_decode_us /= n_processors;
+    ctx->t_real_mel_us       /= n_processors;
+    ctx->t_process_mel_us    /= n_processors;
+    ctx->t_real_sample_us    /= n_processors;
+    ctx->t_process_sample_us /= n_processors;
+    ctx->t_real_encode_us    /= n_processors;
+    ctx->t_process_encode_us /= n_processors;
+    ctx->t_real_decode_us    /= n_processors;
+    ctx->t_process_decode_us /= n_processors;

    // print information about the audio boundaries
    fprintf(stderr, "\n");