From 3abcfc6fc39173ccd503bafb0a7a7c7d75b5ee38 Mon Sep 17 00:00:00 2001
From: Abitofevrything <mylo.fawcett@gmail.com>
Date: Sun, 8 Jan 2023 10:59:32 +0100
Subject: [PATCH] Add process time to whisper_print_timings

---
 whisper.cpp | 115 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 39 deletions(-)
diff --git a/whisper.cpp b/whisper.cpp
index b74c515..034b12c 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -467,12 +467,18 @@ struct whisper_decoder {
 };
 
 struct whisper_context {
-    int64_t t_load_us   = 0;
-    int64_t t_mel_us    = 0;
-    int64_t t_sample_us = 0;
-    int64_t t_encode_us = 0;
-    int64_t t_decode_us = 0;
-    int64_t t_start_us  = 0;
+    int64_t t_real_load_us      = 0;
+    int64_t t_process_load_us   = 0;
+    int64_t t_real_mel_us       = 0;
+    int64_t t_process_mel_us    = 0;
+    int64_t t_real_sample_us    = 0;
+    int64_t t_process_sample_us = 0;
+    int64_t t_real_encode_us    = 0;
+    int64_t t_process_encode_us = 0;
+    int64_t t_real_decode_us    = 0;
+    int64_t t_process_decode_us = 0;
+    int64_t t_real_start_us     = 0;
+    int64_t t_process_start_us  = 0;
 
     ggml_type wtype; // weight type (FP32 or FP16)
 
@@ -597,9 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
 static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
     fprintf(stderr, "%s: loading model\n", __func__);
 
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();
 
-    wctx.t_start_us = t_start_us;
+    wctx.t_real_start_us = t_real_start_us;
+    wctx.t_process_start_us = t_process_start_us;
 
     auto & model = wctx.model;
     auto & vocab = wctx.vocab;
@@ -1208,7 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     wctx.rng = std::mt19937(0);
 
-    wctx.t_load_us = ggml_time_us() - t_start_us;
+    wctx.t_real_load_us = ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_load_us = ggml_process_time_us() - t_process_start_us;
 
     return true;
 }
@@ -1226,7 +1235,8 @@ static bool whisper_encode(
         whisper_context & wctx,
               const int   mel_offset,
               const int   n_threads) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();
 
     const auto & model   = wctx.model;
     const auto & mel_inp = wctx.mel;
@@ -1619,7 +1629,8 @@ static bool whisper_encode(
 
     ggml_free(ctx0);
 
-    wctx.t_encode_us += ggml_time_us() - t_start_us;
+    wctx.t_real_encode_us += ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_encode_us += ggml_process_time_us() - t_process_start_us;
 
     return true;
 }
@@ -1641,7 +1652,8 @@ static bool whisper_decode(
               const int   n_tokens,
               const int   n_past,
               const int   n_threads) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();
 
     const auto & model   = wctx.model;
     const auto & hparams = model.hparams;
@@ -1992,7 +2004,8 @@ static bool whisper_decode(
 
     ggml_free(ctx0);
 
-    wctx.t_decode_us += ggml_time_us() - t_start_us;
+    wctx.t_real_decode_us += ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_decode_us += ggml_process_time_us() - t_process_start_us;
 
     return true;
 }
@@ -2107,7 +2120,8 @@ static bool log_mel_spectrogram(
   const whisper_filters & filters,
              const bool   speed_up,
             whisper_mel & mel) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_real_start_us = ggml_real_time_us();
+    const int64_t t_process_start_us = ggml_process_time_us();
 
     // Hanning window
     std::vector<float> hann;
@@ -2216,7 +2230,8 @@ static bool log_mel_spectrogram(
         mel.data[i] = (mel.data[i] + 4.0)/4.0;
     }
 
-    wctx.t_mel_us += ggml_time_us() - t_start_us;
+    wctx.t_real_mel_us = ggml_real_time_us() - t_real_start_us;
+    wctx.t_process_mel_us = ggml_process_time_us() - t_process_start_us;
 
     return true;
 }
@@ -2642,21 +2657,29 @@ whisper_token whisper_token_transcribe(void) {
 }
 
 void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_real_time_us();
+    const int64_t t_real_end_us = ggml_real_time_us();
+    const int64_t t_process_end_us = ggml_process_time_us();
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
-    fprintf(stderr, "%s:      mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
-    fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
-    fprintf(stderr, "%s:   encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
-    fprintf(stderr, "%s:   decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
-    fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    fprintf(stderr, "%s:     load time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_load_us/1000.0f, ctx->t_real_load_us/1000.0f);
+    fprintf(stderr, "%s:      mel time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_mel_us/1000.0f, ctx->t_real_mel_us/1000.0f);
+    fprintf(stderr, "%s:   sample time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_sample_us/1000.0f, ctx->t_real_sample_us/1000.0f);
+    fprintf(stderr, "%s:   encode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__,
+        ctx->t_process_encode_us/1000.0f, ctx->t_process_encode_us/1000.0f/ctx->model.hparams.n_audio_layer,
+        ctx->t_real_encode_us/1000.0f, ctx->t_real_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
+    fprintf(stderr, "%s:   decode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__,
+        ctx->t_process_decode_us/1000.0f, ctx->t_process_decode_us/1000.0f/ctx->model.hparams.n_text_layer,
+        ctx->t_real_decode_us/1000.0f, ctx->t_real_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
+    fprintf(stderr, "%s:    total time = %8.2f ms (real %8.2f ms)\n", __func__, (t_process_end_us - ctx->t_process_start_us)/1000.f, (t_real_end_us - ctx->t_real_start_us)/1000.0f);
 }
 
 void whisper_reset_timings(struct whisper_context * ctx) {
-    ctx->t_sample_us = 0;
-    ctx->t_encode_us = 0;
-    ctx->t_decode_us = 0;
+    ctx->t_real_sample_us = 0;
+    ctx->t_process_sample_us = 0;
+    ctx->t_real_encode_us = 0;
+    ctx->t_process_encode_us = 0;
+    ctx->t_real_decode_us = 0;
+    ctx->t_process_decode_us = 0;
 }
 
 const char * whisper_print_system_info(void) {
@@ -3455,7 +3478,8 @@ int whisper_full(
                 }
 
                 {
-                    const int64_t t_start_sample_us = ggml_time_us();
+                    const int64_t t_real_start_sample_us = ggml_real_time_us();
+                    const int64_t t_process_start_sample_us = ggml_process_time_us();
 
                     whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur);
 
@@ -3474,12 +3498,14 @@ int whisper_full(
                         memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
                     }
 
-                    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                    ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us;
+                    ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us;
                 }
             }
 
             for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
-                const int64_t t_start_sample_us = ggml_time_us();
+                const int64_t t_real_start_sample_us = ggml_real_time_us();
+                const int64_t t_process_start_sample_us = ggml_process_time_us();
 
                 // store the KV caches of all decoders when doing beam-search
                 if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
@@ -3672,7 +3698,8 @@ int whisper_full(
                     }
                 }
 
-                ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us;
+                ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us;
 
                 // obtain logits for the next token
                 for (int j = 0; j < n_decoders_cur; ++j) {
@@ -3693,13 +3720,15 @@ int whisper_full(
                     }
 
                     {
-                        const int64_t t_start_sample_us = ggml_time_us();
+                        const int64_t t_real_start_sample_us = ggml_real_time_us();
+                        const int64_t t_process_start_sample_us = ggml_process_time_us();
 
                         whisper_process_logits(*ctx, params, decoder, t_cur);
 
                         ++decoder.kv_self.n;
 
-                        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                        ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us;
+                        ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us;
                     }
                 }
             }
@@ -3992,10 +4021,14 @@ int whisper_full_parallel(
             }
         }
 
-        ctx->t_mel_us    += ctxs[i].t_mel_us;
-        ctx->t_sample_us += ctxs[i].t_sample_us;
-        ctx->t_encode_us += ctxs[i].t_encode_us;
-        ctx->t_decode_us += ctxs[i].t_decode_us;
+        ctx->t_real_mel_us       += ctxs[i].t_real_mel_us;
+        ctx->t_process_mel_us    += ctxs[i].t_process_mel_us;
+        ctx->t_real_sample_us    += ctxs[i].t_real_sample_us;
+        ctx->t_process_sample_us += ctxs[i].t_process_sample_us;
+        ctx->t_real_encode_us    += ctxs[i].t_real_encode_us;
+        ctx->t_process_encode_us += ctxs[i].t_process_encode_us;
+        ctx->t_real_decode_us    += ctxs[i].t_real_decode_us;
+        ctx->t_process_decode_us += ctxs[i].t_process_decode_us;
 
         kv_cache_free(ctx->kv_cross);
 
@@ -4005,10 +4038,14 @@ int whisper_full_parallel(
     }
 
     // average the timings
-    ctx->t_mel_us    /= n_processors;
-    ctx->t_sample_us /= n_processors;
-    ctx->t_encode_us /= n_processors;
-    ctx->t_decode_us /= n_processors;
+    ctx->t_real_mel_us       /= n_processors;
+    ctx->t_process_mel_us    /= n_processors;
+    ctx->t_real_sample_us    /= n_processors;
+    ctx->t_process_sample_us /= n_processors;
+    ctx->t_real_encode_us    /= n_processors;
+    ctx->t_process_encode_us /= n_processors;
+    ctx->t_real_decode_us    /= n_processors;
+    ctx->t_process_decode_us /= n_processors;
 
     // print information about the audio boundaries
     fprintf(stderr, "\n");