diff --git a/whisper.cpp b/whisper.cpp index b74c515..034b12c 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -467,12 +467,18 @@ struct whisper_decoder { }; struct whisper_context { - int64_t t_load_us = 0; - int64_t t_mel_us = 0; - int64_t t_sample_us = 0; - int64_t t_encode_us = 0; - int64_t t_decode_us = 0; - int64_t t_start_us = 0; + int64_t t_real_load_us = 0; + int64_t t_process_load_us = 0; + int64_t t_real_mel_us = 0; + int64_t t_process_mel_us = 0; + int64_t t_real_sample_us = 0; + int64_t t_process_sample_us = 0; + int64_t t_real_encode_us = 0; + int64_t t_process_encode_us = 0; + int64_t t_real_decode_us = 0; + int64_t t_process_decode_us = 0; + int64_t t_real_start_us = 0; + int64_t t_process_start_us = 0; ggml_type wtype; // weight type (FP32 or FP16) @@ -597,9 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) { static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) { fprintf(stderr, "%s: loading model\n", __func__); - const int64_t t_start_us = ggml_time_us(); + const int64_t t_real_start_us = ggml_real_time_us(); + const int64_t t_process_start_us = ggml_process_time_us(); - wctx.t_start_us = t_start_us; + wctx.t_real_start_us = t_real_start_us; + wctx.t_process_start_us = t_process_start_us; auto & model = wctx.model; auto & vocab = wctx.vocab; @@ -1208,7 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con wctx.rng = std::mt19937(0); - wctx.t_load_us = ggml_time_us() - t_start_us; + wctx.t_real_load_us = ggml_real_time_us() - t_real_start_us; + wctx.t_process_load_us = ggml_process_time_us() - t_process_start_us; return true; } @@ -1226,7 +1235,8 @@ static bool whisper_encode( whisper_context & wctx, const int mel_offset, const int n_threads) { - const int64_t t_start_us = ggml_time_us(); + const int64_t t_real_start_us = ggml_real_time_us(); + const int64_t t_process_start_us = ggml_process_time_us(); const auto & model = wctx.model; const auto & mel_inp = wctx.mel; @@ -1619,7 +1629,8 @@ static bool whisper_encode( ggml_free(ctx0); - wctx.t_encode_us += ggml_time_us() - t_start_us; + wctx.t_real_encode_us += ggml_real_time_us() - t_real_start_us; + wctx.t_process_encode_us += ggml_process_time_us() - t_process_start_us; return true; } @@ -1641,7 +1652,8 @@ static bool whisper_decode( const int n_tokens, const int n_past, const int n_threads) { - const int64_t t_start_us = ggml_time_us(); + const int64_t t_real_start_us = ggml_real_time_us(); + const int64_t t_process_start_us = ggml_process_time_us(); const auto & model = wctx.model; const auto & hparams = model.hparams; @@ -1992,7 +2004,8 @@ static bool whisper_decode( ggml_free(ctx0); - wctx.t_decode_us += ggml_time_us() - t_start_us; + wctx.t_real_decode_us += ggml_real_time_us() - t_real_start_us; + wctx.t_process_decode_us += ggml_process_time_us() - t_process_start_us; return true; } @@ -2107,7 +2120,8 @@ static bool log_mel_spectrogram( const whisper_filters & filters, const bool speed_up, whisper_mel & mel) { - const int64_t t_start_us = ggml_time_us(); + const int64_t t_real_start_us = ggml_real_time_us(); + const int64_t t_process_start_us = ggml_process_time_us(); // Hanning window std::vector hann; @@ -2216,7 +2230,8 @@ static bool log_mel_spectrogram( mel.data[i] = (mel.data[i] + 4.0)/4.0; } - wctx.t_mel_us += ggml_time_us() - t_start_us; + wctx.t_real_mel_us = ggml_real_time_us() - t_real_start_us; + wctx.t_process_mel_us = ggml_process_time_us() - t_process_start_us; return true; } @@ -2642,21 +2657,29 @@ whisper_token whisper_token_transcribe(void) { } void whisper_print_timings(struct whisper_context * ctx) { - const int64_t t_end_us = ggml_real_time_us(); + const int64_t t_real_end_us = ggml_real_time_us(); + const int64_t t_process_end_us = ggml_process_time_us(); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f); - fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f); - fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f); - fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); - fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); + fprintf(stderr, "%s: load time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_load_us/1000.0f, ctx->t_real_load_us/1000.0f); + fprintf(stderr, "%s: mel time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_mel_us/1000.0f, ctx->t_real_mel_us/1000.0f); + fprintf(stderr, "%s: sample time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_sample_us/1000.0f, ctx->t_real_sample_us/1000.0f); + fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__, + ctx->t_process_encode_us/1000.0f, ctx->t_process_encode_us/1000.0f/ctx->model.hparams.n_audio_layer, + ctx->t_real_encode_us/1000.0f, ctx->t_real_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); + fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__, + ctx->t_process_decode_us/1000.0f, ctx->t_process_decode_us/1000.0f/ctx->model.hparams.n_text_layer, + ctx->t_real_decode_us/1000.0f, ctx->t_real_decode_us/1000.0f/ctx->model.hparams.n_text_layer); + fprintf(stderr, "%s: total time = %8.2f ms (real %8.2f ms)\n", __func__, (t_process_end_us - ctx->t_process_start_us)/1000.f, (t_real_end_us - ctx->t_real_start_us)/1000.0f); } void whisper_reset_timings(struct whisper_context * ctx) { - ctx->t_sample_us = 0; - ctx->t_encode_us = 0; - ctx->t_decode_us = 0; + ctx->t_real_sample_us = 0; + ctx->t_process_sample_us = 0; + ctx->t_real_encode_us = 0; + ctx->t_process_encode_us = 0; + ctx->t_real_decode_us = 0; + ctx->t_process_decode_us = 0; } const char * whisper_print_system_info(void) { @@ -3455,7 +3478,8 @@ int whisper_full( } { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_real_start_sample_us = ggml_real_time_us(); + const int64_t t_process_start_sample_us = ggml_process_time_us(); whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur); @@ -3474,12 +3498,14 @@ int whisper_full( memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0])); } - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; + ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; } } for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_real_start_sample_us = ggml_real_time_us(); + const int64_t t_process_start_sample_us = ggml_process_time_us(); // store the KV caches of all decoders when doing beam-search if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) { @@ -3672,7 +3698,8 @@ int whisper_full( } } - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; + ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; // obtain logits for the next token for (int j = 0; j < n_decoders_cur; ++j) { @@ -3693,13 +3720,15 @@ int whisper_full( } { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_real_start_sample_us = ggml_real_time_us(); + const int64_t t_process_start_sample_us = ggml_process_time_us(); whisper_process_logits(*ctx, params, decoder, t_cur); ++decoder.kv_self.n; - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; + ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; } } } @@ -3992,10 +4021,14 @@ int whisper_full_parallel( } } - ctx->t_mel_us += ctxs[i].t_mel_us; - ctx->t_sample_us += ctxs[i].t_sample_us; - ctx->t_encode_us += ctxs[i].t_encode_us; - ctx->t_decode_us += ctxs[i].t_decode_us; + ctx->t_real_mel_us += ctxs[i].t_real_mel_us; + ctx->t_process_mel_us += ctxs[i].t_process_mel_us; + ctx->t_real_sample_us += ctxs[i].t_real_sample_us; + ctx->t_process_sample_us += ctxs[i].t_process_sample_us; + ctx->t_real_encode_us += ctxs[i].t_real_encode_us; + ctx->t_process_encode_us += ctxs[i].t_process_encode_us; + ctx->t_real_decode_us += ctxs[i].t_real_decode_us; + ctx->t_process_decode_us += ctxs[i].t_process_decode_us; kv_cache_free(ctx->kv_cross); @@ -4005,10 +4038,14 @@ int whisper_full_parallel( } // average the timings - ctx->t_mel_us /= n_processors; - ctx->t_sample_us /= n_processors; - ctx->t_encode_us /= n_processors; - ctx->t_decode_us /= n_processors; + ctx->t_real_mel_us /= n_processors; + ctx->t_process_mel_us /= n_processors; + ctx->t_real_sample_us /= n_processors; + ctx->t_process_sample_us /= n_processors; + ctx->t_real_encode_us /= n_processors; + ctx->t_process_encode_us /= n_processors; + ctx->t_real_decode_us /= n_processors; + ctx->t_process_decode_us /= n_processors; // print information about the audio boundaries fprintf(stderr, "\n");