diff --git a/ggml.c b/ggml.c index d67612c..465f92e 100644 --- a/ggml.c +++ b/ggml.c @@ -79,7 +79,7 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif -/*#define GGML_PERF*/ +#define GGML_PERF #define GGML_DEBUG 0 #define GGML_GELU_FP16 diff --git a/whisper.cpp b/whisper.cpp index 14b04d7..1b72966 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2160,6 +2160,12 @@ static bool whisper_decode_internal( ggml_graph_compute (ctx0, &gf); } + // print the time for computing the last ggml_mul_mat that computes logits + // also print the total decoder time + // these need to be called after ggml_graph_compute() + printf("logits t = %7.3f ms (%2d runs, N = %3d, ggml_mul_mat: [%d x %d] * [%d x %d])\n", 1e-3*double(logits->perf_time_us)/logits->perf_runs, logits->perf_runs, N, logits->ne[0], logits->ne[1], cur->ne[1], cur->ne[0]); + printf("total t = %7.3f ms (%2d runs)\n", 1e-3*double(gf.perf_time_us)/gf.perf_runs, gf.perf_runs); + // extract logits for all N tokens //logits_out.resize(N*n_vocab); //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);