From d573de4f94f26dda7316d91ed79e1622707264d9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 15 Jan 2023 12:27:19 +0200 Subject: [PATCH] Cosmetic stuff --- extra/bench-all.sh | 54 +++++++-------- ggml.c | 162 ++++++++++++++++++++++----------------- ggml.h | 16 ++--- whisper.cpp | 148 ++++++++++++++++++++--------------- 4 files changed, 189 insertions(+), 191 deletions(-) diff --git a/extra/bench-all.sh b/extra/bench-all.sh index b038894..7c0325c 100755 --- a/extra/bench-all.sh +++ b/extra/bench-all.sh @@ -19,22 +19,23 @@ printf "\n" cat >&2 << EOF How to interpret these results: -- CPU is your CPU model. -- OS is your current operating system. -- Model is the GGML model being benchmarked. -- Threads is the number of threads used. -- Total Load is the total amount of CPU time whisper took to load the model. -- Real Load is the real amount of time your computer took to load the model. -- Total Encode is the total amount of CPU time whisper took to encode the input. - If you're using multiple threads, the time spent in each thread will be added together. -- Real Encode is the real amount of time your computer took to encode the input. - This should be approximately (Total Encode / Threads). If it isn't, you likely have another program making use of the CPU. +- CPU is your CPU model +- OS is your current operating system +- Model is the GGML model being benchmarked +- Threads is the number of threads used +- Load is the time your computer took to load the model +- Encode is the time it took to run the Whisper encoder +- Time is reported as (real / process): + real: This is the wall-clock time in ms + process: This is the CPU time. If you're using multiple threads, the time spent in each thread will be added together. + The process time should be approximately (Proc Enc. / Threads). + If it isn't, you likely have another program making use of the CPU - Commit is the current git commit. 
EOF -printf "| CPU | OS | Config | Model | Threads | Total Load | Real Load | Total Encode | Real Encode | Commit |\n" -printf "| ------- | ------ | ---------------- | ------- | ------- | ---------- | --------- | ------------ | ----------- | -------- |\n" +printf "| CPU | OS | Config | Model | Threads | Load | Encode | Commit |\n" +printf "| ------- | ------ | ---------------- | ------- | ------- | ----- | ------------- | -------- |\n" for model in "${models[@]}"; do # run once to heat-up the cache @@ -45,31 +46,32 @@ for model in "${models[@]}"; do output=$(./bench -m "./models/ggml-$model.bin" -t "$n_threads" 2>&1) # parse the output: - total_load_time=$(echo "$output" | grep "load time" | awk '{print $5}') - real_load_time=$(echo "$output" | grep "load time" | awk '{print $8}') - total_encode_time=$(echo "$output" | grep "encode time" | awk '{print $5}') - real_encode_time=$(echo "$output" | grep "encode time" | awk '{print $8}') + load_proc=$(echo "$output" | grep "load time" | awk '{print $8}') + load_real=$(echo "$output" | grep "load time" | awk '{print $5}') + encode_proc=$(echo "$output" | grep "encode time" | awk '{print $8}') + encode_real=$(echo "$output" | grep "encode time" | awk '{print $5}') system_info=$(echo "$output" | grep "system_info") - n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}') + n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}') # floor to milliseconds - total_load_time=${total_load_time%.*} - real_load_time=${real_load_time%.*} - total_encode_time=${total_encode_time%.*} - real_encode_time=${real_encode_time%.*} + load_proc=${load_proc%.*} + load_real=${load_real%.*} + encode_proc=${encode_proc%.*} + encode_real=${encode_real%.*} + + load_str="$load_real" + encode_str="$encode_real / $encode_proc" config=$(echo "$system_info" | sed 's/ | /\n/g' | tail -n +2 | awk '/ = 1/{print $1}' | tr '\n' ' ') commit=$(git rev-parse --short HEAD) - printf "| | | %-16s | %-7s | %-7s | %-10s | %-9s | %-12s | %-11s 
| %-8s |\n" \ + printf "| | | %-16s | %-7s | %-7s | %-5s | %-13s | %-8s |\n" \ "$config" \ "$model" \ "$n_threads" \ - "$total_load_time" \ - "$real_load_time" \ - "$total_encode_time" \ - "$real_encode_time" \ + "$load_str" \ + "$encode_str" \ "$commit" done diff --git a/ggml.c b/ggml.c index da0313f..12a0dd2 100644 --- a/ggml.c +++ b/ggml.c @@ -288,46 +288,46 @@ void ggml_time_init(void) { timer_freq = frequency.QuadPart; } -int64_t ggml_real_time_ms(void) { +int64_t ggml_time_real_ms(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return (t.QuadPart * 1000) / timer_freq; } -int64_t ggml_real_time_us(void) { +int64_t ggml_time_real_us(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return (t.QuadPart * 1000000) / timer_freq; } -int64_t ggml_process_time_ms(void) { +int64_t ggml_time_proc_ms(void) { return (clock() * 1000) / CLOCKS_PER_SEC; } -int64_t ggml_process_time_us(void) { +int64_t ggml_time_proc_us(void) { return (clock() * 1000000) / CLOCKS_PER_SEC; } #else void ggml_time_init(void) {} -int64_t ggml_real_time_ms(void) { +int64_t ggml_time_real_ms(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; } -int64_t ggml_real_time_us(void) { +int64_t ggml_time_real_us(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; } -int64_t ggml_process_time_ms(void) { +int64_t ggml_time_proc_ms(void) { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; } -int64_t ggml_process_time_us(void) { +int64_t ggml_time_proc_us(void) { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; @@ -335,11 +335,11 @@ int64_t ggml_process_time_us(void) { #endif #ifdef GGML_PERF -#define ggml_perf_real_time_us() ggml_real_time_us() -#define ggml_perf_process_time_us() 
ggml_process_time_us() +#define GGML_PERF_TIME_REAL_US() ggml_time_real_us() +#define GGML_PERF_TIME_PROC_US() ggml_time_proc_us() #else -#define ggml_perf_real_time_us() 0 -#define ggml_perf_process_time_us() 0 +#define GGML_PERF_TIME_REAL_US() 0 +#define GGML_PERF_TIME_PROC_US() 0 #endif // @@ -1487,7 +1487,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { if (is_first_call) { // initialize GELU, EXP and F32 tables { - const uint64_t t_start = ggml_perf_real_time_us(); UNUSED(t_start); + const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start); ggml_fp16_t ii; for (int i = 0; i < (1 << 16); ++i) { @@ -1498,14 +1498,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f)); } - const uint64_t t_end = ggml_perf_real_time_us(); UNUSED(t_end); + const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end); GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } // initialize g_state { - const uint64_t t_start = ggml_perf_real_time_us(); UNUSED(t_start); + const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start); g_state = (struct ggml_state) { /*.contexts =*/ { { 0 } }, @@ -1515,7 +1515,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { g_state.contexts[i].used = false; } - const uint64_t t_end = ggml_perf_real_time_us(); UNUSED(t_end); + const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end); GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } @@ -1655,22 +1655,22 @@ struct ggml_tensor * ggml_new_tensor_impl( ggml_assert_aligned(result); *result = (struct ggml_tensor) { - /*.type =*/ type, - /*.n_dims =*/ n_dims, - /*.ne =*/ { 1, 1, 1, 1 }, - /*.nb =*/ { 0, 0, 0, 0 }, - /*.op =*/ GGML_OP_NONE, - /*.is_param =*/ false, - /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, - /*.perf_runs =*/ 0, - 
/*.perf_process_time_us =*/ 0, - /*.perf_real_time_us =*/ 0, - /*.data =*/ data == NULL ? (void *)(result + 1) : data, - /*.pad =*/ { 0 }, + /*.type =*/ type, + /*.n_dims =*/ n_dims, + /*.ne =*/ { 1, 1, 1, 1 }, + /*.nb =*/ { 0, 0, 0, 0 }, + /*.op =*/ GGML_OP_NONE, + /*.is_param =*/ false, + /*.grad =*/ NULL, + /*.src0 =*/ NULL, + /*.src1 =*/ NULL, + /*.opt =*/ { NULL }, + /*.n_tasks =*/ 0, + /*.perf_runs =*/ 0, + /*.perf_time_proc_us =*/ 0, + /*.perf_time_real_us =*/ 0, + /*.data =*/ data == NULL ? (void *)(result + 1) : data, + /*.pad =*/ { 0 }, }; ggml_assert_aligned(result->data); @@ -4317,7 +4317,7 @@ static void ggml_compute_forward_mul_mat_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -4411,7 +4411,7 @@ static void ggml_compute_forward_mul_mat_f32( } } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } @@ -4541,7 +4541,7 @@ static void ggml_compute_forward_mul_mat_f32( } } - //int64_t t1 = ggml_perf_process_time_us(); + //int64_t t1 = GGML_PERF_TIME_PROC_US(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -4560,7 +4560,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -4692,7 +4692,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, 
ne1, ne2, ne3); return; } @@ -4846,7 +4846,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } - //int64_t t1 = ggml_perf_real_time_us(); + //int64_t t1 = GGML_PERF_TIME_REAL_US(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -5314,7 +5314,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5434,7 +5434,7 @@ static void ggml_compute_forward_conv_1d_1s_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5580,7 +5580,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5700,7 +5700,7 @@ static void ggml_compute_forward_conv_1d_2s_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5844,7 +5844,7 @@ static void ggml_compute_forward_flash_attn_f32( const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int neq0 = q->ne[0]; @@ -6053,7 +6053,7 @@ static void ggml_compute_forward_flash_attn_f16( const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int neq0 = q->ne[0]; @@ -6328,7 +6328,7 @@ static void 
ggml_compute_forward_flash_ff_f16( const struct ggml_tensor * c0, // F16 proj_w const struct ggml_tensor * c1, // F32 proj_b struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int nea0 = a->ne[0]; @@ -6998,17 +6998,17 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.n_threads =*/ 0, - /*.work_size =*/ 0, - /*.work =*/ NULL, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.perf_runs =*/ 0, - /*.perf_process_time_us =*/ 0, - /*.perf_real_time_us =*/ 0, + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.n_threads =*/ 0, + /*.work_size =*/ 0, + /*.work =*/ NULL, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_time_proc_us =*/ 0, + /*.perf_time_real_us =*/ 0, }; ggml_build_forward_impl(&result, tensor, false); @@ -7414,8 +7414,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } - const int64_t perf_process_time_start_us = ggml_perf_process_time_us(); - const int64_t perf_real_time_start_us = ggml_perf_real_time_us(); + const int64_t perf_time_proc_start_us = GGML_PERF_TIME_PROC_US(); + const int64_t perf_time_real_start_us = GGML_PERF_TIME_REAL_US(); for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); @@ -7427,8 +7427,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // continue; //} - const int64_t perf_node_process_time_start_us = ggml_perf_process_time_us(); - const int64_t perf_node_real_time_start_us = ggml_perf_real_time_us(); + const int64_t perf_node_time_proc_start_us = GGML_PERF_TIME_PROC_US(); + const int64_t perf_node_time_real_start_us = GGML_PERF_TIME_REAL_US(); // INIT struct 
ggml_compute_params params = { @@ -7553,12 +7553,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // performance stats (node) { - int64_t perf_cur_process_time_us = ggml_perf_process_time_us() - perf_node_process_time_start_us; - int64_t perf_cur_real_time_us = ggml_perf_real_time_us() - perf_node_real_time_start_us; + int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_node_time_proc_start_us; + int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_node_time_real_start_us; node->perf_runs++; - node->perf_process_time_us += perf_cur_process_time_us; - node->perf_real_time_us += perf_cur_real_time_us; + node->perf_time_proc_us += perf_cur_time_proc_us; + node->perf_time_real_us += perf_cur_time_real_us; } } @@ -7578,19 +7578,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // performance stats (graph) { - int64_t perf_cur_process_time_us = ggml_perf_process_time_us() - perf_process_time_start_us; - int64_t perf_cur_real_time_us = ggml_perf_real_time_us() - perf_real_time_start_us; + int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_time_proc_start_us; + int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_time_real_start_us; cgraph->perf_runs++; - cgraph->perf_process_time_us += perf_cur_process_time_us; - cgraph->perf_real_time_us += perf_cur_real_time_us; + cgraph->perf_time_proc_us += perf_cur_time_proc_us; + cgraph->perf_time_real_us += perf_cur_time_real_us; GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, cgraph->perf_runs, - (double) perf_cur_process_time_us / 1000.0, - (double) cgraph->perf_process_time_us / 1000.0 / (double) cgraph->perf_runs, - (double) perf_cur_real_time_us / 1000.0, - (double) cgraph->perf_real_time_us / 1000.0 / (double) cgraph->perf_runs); + (double) perf_cur_time_proc_us / 1000.0, + (double) cgraph->perf_time_proc_us / 1000.0 / (double) cgraph->perf_runs, + (double) 
perf_cur_time_real_us / 1000.0, + (double) cgraph->perf_time_real_us / 1000.0 / (double) cgraph->perf_runs); } } @@ -7616,16 +7616,16 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - perf_total_per_op_us[node->op] += node->perf_real_time_us; + perf_total_per_op_us[node->op] += node->perf_time_real_us; GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, - (double) node->perf_process_time_us / 1000.0, - (double) node->perf_process_time_us / 1000.0 / (double) node->perf_runs, - (double) node->perf_real_time_us / 1000.0, - (double) node->perf_real_time_us / 1000.0 / node->perf_runs); + (double) node->perf_time_proc_us / 1000.0, + (double) node->perf_time_proc_us / 1000.0 / (double) node->perf_runs, + (double) node->perf_time_real_us / 1000.0, + (double) node->perf_time_real_us / 1000.0 / node->perf_runs); } GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); @@ -7904,8 +7904,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); } - const int64_t t_real_start_us = ggml_perf_real_time_us(); - const int64_t t_process_start_us = ggml_perf_process_time_us(); + const int64_t t_real_start_us = GGML_PERF_TIME_REAL_US(); + const int64_t t_process_start_us = GGML_PERF_TIME_PROC_US(); UNUSED(t_real_start_us); UNUSED(t_process_start_us); @@ -7987,11 +7987,11 @@ static enum ggml_opt_result ggml_opt_adam( fx_prev = fx; { - const int64_t t_process_end_us = ggml_perf_process_time_us(); + const int64_t t_process_end_us = GGML_PERF_TIME_PROC_US(); GGML_PRINT_DEBUG("time iter: %5.3f s\n", (t_process_end_us - t_process_start_us)/1e6); UNUSED(t_process_end_us); - const int64_t t_real_end_us = ggml_perf_real_time_us(); + const int64_t t_real_end_us = 
GGML_PERF_TIME_REAL_US(); GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_real_end_us - t_real_start_us)/1e6); UNUSED(t_real_end_us); } diff --git a/ggml.h b/ggml.h index f8febe6..bc3d673 100644 --- a/ggml.h +++ b/ggml.h @@ -275,8 +275,8 @@ struct ggml_tensor { // performance int perf_runs; - int64_t perf_process_time_us; - int64_t perf_real_time_us; + int64_t perf_time_proc_us; + int64_t perf_time_real_us; void * data; char padding[8]; @@ -297,8 +297,8 @@ struct ggml_cgraph { // performance int perf_runs; - int64_t perf_process_time_us; - int64_t perf_real_time_us; + int64_t perf_time_proc_us; + int64_t perf_time_real_us; }; struct ggml_init_params { @@ -308,10 +308,10 @@ struct ggml_init_params { }; void ggml_time_init(void); // call this once at the beginning of the program -int64_t ggml_real_time_ms(void); -int64_t ggml_real_time_us(void); -int64_t ggml_process_time_ms(void); -int64_t ggml_process_time_us(void); +int64_t ggml_time_real_ms(void); +int64_t ggml_time_real_us(void); +int64_t ggml_time_proc_ms(void); +int64_t ggml_time_proc_us(void); void ggml_print_object (const struct ggml_object * obj); void ggml_print_objects(const struct ggml_context * ctx); diff --git a/whisper.cpp b/whisper.cpp index 034b12c..57731e4 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -467,18 +467,18 @@ struct whisper_decoder { }; struct whisper_context { - int64_t t_real_load_us = 0; - int64_t t_process_load_us = 0; - int64_t t_real_mel_us = 0; - int64_t t_process_mel_us = 0; - int64_t t_real_sample_us = 0; - int64_t t_process_sample_us = 0; - int64_t t_real_encode_us = 0; - int64_t t_process_encode_us = 0; - int64_t t_real_decode_us = 0; - int64_t t_process_decode_us = 0; - int64_t t_real_start_us = 0; - int64_t t_process_start_us = 0; + int64_t t_load_real_us = 0; + int64_t t_load_proc_us = 0; + int64_t t_mel_real_us = 0; + int64_t t_mel_proc_us = 0; + int64_t t_sample_real_us = 0; + int64_t t_sample_proc_us = 0; + int64_t t_encode_real_us = 0; + int64_t t_encode_proc_us = 0; + 
int64_t t_decode_real_us = 0; + int64_t t_decode_proc_us = 0; + int64_t t_start_real_us = 0; + int64_t t_start_proc_us = 0; ggml_type wtype; // weight type (FP32 or FP16) @@ -603,11 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) { static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) { fprintf(stderr, "%s: loading model\n", __func__); - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); - wctx.t_real_start_us = t_real_start_us; - wctx.t_process_start_us = t_process_start_us; + wctx.t_start_real_us = t_start_real_us; + wctx.t_start_proc_us = t_start_proc_us; auto & model = wctx.model; auto & vocab = wctx.vocab; @@ -1216,8 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con wctx.rng = std::mt19937(0); - wctx.t_real_load_us = ggml_real_time_us() - t_real_start_us; - wctx.t_process_load_us = ggml_process_time_us() - t_process_start_us; + wctx.t_load_real_us = ggml_time_real_us() - t_start_real_us; + wctx.t_load_proc_us = ggml_time_proc_us() - t_start_proc_us; return true; } @@ -1235,8 +1235,8 @@ static bool whisper_encode( whisper_context & wctx, const int mel_offset, const int n_threads) { - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); const auto & model = wctx.model; const auto & mel_inp = wctx.mel; @@ -1629,8 +1629,8 @@ static bool whisper_encode( ggml_free(ctx0); - wctx.t_real_encode_us += ggml_real_time_us() - t_real_start_us; - wctx.t_process_encode_us += ggml_process_time_us() - t_process_start_us; + wctx.t_encode_real_us += ggml_time_real_us() - t_start_real_us; + wctx.t_encode_proc_us += ggml_time_proc_us() - 
t_start_proc_us; return true; } @@ -1652,8 +1652,8 @@ static bool whisper_decode( const int n_tokens, const int n_past, const int n_threads) { - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); const auto & model = wctx.model; const auto & hparams = model.hparams; @@ -2004,8 +2004,8 @@ static bool whisper_decode( ggml_free(ctx0); - wctx.t_real_decode_us += ggml_real_time_us() - t_real_start_us; - wctx.t_process_decode_us += ggml_process_time_us() - t_process_start_us; + wctx.t_decode_real_us += ggml_time_real_us() - t_start_real_us; + wctx.t_decode_proc_us += ggml_time_proc_us() - t_start_proc_us; return true; } @@ -2120,8 +2120,8 @@ static bool log_mel_spectrogram( const whisper_filters & filters, const bool speed_up, whisper_mel & mel) { - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); // Hanning window std::vector hann; @@ -2230,8 +2230,8 @@ static bool log_mel_spectrogram( mel.data[i] = (mel.data[i] + 4.0)/4.0; } - wctx.t_real_mel_us = ggml_real_time_us() - t_real_start_us; - wctx.t_process_mel_us = ggml_process_time_us() - t_process_start_us; + wctx.t_mel_real_us = ggml_time_real_us() - t_start_real_us; + wctx.t_mel_proc_us = ggml_time_proc_us() - t_start_proc_us; return true; } @@ -2657,29 +2657,25 @@ whisper_token whisper_token_transcribe(void) { } void whisper_print_timings(struct whisper_context * ctx) { - const int64_t t_real_end_us = ggml_real_time_us(); - const int64_t t_process_end_us = ggml_process_time_us(); + const int64_t t_real_end_us = ggml_time_real_us(); + const int64_t t_proc_end_us = ggml_time_proc_us(); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms (real %8.2f ms)\n", 
__func__, ctx->t_process_load_us/1000.0f, ctx->t_real_load_us/1000.0f); - fprintf(stderr, "%s: mel time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_mel_us/1000.0f, ctx->t_real_mel_us/1000.0f); - fprintf(stderr, "%s: sample time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_sample_us/1000.0f, ctx->t_real_sample_us/1000.0f); - fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__, - ctx->t_process_encode_us/1000.0f, ctx->t_process_encode_us/1000.0f/ctx->model.hparams.n_audio_layer, - ctx->t_real_encode_us/1000.0f, ctx->t_real_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); - fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__, - ctx->t_process_decode_us/1000.0f, ctx->t_process_decode_us/1000.0f/ctx->model.hparams.n_text_layer, - ctx->t_real_decode_us/1000.0f, ctx->t_real_decode_us/1000.0f/ctx->model.hparams.n_text_layer); - fprintf(stderr, "%s: total time = %8.2f ms (real %8.2f ms)\n", __func__, (t_process_end_us - ctx->t_process_start_us)/1000.f, (t_real_end_us - ctx->t_real_start_us)/1000.0f); + fprintf(stderr, "%s: load time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_load_real_us/1000.0f, ctx->t_load_proc_us/1000.0f); + fprintf(stderr, "%s: mel time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_mel_real_us/1000.0f, ctx->t_mel_proc_us/1000.0f); + fprintf(stderr, "%s: sample time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_sample_real_us/1000.0f, ctx->t_sample_proc_us/1000.0f); + fprintf(stderr, "%s: encode time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_encode_real_us/1000.0f, ctx->t_encode_proc_us/1000.0f); + fprintf(stderr, "%s: decode time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_decode_real_us/1000.0f, ctx->t_decode_proc_us/1000.0f); + fprintf(stderr, "%s: total time = %8.2f ms (proc %8.2f ms)\n", __func__, (t_real_end_us - ctx->t_start_real_us)/1000.f, (t_proc_end_us - 
ctx->t_start_proc_us)/1000.0f); } void whisper_reset_timings(struct whisper_context * ctx) { - ctx->t_real_sample_us = 0; - ctx->t_process_sample_us = 0; - ctx->t_real_encode_us = 0; - ctx->t_process_encode_us = 0; - ctx->t_real_decode_us = 0; - ctx->t_process_decode_us = 0; + ctx->t_sample_real_us = 0; + ctx->t_sample_proc_us = 0; + ctx->t_encode_real_us = 0; + ctx->t_encode_proc_us = 0; + ctx->t_decode_real_us = 0; + ctx->t_decode_proc_us = 0; } const char * whisper_print_system_info(void) { @@ -3478,8 +3474,8 @@ int whisper_full( } { - const int64_t t_real_start_sample_us = ggml_real_time_us(); - const int64_t t_process_start_sample_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur); @@ -3498,14 +3494,14 @@ int whisper_full( memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0])); } - ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; - ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; + ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us; + ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us; } } for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) { - const int64_t t_real_start_sample_us = ggml_real_time_us(); - const int64_t t_process_start_sample_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); // store the KV caches of all decoders when doing beam-search if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) { @@ -3698,8 +3694,8 @@ int whisper_full( } } - ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; - ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; + ctx->t_sample_real_us += 
ggml_time_real_us() - t_start_real_us; + ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us; // obtain logits for the next token for (int j = 0; j < n_decoders_cur; ++j) { @@ -3720,15 +3716,15 @@ int whisper_full( } { - const int64_t t_real_start_sample_us = ggml_real_time_us(); - const int64_t t_process_start_sample_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); whisper_process_logits(*ctx, params, decoder, t_cur); ++decoder.kv_self.n; - ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; - ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; + ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us; + ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us; } } } @@ -4021,14 +4017,14 @@ int whisper_full_parallel( } } - ctx->t_real_mel_us += ctxs[i].t_real_mel_us; - ctx->t_process_mel_us += ctxs[i].t_process_mel_us; - ctx->t_real_sample_us += ctxs[i].t_real_sample_us; - ctx->t_process_sample_us += ctxs[i].t_process_sample_us; - ctx->t_real_encode_us += ctxs[i].t_real_encode_us; - ctx->t_process_encode_us += ctxs[i].t_process_encode_us; - ctx->t_real_decode_us += ctxs[i].t_real_decode_us; - ctx->t_process_decode_us += ctxs[i].t_process_decode_us; + ctx->t_mel_real_us += ctxs[i].t_mel_real_us; + ctx->t_mel_proc_us += ctxs[i].t_mel_proc_us; + ctx->t_sample_real_us += ctxs[i].t_sample_real_us; + ctx->t_sample_proc_us += ctxs[i].t_sample_proc_us; + ctx->t_encode_real_us += ctxs[i].t_encode_real_us; + ctx->t_encode_proc_us += ctxs[i].t_encode_proc_us; + ctx->t_decode_real_us += ctxs[i].t_decode_real_us; + ctx->t_decode_proc_us += ctxs[i].t_decode_proc_us; kv_cache_free(ctx->kv_cross); @@ -4038,14 +4034,14 @@ int whisper_full_parallel( } // average the timings - ctx->t_real_mel_us /= n_processors; - ctx->t_process_mel_us /= n_processors; - ctx->t_real_sample_us /= n_processors; - ctx->t_process_sample_us /= 
n_processors; - ctx->t_real_encode_us /= n_processors; - ctx->t_process_encode_us /= n_processors; - ctx->t_real_decode_us /= n_processors; - ctx->t_process_decode_us /= n_processors; + ctx->t_mel_real_us /= n_processors; + ctx->t_mel_proc_us /= n_processors; + ctx->t_sample_real_us /= n_processors; + ctx->t_sample_proc_us /= n_processors; + ctx->t_encode_real_us /= n_processors; + ctx->t_encode_proc_us /= n_processors; + ctx->t_decode_real_us /= n_processors; + ctx->t_decode_proc_us /= n_processors; // print information about the audio boundaries fprintf(stderr, "\n");