From bde7ee3751ba167e7ba38ab8acaf04100e24815a Mon Sep 17 00:00:00 2001
From: Abitofevrything
Date: Sun, 8 Jan 2023 10:26:58 +0100
Subject: [PATCH] Remove ggml_cycles and ggml_perf timings; replace with
 ggml_process_time

---
 examples/talk.wasm/gpt-2.cpp |   4 +-
 examples/talk/gpt-2.cpp      |   4 +-
 ggml.c                       | 136 +++++++++++++++++------------------
 ggml.h                       |  16 ++---
 whisper.cpp                  |   2 +-
 5 files changed, 77 insertions(+), 85 deletions(-)

diff --git a/examples/talk.wasm/gpt-2.cpp b/examples/talk.wasm/gpt-2.cpp
index bc5e099..189cc31 100644
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@@ -837,14 +837,14 @@ struct gpt2_context * gpt2_init(const char * path_model) {
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = ggml_real_time_us();
 
         if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
             return nullptr;
         }
 
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
+        const int64_t t_load_us = ggml_real_time_us() - t_start_us;
 
         printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
     }
diff --git a/examples/talk/gpt-2.cpp b/examples/talk/gpt-2.cpp
index c28644d..f8a4be8 100644
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@@ -837,7 +837,7 @@ struct gpt2_context * gpt2_init(const char * path_model) {
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = ggml_real_time_us();
 
         if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
@@ -845,7 +845,7 @@ struct gpt2_context * gpt2_init(const char * path_model) {
             return nullptr;
         }
 
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
+        const int64_t t_load_us = ggml_real_time_us() - t_start_us;
 
         printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
     }
diff --git a/ggml.c b/ggml.c
index c59ee64..a0eb09a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -287,49 +287,41 @@ void ggml_time_init(void) {
     QueryPerformanceFrequency(&frequency);
     timer_freq = frequency.QuadPart;
 }
-int64_t ggml_time_ms(void) {
+int64_t ggml_real_time_ms(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
     return (t.QuadPart * 1000) / timer_freq;
 }
-int64_t ggml_time_us(void) {
+int64_t ggml_real_time_us(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
     return (t.QuadPart * 1000000) / timer_freq;
 }
 #else
 void ggml_time_init(void) {}
-int64_t ggml_time_ms(void) {
+int64_t ggml_real_time_ms(void) {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
 }
 
-int64_t ggml_time_us(void) {
+int64_t ggml_real_time_us(void) {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
 }
-#endif
 
-int64_t ggml_cycles(void) {
-    return clock();
+int64_t ggml_process_time_ms(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
 }
 
-int64_t ggml_cycles_per_ms(void) {
-    return CLOCKS_PER_SEC/1000;
+int64_t ggml_process_time_us(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
 }
-
-#ifdef GGML_PERF
-#define ggml_perf_time_ms() ggml_time_ms()
-#define ggml_perf_time_us() ggml_time_us()
-#define ggml_perf_cycles() ggml_cycles()
-#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
-#else
-#define ggml_perf_time_ms() 0
-#define ggml_perf_time_us() 0
-#define ggml_perf_cycles() 0
-#define ggml_perf_cycles_per_ms() 0
 #endif
 
 //
@@ -1477,7 +1469,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     if (is_first_call) {
         // initialize GELU, EXP and F32 tables
         {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+            const uint64_t t_start = ggml_real_time_us(); UNUSED(t_start);
 
             ggml_fp16_t ii;
             for (int i = 0; i < (1 << 16); ++i) {
@@ -1488,14 +1480,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
             }
 
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+            const uint64_t t_end = ggml_real_time_us(); UNUSED(t_end);
 
             GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
         // initialize g_state
         {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+            const uint64_t t_start = ggml_real_time_us(); UNUSED(t_start);
 
             g_state = (struct ggml_state) {
                 /*.contexts =*/ { { 0 } },
@@ -1505,7 +1497,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 g_state.contexts[i].used = false;
             }
 
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+            const uint64_t t_end = ggml_real_time_us(); UNUSED(t_end);
 
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
@@ -1657,8 +1649,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.opt =*/ { NULL },
         /*.n_tasks =*/ 0,
         /*.perf_runs =*/ 0,
-        /*.perf_cycles =*/ 0,
-        /*.perf_time_us =*/ 0,
+        /*.perf_process_time_us =*/ 0,
+        /*.perf_real_time_us =*/ 0,
         /*.data =*/ data == NULL ? (void *)(result + 1) : data,
         /*.pad =*/ { 0 },
     };
@@ -4307,7 +4299,7 @@ static void ggml_compute_forward_mul_mat_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int ne00 = src0->ne[0];
@@ -4401,7 +4393,7 @@ static void ggml_compute_forward_mul_mat_f32(
             }
         }
 
-        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
         return;
     }
@@ -4531,7 +4523,7 @@ static void ggml_compute_forward_mul_mat_f32(
         }
     }
 
-    //int64_t t1 = ggml_perf_time_us();
+    //int64_t t1 = ggml_process_time_us();
     //static int64_t acc = 0;
     //acc += t1 - t0;
     //if (t1 - t0 > 10) {
@@ -4550,7 +4542,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int ne00 = src0->ne[0];
@@ -4682,7 +4674,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             }
         }
 
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
         return;
     }
@@ -4836,7 +4828,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
     }
 
-    //int64_t t1 = ggml_time_us();
+    //int64_t t1 = ggml_real_time_us();
     //static int64_t acc = 0;
     //acc += t1 - t0;
     //if (t1 - t0 > 10) {
@@ -5304,7 +5296,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int ne00 = src0->ne[0];
@@ -5424,7 +5416,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int ne00 = src0->ne[0];
@@ -5570,7 +5562,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int ne00 = src0->ne[0];
@@ -5690,7 +5682,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int ne00 = src0->ne[0];
@@ -5834,7 +5826,7 @@ static void ggml_compute_forward_flash_attn_f32(
         const struct ggml_tensor * v,
         const bool masked,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int neq0 = q->ne[0];
@@ -6043,7 +6035,7 @@ static void ggml_compute_forward_flash_attn_f16(
         const struct ggml_tensor * v,
         const bool masked,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int neq0 = q->ne[0];
@@ -6318,7 +6310,7 @@ static void ggml_compute_forward_flash_ff_f16(
         const struct ggml_tensor * c0, // F16 proj_w
         const struct ggml_tensor * c1, // F32 proj_b
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = ggml_process_time_us();
     UNUSED(t0);
 
     const int nea0 = a->ne[0];
@@ -6997,8 +6989,8 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
         /*.grads =*/ { NULL },
         /*.leafs =*/ { NULL },
         /*.perf_runs =*/ 0,
-        /*.perf_cycles =*/ 0,
-        /*.perf_time_us =*/ 0,
+        /*.perf_process_time_us =*/ 0,
+        /*.perf_real_time_us =*/ 0,
     };
 
     ggml_build_forward_impl(&result, tensor, false);
@@ -7404,8 +7396,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         }
     }
 
-    const int64_t perf_start_cycles = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
+    const int64_t perf_process_time_start_us = ggml_process_time_us();
+    const int64_t perf_real_time_start_us = ggml_real_time_us();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
@@ -7417,8 +7409,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         //    continue;
         //}
 
-        const int64_t perf_node_start_cycles = ggml_perf_cycles();
-        const int64_t perf_node_start_time_us = ggml_perf_time_us();
+        const int64_t perf_node_process_time_start_us = ggml_process_time_us();
+        const int64_t perf_node_real_time_start_us = ggml_real_time_us();
 
         // INIT
         struct ggml_compute_params params = {
@@ -7543,12 +7535,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // performance stats (node)
         {
-            int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
-            int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
+            int64_t perf_cur_process_time_us = ggml_process_time_us() - perf_node_process_time_start_us;
+            int64_t perf_cur_real_time_us = ggml_real_time_us() - perf_node_real_time_start_us;
 
             node->perf_runs++;
-            node->perf_cycles += perf_cycles_cur;
-            node->perf_time_us += perf_time_us_cur;
+            node->perf_process_time_us += perf_cur_process_time_us;
+            node->perf_real_time_us += perf_cur_real_time_us;
         }
     }
 
@@ -7568,19 +7560,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     // performance stats (graph)
     {
-        int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
+        int64_t perf_cur_process_time_us = ggml_process_time_us() - perf_process_time_start_us;
+        int64_t perf_cur_real_time_us = ggml_real_time_us() - perf_real_time_start_us;
 
         cgraph->perf_runs++;
-        cgraph->perf_cycles += perf_cycles_cur;
-        cgraph->perf_time_us += perf_time_us_cur;
+        cgraph->perf_process_time_us += perf_cur_process_time_us;
+        cgraph->perf_real_time_us += perf_cur_real_time_us;
 
         GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
                 __func__, cgraph->perf_runs,
-                (double) perf_cycles_cur / (double) ggml_cycles_per_ms(),
-                (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
-                (double) perf_time_us_cur / 1000.0,
-                (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
+                (double) perf_cur_process_time_us / 1000.0,
+                (double) cgraph->perf_process_time_us / 1000.0 / (double) cgraph->perf_runs,
+                (double) perf_cur_real_time_us / 1000.0,
+                (double) cgraph->perf_real_time_us / 1000.0 / (double) cgraph->perf_runs);
     }
 }
 
@@ -7606,16 +7598,16 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
-        perf_total_per_op_us[node->op] += node->perf_time_us;
+        perf_total_per_op_us[node->op] += node->perf_real_time_us;
 
         GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
-                (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
-                (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
-                (double) node->perf_time_us / 1000.0,
-                (double) node->perf_time_us / 1000.0 / node->perf_runs);
+                (double) node->perf_process_time_us / 1000.0,
+                (double) node->perf_process_time_us / 1000.0 / (double) node->perf_runs,
+                (double) node->perf_real_time_us / 1000.0,
+                (double) node->perf_real_time_us / 1000.0 / node->perf_runs);
     }
 
     GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
@@ -7894,10 +7886,10 @@ static enum ggml_opt_result ggml_opt_adam(
                     ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
         }
 
-        const int64_t t_start_wall = ggml_time_us();
-        const int64_t t_start_cpu = ggml_cycles();
-        UNUSED(t_start_wall);
-        UNUSED(t_start_cpu);
+        const int64_t t_real_start_us = ggml_real_time_us();
+        const int64_t t_process_start_us = ggml_process_time_us();
+        UNUSED(t_real_start_us);
+        UNUSED(t_process_start_us);
 
         {
             // update the gradient
@@ -7977,13 +7969,13 @@ static enum ggml_opt_result ggml_opt_adam(
         fx_prev = fx;
 
         {
-            const int64_t t_end_cpu = ggml_cycles();
-            GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
-            UNUSED(t_end_cpu);
+            const int64_t t_process_end_us = ggml_process_time_us();
+            GGML_PRINT_DEBUG("time iter: %5.3f s\n", (t_process_end_us - t_process_start_us)/1e6);
+            UNUSED(t_process_end_us);
 
-            const int64_t t_end_wall = ggml_time_us();
-            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
-            UNUSED(t_end_wall);
+            const int64_t t_real_end_us = ggml_real_time_us();
+            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_real_end_us - t_real_start_us)/1e6);
+            UNUSED(t_real_end_us);
         }
     }
 
diff --git a/ggml.h b/ggml.h
index f3c9e5a..f8febe6 100644
--- a/ggml.h
+++ b/ggml.h
@@ -275,8 +275,8 @@ struct ggml_tensor {
 
     // performance
     int perf_runs;
-    int64_t perf_cycles;
-    int64_t perf_time_us;
+    int64_t perf_process_time_us;
+    int64_t perf_real_time_us;
 
     void * data;
     char padding[8];
@@ -297,8 +297,8 @@ struct ggml_cgraph {
 
     // performance
     int perf_runs;
-    int64_t perf_cycles;
-    int64_t perf_time_us;
+    int64_t perf_process_time_us;
+    int64_t perf_real_time_us;
 };
 
 struct ggml_init_params {
@@ -308,10 +308,10 @@ struct ggml_init_params {
 };
 
 void ggml_time_init(void); // call this once at the beginning of the program
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
-int64_t ggml_cycles(void);
-int64_t ggml_cycles_per_ms(void);
+int64_t ggml_real_time_ms(void);
+int64_t ggml_real_time_us(void);
+int64_t ggml_process_time_ms(void);
+int64_t ggml_process_time_us(void);
 
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);
diff --git a/whisper.cpp b/whisper.cpp
index c400856..b74c515 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2642,7 +2642,7 @@ whisper_token whisper_token_transcribe(void) {
 }
 
 void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
+    const int64_t t_end_us = ggml_real_time_us();
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
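
Note (reviewer sketch, not part of the patch): ggml.h above declares ggml_process_time_ms()/ggml_process_time_us() for every platform, but in ggml.c the new definitions land inside the clock_gettime() branch only; the QueryPerformanceCounter branch just gets the renamed real-time functions, so a Windows build would be left with unresolved symbols. A minimal sketch of what the Windows side could look like, assuming GetProcessTimes() is an acceptable source of per-process CPU time (the helper name below is illustrative and not taken from the patch):

    #if defined(_MSC_VER) || defined(__MINGW32__)
    #include <windows.h>
    #include <stdint.h>

    // FILETIME counts 100-nanosecond ticks; convert to microseconds.
    static int64_t ggml_filetime_to_us(FILETIME ft) {
        ULARGE_INTEGER u;
        u.LowPart  = ft.dwLowDateTime;
        u.HighPart = ft.dwHighDateTime;
        return (int64_t) (u.QuadPart / 10);
    }

    int64_t ggml_process_time_us(void) {
        FILETIME creation_time, exit_time, kernel_time, user_time;
        GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
        // CPU time consumed by the process = kernel time + user time
        return ggml_filetime_to_us(kernel_time) + ggml_filetime_to_us(user_time);
    }

    int64_t ggml_process_time_ms(void) {
        return ggml_process_time_us() / 1000;
    }
    #endif

One caveat with this approach: GetProcessTimes() typically advances at the scheduler tick (on the order of 15 ms), so the per-node "cpu" columns printed by ggml_graph_print() would be much coarser than with CLOCK_PROCESS_CPUTIME_ID.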