pull/387/merge
Abitofevrything 2 years ago committed by GitHub
commit 13204489d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -837,14 +837,14 @@ struct gpt2_context * gpt2_init(const char * path_model) {
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = ggml_real_time_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
return nullptr;
}
const int64_t t_load_us = ggml_time_us() - t_start_us;
const int64_t t_load_us = ggml_real_time_us() - t_start_us;
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
}

@ -837,7 +837,7 @@ struct gpt2_context * gpt2_init(const char * path_model) {
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = ggml_time_real_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
@ -845,7 +845,7 @@ struct gpt2_context * gpt2_init(const char * path_model) {
return nullptr;
}
const int64_t t_load_us = ggml_time_us() - t_start_us;
const int64_t t_load_us = ggml_time_real_us() - t_start_us;
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
}

@ -29,42 +29,60 @@ printf "Running benchmark for all models\n"
printf "This can take a while!\n"
printf "\n"
printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
cat >&2 << EOF
How to interpret these results:
- CPU is your CPU model
- OS is your current operating system
- Model is the GGML model being benchmarked
- Threads is the number of threads used
- Load is the time your computer took to load the model
- Encode is the time it took to run the Whisper encoder
- Time is reported as (real / process):
real: This is the wall-clock time in ms
process: This is the CPU time. If you're using multiple threads, the time spent in each thread will be added together.
The process time should be approximately (Proc Enc. / Threads).
If it isn't, you likely have another program making use of the CPU
- Commit is the current git commit.
EOF
printf "| CPU | OS | Config | Model | Th | Load | Encode | Commit |\n"
printf "| ------- | ------ | ---------------- | ------- | -- | ----- | ------------- | -------- |\n"
for model in "${models[@]}"; do
# run once to heat-up the cache
./bench -m ./models/ggml-$model.bin -t $n_threads 2>/dev/null 1>/dev/null
./bench -m "./models/ggml-$model.bin" -t "$n_threads" 2>/dev/null 1>/dev/null
# actual run
# store stderr output in a variable in order to parse it later
output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
output=$(./bench -m "./models/ggml-$model.bin" -t "$n_threads" 2>&1)
# parse the output:
load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
encode_time=$(echo "$output" | grep "encode time" | awk '{print $5}')
load_proc=$(echo "$output" | grep "load time" | awk '{print $8}')
load_real=$(echo "$output" | grep "load time" | awk '{print $5}')
encode_proc=$(echo "$output" | grep "encode time" | awk '{print $8}')
encode_real=$(echo "$output" | grep "encode time" | awk '{print $5}')
system_info=$(echo "$output" | grep "system_info")
n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
# floor to milliseconds
load_time=${load_time%.*}
encode_time=${encode_time%.*}
load_proc=${load_proc%.*}
load_real=${load_real%.*}
encode_proc=${encode_proc%.*}
encode_real=${encode_real%.*}
config=""
load_str="$load_real"
encode_str="$encode_real / $encode_proc"
if [[ $system_info == *"AVX2 = 1"* ]]; then
config="$config AVX2"
fi
if [[ $system_info == *"NEON = 1"* ]]; then
config="$config NEON"
fi
if [[ $system_info == *"BLAS = 1"* ]]; then
config="$config BLAS"
fi
config=$(echo "$system_info" | sed 's/ | /\n/g' | tail -n +2 | awk '/ = 1/{print $1}' | tr '\n' ' ')
commit=$(git rev-parse --short HEAD)
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
printf "| <todo> | <todo> | %-16s | %-7s | %-2s | %-5s | %-13s | %-8s |\n" \
"$config" \
"$model" \
"$n_threads" \
"$load_str" \
"$encode_str" \
"$commit"
done

167
ggml.c

@ -282,54 +282,83 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
#if defined(_MSC_VER) || defined(__MINGW32__)
static int64_t timer_freq;
static HANDLE current_process_handle;
void ggml_time_init(void) {
    // Cache the QueryPerformanceCounter tick rate and a pseudo-handle to the
    // current process once, so the per-call timer helpers below avoid
    // repeating these lookups.
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    timer_freq = freq.QuadPart;
    current_process_handle = GetCurrentProcess();
}
int64_t ggml_time_ms(void) {
int64_t ggml_time_real_ms(void) {
    // Wall-clock time in milliseconds from the high-resolution counter.
    // Convert whole seconds and the sub-second remainder separately so the
    // intermediate multiply cannot overflow int64_t on long uptimes
    // (t.QuadPart * 1000 would overflow well before the division).
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return (t.QuadPart / timer_freq) * 1000 +
           ((t.QuadPart % timer_freq) * 1000) / timer_freq;
}
int64_t ggml_time_us(void) {
int64_t ggml_time_real_us(void) {
    // Wall-clock time in microseconds from the high-resolution counter.
    // With a 10 MHz QPC frequency, t.QuadPart * 1000000 overflows int64_t
    // after roughly 10 days of uptime, so split the conversion into
    // whole-seconds and remainder parts instead of multiplying first.
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return (t.QuadPart / timer_freq) * 1000000 +
           ((t.QuadPart % timer_freq) * 1000000) / timer_freq;
}
// Query only user time for process times as CLOCK_PROCESS_CPUTIME_ID does not include kernel time on Unix systems.
int64_t ggml_time_proc_ms(void) {
FILETIME user_time;
GetProcessTimes(current_process_handle, NULL, NULL, NULL, &user_time);
ULARGE_INTEGER t;
t.u.LowPart = user_time.dwLowDateTime;
t.u.HighPart = user_time.dwHighDateTime;
return t.QuadPart / 10000;
}
int64_t ggml_time_proc_us(void) {
FILETIME user_time;
GetProcessTimes(current_process_handle, NULL, NULL, NULL, &user_time);
ULARGE_INTEGER t;
t.u.LowPart = user_time.dwLowDateTime;
t.u.HighPart = user_time.dwHighDateTime;
return t.QuadPart / 10;
}
#else
void ggml_time_init(void) {}
int64_t ggml_time_ms(void) {
int64_t ggml_time_real_ms(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}
int64_t ggml_time_us(void) {
int64_t ggml_time_real_us(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif
int64_t ggml_cycles(void) {
return clock();
int64_t ggml_time_proc_ms(void) {
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}
int64_t ggml_cycles_per_ms(void) {
return CLOCKS_PER_SEC/1000;
int64_t ggml_time_proc_us(void) {
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif
#ifdef GGML_PERF
#define ggml_perf_time_ms() ggml_time_ms()
#define ggml_perf_time_us() ggml_time_us()
#define ggml_perf_cycles() ggml_cycles()
#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
#define GGML_PERF_TIME_REAL_US() ggml_time_real_us()
#define GGML_PERF_TIME_PROC_US() ggml_time_proc_us()
#else
#define ggml_perf_time_ms() 0
#define ggml_perf_time_us() 0
#define ggml_perf_cycles() 0
#define ggml_perf_cycles_per_ms() 0
#define GGML_PERF_TIME_REAL_US() 0
#define GGML_PERF_TIME_PROC_US() 0
#endif
//
@ -1477,7 +1506,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
if (is_first_call) {
// initialize GELU, EXP and F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start);
ggml_fp16_t ii;
for (int i = 0; i < (1 << 16); ++i) {
@ -1488,14 +1517,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
// initialize g_state
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start);
g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
@ -1505,7 +1534,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
g_state.contexts[i].used = false;
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
@ -1657,8 +1686,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.opt =*/ { NULL },
/*.n_tasks =*/ 0,
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
/*.perf_time_proc_us =*/ 0,
/*.perf_time_real_us =*/ 0,
/*.data =*/ data == NULL ? (void *)(result + 1) : data,
/*.pad =*/ { 0 },
};
@ -4307,7 +4336,7 @@ static void ggml_compute_forward_mul_mat_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int ne00 = src0->ne[0];
@ -4403,7 +4432,7 @@ static void ggml_compute_forward_mul_mat_f32(
}
}
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
}
@ -4533,7 +4562,7 @@ static void ggml_compute_forward_mul_mat_f32(
}
}
//int64_t t1 = ggml_perf_time_us();
//int64_t t1 = GGML_PERF_TIME_PROC_US();
//static int64_t acc = 0;
//acc += t1 - t0;
//if (t1 - t0 > 10) {
@ -4552,7 +4581,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int ne00 = src0->ne[0];
@ -4686,7 +4715,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
}
}
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
}
@ -4840,7 +4869,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
}
}
//int64_t t1 = ggml_time_us();
//int64_t t1 = GGML_PERF_TIME_REAL_US();
//static int64_t acc = 0;
//acc += t1 - t0;
//if (t1 - t0 > 10) {
@ -5308,7 +5337,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int ne00 = src0->ne[0];
@ -5428,7 +5457,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int ne00 = src0->ne[0];
@ -5574,7 +5603,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int ne00 = src0->ne[0];
@ -5694,7 +5723,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int ne00 = src0->ne[0];
@ -5838,7 +5867,7 @@ static void ggml_compute_forward_flash_attn_f32(
const struct ggml_tensor * v,
const bool masked,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int neq0 = q->ne[0];
@ -6047,7 +6076,7 @@ static void ggml_compute_forward_flash_attn_f16(
const struct ggml_tensor * v,
const bool masked,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int neq0 = q->ne[0];
@ -6322,7 +6351,7 @@ static void ggml_compute_forward_flash_ff_f16(
const struct ggml_tensor * c0, // F16 proj_w
const struct ggml_tensor * c1, // F32 proj_b
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
int64_t t0 = GGML_PERF_TIME_PROC_US();
UNUSED(t0);
const int nea0 = a->ne[0];
@ -7001,8 +7030,8 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
/*.grads =*/ { NULL },
/*.leafs =*/ { NULL },
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
/*.perf_time_proc_us =*/ 0,
/*.perf_time_real_us =*/ 0,
};
ggml_build_forward_impl(&result, tensor, false);
@ -7411,8 +7440,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
}
}
const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();
const int64_t perf_time_proc_start_us = GGML_PERF_TIME_PROC_US();
const int64_t perf_time_real_start_us = GGML_PERF_TIME_REAL_US();
for (int i = 0; i < cgraph->n_nodes; i++) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
@ -7424,8 +7453,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// continue;
//}
const int64_t perf_node_start_cycles = ggml_perf_cycles();
const int64_t perf_node_start_time_us = ggml_perf_time_us();
const int64_t perf_node_time_proc_start_us = GGML_PERF_TIME_PROC_US();
const int64_t perf_node_time_real_start_us = GGML_PERF_TIME_REAL_US();
// INIT
struct ggml_compute_params params = {
@ -7550,12 +7579,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// performance stats (node)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_node_time_proc_start_us;
int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_node_time_real_start_us;
node->perf_runs++;
node->perf_cycles += perf_cycles_cur;
node->perf_time_us += perf_time_us_cur;
node->perf_time_proc_us += perf_cur_time_proc_us;
node->perf_time_real_us += perf_cur_time_real_us;
}
}
@ -7575,19 +7604,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_time_proc_start_us;
int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_time_real_start_us;
cgraph->perf_runs++;
cgraph->perf_cycles += perf_cycles_cur;
cgraph->perf_time_us += perf_time_us_cur;
cgraph->perf_time_proc_us += perf_cur_time_proc_us;
cgraph->perf_time_real_us += perf_cur_time_real_us;
GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
__func__, cgraph->perf_runs,
(double) perf_cycles_cur / (double) ggml_cycles_per_ms(),
(double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
(double) perf_time_us_cur / 1000.0,
(double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
(double) perf_cur_time_proc_us / 1000.0,
(double) cgraph->perf_time_proc_us / 1000.0 / (double) cgraph->perf_runs,
(double) perf_cur_time_real_us / 1000.0,
(double) cgraph->perf_time_real_us / 1000.0 / (double) cgraph->perf_runs);
}
}
@ -7613,16 +7642,16 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
perf_total_per_op_us[node->op] += node->perf_time_us;
perf_total_per_op_us[node->op] += node->perf_time_real_us;
GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
(double) node->perf_time_us / 1000.0 / node->perf_runs);
(double) node->perf_time_proc_us / 1000.0,
(double) node->perf_time_proc_us / 1000.0 / (double) node->perf_runs,
(double) node->perf_time_real_us / 1000.0,
(double) node->perf_time_real_us / 1000.0 / node->perf_runs);
}
GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
@ -7901,10 +7930,10 @@ static enum ggml_opt_result ggml_opt_adam(
ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
}
const int64_t t_start_wall = ggml_time_us();
const int64_t t_start_cpu = ggml_cycles();
UNUSED(t_start_wall);
UNUSED(t_start_cpu);
const int64_t t_real_start_us = GGML_PERF_TIME_REAL_US();
const int64_t t_process_start_us = GGML_PERF_TIME_PROC_US();
UNUSED(t_real_start_us);
UNUSED(t_process_start_us);
{
// update the gradient
@ -7984,13 +8013,13 @@ static enum ggml_opt_result ggml_opt_adam(
fx_prev = fx;
{
const int64_t t_end_cpu = ggml_cycles();
GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
UNUSED(t_end_cpu);
const int64_t t_process_end_us = GGML_PERF_TIME_PROC_US();
GGML_PRINT_DEBUG("time iter: %5.3f s\n", (t_process_end_us - t_process_start_us)/1e6);
UNUSED(t_process_end_us);
const int64_t t_end_wall = ggml_time_us();
GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
UNUSED(t_end_wall);
const int64_t t_real_end_us = GGML_PERF_TIME_REAL_US();
GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_real_end_us - t_real_start_us)/1e6);
UNUSED(t_real_end_us);
}
}

@ -275,8 +275,8 @@ struct ggml_tensor {
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
int64_t perf_time_proc_us;
int64_t perf_time_real_us;
void * data;
char padding[8];
@ -297,8 +297,8 @@ struct ggml_cgraph {
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
int64_t perf_time_proc_us;
int64_t perf_time_real_us;
};
struct ggml_init_params {
@ -308,10 +308,10 @@ struct ggml_init_params {
};
void ggml_time_init(void); // call this once at the beginning of the program
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
int64_t ggml_cycles(void);
int64_t ggml_cycles_per_ms(void);
int64_t ggml_time_real_ms(void);
int64_t ggml_time_real_us(void);
int64_t ggml_time_proc_ms(void);
int64_t ggml_time_proc_us(void);
void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);

@ -467,12 +467,18 @@ struct whisper_decoder {
};
struct whisper_context {
int64_t t_load_us = 0;
int64_t t_mel_us = 0;
int64_t t_sample_us = 0;
int64_t t_encode_us = 0;
int64_t t_decode_us = 0;
int64_t t_start_us = 0;
int64_t t_load_real_us = 0;
int64_t t_load_proc_us = 0;
int64_t t_mel_real_us = 0;
int64_t t_mel_proc_us = 0;
int64_t t_sample_real_us = 0;
int64_t t_sample_proc_us = 0;
int64_t t_encode_real_us = 0;
int64_t t_encode_proc_us = 0;
int64_t t_decode_real_us = 0;
int64_t t_decode_proc_us = 0;
int64_t t_start_real_us = 0;
int64_t t_start_proc_us = 0;
ggml_type wtype; // weight type (FP32 or FP16)
@ -597,9 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
fprintf(stderr, "%s: loading model\n", __func__);
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
wctx.t_start_us = t_start_us;
wctx.t_start_real_us = t_start_real_us;
wctx.t_start_proc_us = t_start_proc_us;
auto & model = wctx.model;
auto & vocab = wctx.vocab;
@ -1208,7 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
wctx.rng = std::mt19937(0);
wctx.t_load_us = ggml_time_us() - t_start_us;
wctx.t_load_real_us = ggml_time_real_us() - t_start_real_us;
wctx.t_load_proc_us = ggml_time_proc_us() - t_start_proc_us;
return true;
}
@ -1226,7 +1235,8 @@ static bool whisper_encode(
whisper_context & wctx,
const int mel_offset,
const int n_threads) {
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
const auto & model = wctx.model;
const auto & mel_inp = wctx.mel;
@ -1619,7 +1629,8 @@ static bool whisper_encode(
ggml_free(ctx0);
wctx.t_encode_us += ggml_time_us() - t_start_us;
wctx.t_encode_real_us += ggml_time_real_us() - t_start_real_us;
wctx.t_encode_proc_us += ggml_time_proc_us() - t_start_proc_us;
return true;
}
@ -1641,7 +1652,8 @@ static bool whisper_decode(
const int n_tokens,
const int n_past,
const int n_threads) {
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
const auto & model = wctx.model;
const auto & hparams = model.hparams;
@ -1992,7 +2004,8 @@ static bool whisper_decode(
ggml_free(ctx0);
wctx.t_decode_us += ggml_time_us() - t_start_us;
wctx.t_decode_real_us += ggml_time_real_us() - t_start_real_us;
wctx.t_decode_proc_us += ggml_time_proc_us() - t_start_proc_us;
return true;
}
@ -2107,7 +2120,8 @@ static bool log_mel_spectrogram(
const whisper_filters & filters,
const bool speed_up,
whisper_mel & mel) {
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
// Hanning window
std::vector<float> hann;
@ -2216,7 +2230,8 @@ static bool log_mel_spectrogram(
mel.data[i] = (mel.data[i] + 4.0)/4.0;
}
wctx.t_mel_us += ggml_time_us() - t_start_us;
wctx.t_mel_real_us = ggml_time_real_us() - t_start_real_us;
wctx.t_mel_proc_us = ggml_time_proc_us() - t_start_proc_us;
return true;
}
@ -2642,21 +2657,25 @@ whisper_token whisper_token_transcribe(void) {
}
void whisper_print_timings(struct whisper_context * ctx) {
const int64_t t_end_us = ggml_time_us();
const int64_t t_real_end_us = ggml_time_real_us();
const int64_t t_proc_end_us = ggml_time_proc_us();
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
fprintf(stderr, "%s: load time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_load_real_us/1000.0f, ctx->t_load_proc_us/1000.0f);
fprintf(stderr, "%s: mel time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_mel_real_us/1000.0f, ctx->t_mel_proc_us/1000.0f);
fprintf(stderr, "%s: sample time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_sample_real_us/1000.0f, ctx->t_sample_proc_us/1000.0f);
fprintf(stderr, "%s: encode time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_encode_real_us/1000.0f, ctx->t_encode_proc_us/1000.0f);
fprintf(stderr, "%s: decode time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_decode_real_us/1000.0f, ctx->t_decode_proc_us/1000.0f);
fprintf(stderr, "%s: total time = %8.2f ms (proc %8.2f ms)\n", __func__, (t_real_end_us - ctx->t_start_real_us)/1000.f, (t_proc_end_us - ctx->t_start_proc_us)/1000.0f);
}
void whisper_reset_timings(struct whisper_context * ctx) {
ctx->t_sample_us = 0;
ctx->t_encode_us = 0;
ctx->t_decode_us = 0;
ctx->t_sample_real_us = 0;
ctx->t_sample_proc_us = 0;
ctx->t_encode_real_us = 0;
ctx->t_encode_proc_us = 0;
ctx->t_decode_real_us = 0;
ctx->t_decode_proc_us = 0;
}
const char * whisper_print_system_info(void) {
@ -3455,7 +3474,8 @@ int whisper_full(
}
{
const int64_t t_start_sample_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur);
@ -3474,12 +3494,14 @@ int whisper_full(
memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
}
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us;
ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us;
}
}
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
const int64_t t_start_sample_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
// store the KV caches of all decoders when doing beam-search
if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
@ -3672,7 +3694,8 @@ int whisper_full(
}
}
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us;
ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us;
// obtain logits for the next token
for (int j = 0; j < n_decoders_cur; ++j) {
@ -3693,13 +3716,15 @@ int whisper_full(
}
{
const int64_t t_start_sample_us = ggml_time_us();
const int64_t t_start_real_us = ggml_time_real_us();
const int64_t t_start_proc_us = ggml_time_proc_us();
whisper_process_logits(*ctx, params, decoder, t_cur);
++decoder.kv_self.n;
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us;
ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us;
}
}
}
@ -3993,10 +4018,14 @@ int whisper_full_parallel(
}
}
ctx->t_mel_us += ctxs[i].t_mel_us;
ctx->t_sample_us += ctxs[i].t_sample_us;
ctx->t_encode_us += ctxs[i].t_encode_us;
ctx->t_decode_us += ctxs[i].t_decode_us;
ctx->t_mel_real_us += ctxs[i].t_mel_real_us;
ctx->t_mel_proc_us += ctxs[i].t_mel_proc_us;
ctx->t_sample_real_us += ctxs[i].t_sample_real_us;
ctx->t_sample_proc_us += ctxs[i].t_sample_proc_us;
ctx->t_encode_real_us += ctxs[i].t_encode_real_us;
ctx->t_encode_proc_us += ctxs[i].t_encode_proc_us;
ctx->t_decode_real_us += ctxs[i].t_decode_real_us;
ctx->t_decode_proc_us += ctxs[i].t_decode_proc_us;
kv_cache_free(ctx->kv_cross);
@ -4006,10 +4035,14 @@ int whisper_full_parallel(
}
// average the timings
ctx->t_mel_us /= n_processors;
ctx->t_sample_us /= n_processors;
ctx->t_encode_us /= n_processors;
ctx->t_decode_us /= n_processors;
ctx->t_mel_real_us /= n_processors;
ctx->t_mel_proc_us /= n_processors;
ctx->t_sample_real_us /= n_processors;
ctx->t_sample_proc_us /= n_processors;
ctx->t_encode_real_us /= n_processors;
ctx->t_encode_proc_us /= n_processors;
ctx->t_decode_real_us /= n_processors;
ctx->t_decode_proc_us /= n_processors;
// print information about the audio boundaries
fprintf(stderr, "\n");

Loading…
Cancel
Save