From d573de4f94f26dda7316d91ed79e1622707264d9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 15 Jan 2023 12:27:19 +0200 Subject: [PATCH] Cosmetic stuff --- extra/bench-all.sh | 54 +++++++-------- ggml.c | 162 ++++++++++++++++++++++----------------- ggml.h | 16 ++--- whisper.cpp | 148 ++++++++++++++++++++--------------- 4 files changed, 189 insertions(+), 191 deletions(-) diff --git a/extra/bench-all.sh b/extra/bench-all.sh index b038894..7c0325c 100755 --- a/extra/bench-all.sh +++ b/extra/bench-all.sh @@ -19,22 +19,23 @@ printf "\n" cat >&2 << EOF How to interpret these results: -- CPU is your CPU model. -- OS is your current operating system. -- Model is the GGML model being benchmarked. -- Threads is the number of threads used. -- Total Load is the total amount of CPU time whisper took to load the model. -- Real Load is the real amount of time your computer took to load the model. -- Total Encode is the total amount of CPU time whisper took to encode the input. - If you're using multiple threads, the time spent in each thread will be added together. -- Real Encode is the real amount of time your computer took to encode the input. - This should be approximately (Total Encode / Threads). If it isn't, you likely have another program making use of the CPU. +- CPU is your CPU model +- OS is your current operating system +- Model is the GGML model being benchmarked +- Threads is the number of threads used +- Load is the time your computer took to load the model +- Encode is the time it took to run the Whisper encoder +- Time is reported as (real / process): + real: This is the wall-clock time in ms + process: This is the CPU time. If you're using multiple threads, the time spent in each thread will be added together. + The process time should be approximately (Proc Enc. / Threads). + If it isn't, you likely have another program making use of the CPU - Commit is the current git commit. 
EOF -printf "| CPU | OS | Config | Model | Threads | Total Load | Real Load | Total Encode | Real Encode | Commit |\n" -printf "| ------- | ------ | ---------------- | ------- | ------- | ---------- | --------- | ------------ | ----------- | -------- |\n" +printf "| CPU | OS | Config | Model | Threads | Load | Encode | Commit |\n" +printf "| ------- | ------ | ---------------- | ------- | ------- | ----- | ------------- | -------- |\n" for model in "${models[@]}"; do # run once to heat-up the cache @@ -45,31 +46,32 @@ for model in "${models[@]}"; do output=$(./bench -m "./models/ggml-$model.bin" -t "$n_threads" 2>&1) # parse the output: - total_load_time=$(echo "$output" | grep "load time" | awk '{print $5}') - real_load_time=$(echo "$output" | grep "load time" | awk '{print $8}') - total_encode_time=$(echo "$output" | grep "encode time" | awk '{print $5}') - real_encode_time=$(echo "$output" | grep "encode time" | awk '{print $8}') + load_proc=$(echo "$output" | grep "load time" | awk '{print $8}') + load_real=$(echo "$output" | grep "load time" | awk '{print $5}') + encode_proc=$(echo "$output" | grep "encode time" | awk '{print $8}') + encode_real=$(echo "$output" | grep "encode time" | awk '{print $5}') system_info=$(echo "$output" | grep "system_info") - n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}') + n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}') # floor to milliseconds - total_load_time=${total_load_time%.*} - real_load_time=${real_load_time%.*} - total_encode_time=${total_encode_time%.*} - real_encode_time=${real_encode_time%.*} + load_proc=${load_proc%.*} + load_real=${load_real%.*} + encode_proc=${encode_proc%.*} + encode_real=${encode_real%.*} + + load_str="$load_real" + encode_str="$encode_real / $encode_proc" config=$(echo "$system_info" | sed 's/ | /\n/g' | tail -n +2 | awk '/ = 1/{print $1}' | tr '\n' ' ') commit=$(git rev-parse --short HEAD) - printf "| | | %-16s | %-7s | %-7s | %-10s | %-9s | %-12s | %-11s 
| %-8s |\n" \ + printf "| | | %-16s | %-7s | %-7s | %-5s | %-13s | %-8s |\n" \ "$config" \ "$model" \ "$n_threads" \ - "$total_load_time" \ - "$real_load_time" \ - "$total_encode_time" \ - "$real_encode_time" \ + "$load_str" \ + "$encode_str" \ "$commit" done diff --git a/ggml.c b/ggml.c index da0313f..12a0dd2 100644 --- a/ggml.c +++ b/ggml.c @@ -288,46 +288,46 @@ void ggml_time_init(void) { timer_freq = frequency.QuadPart; } -int64_t ggml_real_time_ms(void) { +int64_t ggml_time_real_ms(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return (t.QuadPart * 1000) / timer_freq; } -int64_t ggml_real_time_us(void) { +int64_t ggml_time_real_us(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return (t.QuadPart * 1000000) / timer_freq; } -int64_t ggml_process_time_ms(void) { +int64_t ggml_time_proc_ms(void) { return (clock() * 1000) / CLOCKS_PER_SEC; } -int64_t ggml_process_time_us(void) { +int64_t ggml_time_proc_us(void) { return (clock() * 1000000) / CLOCKS_PER_SEC; } #else void ggml_time_init(void) {} -int64_t ggml_real_time_ms(void) { +int64_t ggml_time_real_ms(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; } -int64_t ggml_real_time_us(void) { +int64_t ggml_time_real_us(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; } -int64_t ggml_process_time_ms(void) { +int64_t ggml_time_proc_ms(void) { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; } -int64_t ggml_process_time_us(void) { +int64_t ggml_time_proc_us(void) { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; @@ -335,11 +335,11 @@ int64_t ggml_process_time_us(void) { #endif #ifdef GGML_PERF -#define ggml_perf_real_time_us() ggml_real_time_us() -#define ggml_perf_process_time_us() 
ggml_process_time_us() +#define GGML_PERF_TIME_REAL_US() ggml_time_real_us() +#define GGML_PERF_TIME_PROC_US() ggml_time_proc_us() #else -#define ggml_perf_real_time_us() 0 -#define ggml_perf_process_time_us() 0 +#define GGML_PERF_TIME_REAL_US() 0 +#define GGML_PERF_TIME_PROC_US() 0 #endif // @@ -1487,7 +1487,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { if (is_first_call) { // initialize GELU, EXP and F32 tables { - const uint64_t t_start = ggml_perf_real_time_us(); UNUSED(t_start); + const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start); ggml_fp16_t ii; for (int i = 0; i < (1 << 16); ++i) { @@ -1498,14 +1498,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f)); } - const uint64_t t_end = ggml_perf_real_time_us(); UNUSED(t_end); + const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end); GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } // initialize g_state { - const uint64_t t_start = ggml_perf_real_time_us(); UNUSED(t_start); + const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start); g_state = (struct ggml_state) { /*.contexts =*/ { { 0 } }, @@ -1515,7 +1515,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { g_state.contexts[i].used = false; } - const uint64_t t_end = ggml_perf_real_time_us(); UNUSED(t_end); + const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end); GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } @@ -1655,22 +1655,22 @@ struct ggml_tensor * ggml_new_tensor_impl( ggml_assert_aligned(result); *result = (struct ggml_tensor) { - /*.type =*/ type, - /*.n_dims =*/ n_dims, - /*.ne =*/ { 1, 1, 1, 1 }, - /*.nb =*/ { 0, 0, 0, 0 }, - /*.op =*/ GGML_OP_NONE, - /*.is_param =*/ false, - /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, - /*.perf_runs =*/ 0, - 
/*.perf_process_time_us =*/ 0, - /*.perf_real_time_us =*/ 0, - /*.data =*/ data == NULL ? (void *)(result + 1) : data, - /*.pad =*/ { 0 }, + /*.type =*/ type, + /*.n_dims =*/ n_dims, + /*.ne =*/ { 1, 1, 1, 1 }, + /*.nb =*/ { 0, 0, 0, 0 }, + /*.op =*/ GGML_OP_NONE, + /*.is_param =*/ false, + /*.grad =*/ NULL, + /*.src0 =*/ NULL, + /*.src1 =*/ NULL, + /*.opt =*/ { NULL }, + /*.n_tasks =*/ 0, + /*.perf_runs =*/ 0, + /*.perf_time_proc_us =*/ 0, + /*.perf_time_real_us =*/ 0, + /*.data =*/ data == NULL ? (void *)(result + 1) : data, + /*.pad =*/ { 0 }, }; ggml_assert_aligned(result->data); @@ -4317,7 +4317,7 @@ static void ggml_compute_forward_mul_mat_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -4411,7 +4411,7 @@ static void ggml_compute_forward_mul_mat_f32( } } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } @@ -4541,7 +4541,7 @@ static void ggml_compute_forward_mul_mat_f32( } } - //int64_t t1 = ggml_perf_process_time_us(); + //int64_t t1 = GGML_PERF_TIME_PROC_US(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -4560,7 +4560,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -4692,7 +4692,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, 
ne1, ne2, ne3); return; } @@ -4846,7 +4846,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } - //int64_t t1 = ggml_perf_real_time_us(); + //int64_t t1 = GGML_PERF_TIME_REAL_US(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -5314,7 +5314,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5434,7 +5434,7 @@ static void ggml_compute_forward_conv_1d_1s_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5580,7 +5580,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5700,7 +5700,7 @@ static void ggml_compute_forward_conv_1d_2s_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int ne00 = src0->ne[0]; @@ -5844,7 +5844,7 @@ static void ggml_compute_forward_flash_attn_f32( const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int neq0 = q->ne[0]; @@ -6053,7 +6053,7 @@ static void ggml_compute_forward_flash_attn_f16( const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int neq0 = q->ne[0]; @@ -6328,7 +6328,7 @@ static void 
ggml_compute_forward_flash_ff_f16( const struct ggml_tensor * c0, // F16 proj_w const struct ggml_tensor * c1, // F32 proj_b struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_process_time_us(); + int64_t t0 = GGML_PERF_TIME_PROC_US(); UNUSED(t0); const int nea0 = a->ne[0]; @@ -6998,17 +6998,17 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.n_threads =*/ 0, - /*.work_size =*/ 0, - /*.work =*/ NULL, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.perf_runs =*/ 0, - /*.perf_process_time_us =*/ 0, - /*.perf_real_time_us =*/ 0, + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.n_threads =*/ 0, + /*.work_size =*/ 0, + /*.work =*/ NULL, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_time_proc_us =*/ 0, + /*.perf_time_real_us =*/ 0, }; ggml_build_forward_impl(&result, tensor, false); @@ -7414,8 +7414,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } - const int64_t perf_process_time_start_us = ggml_perf_process_time_us(); - const int64_t perf_real_time_start_us = ggml_perf_real_time_us(); + const int64_t perf_time_proc_start_us = GGML_PERF_TIME_PROC_US(); + const int64_t perf_time_real_start_us = GGML_PERF_TIME_REAL_US(); for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); @@ -7427,8 +7427,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // continue; //} - const int64_t perf_node_process_time_start_us = ggml_perf_process_time_us(); - const int64_t perf_node_real_time_start_us = ggml_perf_real_time_us(); + const int64_t perf_node_time_proc_start_us = GGML_PERF_TIME_PROC_US(); + const int64_t perf_node_time_real_start_us = GGML_PERF_TIME_REAL_US(); // INIT struct 
ggml_compute_params params = { @@ -7553,12 +7553,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // performance stats (node) { - int64_t perf_cur_process_time_us = ggml_perf_process_time_us() - perf_node_process_time_start_us; - int64_t perf_cur_real_time_us = ggml_perf_real_time_us() - perf_node_real_time_start_us; + int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_node_time_proc_start_us; + int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_node_time_real_start_us; node->perf_runs++; - node->perf_process_time_us += perf_cur_process_time_us; - node->perf_real_time_us += perf_cur_real_time_us; + node->perf_time_proc_us += perf_cur_time_proc_us; + node->perf_time_real_us += perf_cur_time_real_us; } } @@ -7578,19 +7578,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // performance stats (graph) { - int64_t perf_cur_process_time_us = ggml_perf_process_time_us() - perf_process_time_start_us; - int64_t perf_cur_real_time_us = ggml_perf_real_time_us() - perf_real_time_start_us; + int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_time_proc_start_us; + int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_time_real_start_us; cgraph->perf_runs++; - cgraph->perf_process_time_us += perf_cur_process_time_us; - cgraph->perf_real_time_us += perf_cur_real_time_us; + cgraph->perf_time_proc_us += perf_cur_time_proc_us; + cgraph->perf_time_real_us += perf_cur_time_real_us; GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, cgraph->perf_runs, - (double) perf_cur_process_time_us / 1000.0, - (double) cgraph->perf_process_time_us / 1000.0 / (double) cgraph->perf_runs, - (double) perf_cur_real_time_us / 1000.0, - (double) cgraph->perf_real_time_us / 1000.0 / (double) cgraph->perf_runs); + (double) perf_cur_time_proc_us / 1000.0, + (double) cgraph->perf_time_proc_us / 1000.0 / (double) cgraph->perf_runs, + (double) 
perf_cur_time_real_us / 1000.0, + (double) cgraph->perf_time_real_us / 1000.0 / (double) cgraph->perf_runs); } } @@ -7616,16 +7616,16 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - perf_total_per_op_us[node->op] += node->perf_real_time_us; + perf_total_per_op_us[node->op] += node->perf_time_real_us; GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, - (double) node->perf_process_time_us / 1000.0, - (double) node->perf_process_time_us / 1000.0 / (double) node->perf_runs, - (double) node->perf_real_time_us / 1000.0, - (double) node->perf_real_time_us / 1000.0 / node->perf_runs); + (double) node->perf_time_proc_us / 1000.0, + (double) node->perf_time_proc_us / 1000.0 / (double) node->perf_runs, + (double) node->perf_time_real_us / 1000.0, + (double) node->perf_time_real_us / 1000.0 / node->perf_runs); } GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); @@ -7904,8 +7904,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); } - const int64_t t_real_start_us = ggml_perf_real_time_us(); - const int64_t t_process_start_us = ggml_perf_process_time_us(); + const int64_t t_real_start_us = GGML_PERF_TIME_REAL_US(); + const int64_t t_process_start_us = GGML_PERF_TIME_PROC_US(); UNUSED(t_real_start_us); UNUSED(t_process_start_us); @@ -7987,11 +7987,11 @@ static enum ggml_opt_result ggml_opt_adam( fx_prev = fx; { - const int64_t t_process_end_us = ggml_perf_process_time_us(); + const int64_t t_process_end_us = GGML_PERF_TIME_PROC_US(); GGML_PRINT_DEBUG("time iter: %5.3f s\n", (t_process_end_us - t_process_start_us)/1e6); UNUSED(t_process_end_us); - const int64_t t_real_end_us = ggml_perf_real_time_us(); + const int64_t t_real_end_us = 
GGML_PERF_TIME_REAL_US(); GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_real_end_us - t_real_start_us)/1e6); UNUSED(t_real_end_us); } diff --git a/ggml.h b/ggml.h index f8febe6..bc3d673 100644 --- a/ggml.h +++ b/ggml.h @@ -275,8 +275,8 @@ struct ggml_tensor { // performance int perf_runs; - int64_t perf_process_time_us; - int64_t perf_real_time_us; + int64_t perf_time_proc_us; + int64_t perf_time_real_us; void * data; char padding[8]; @@ -297,8 +297,8 @@ struct ggml_cgraph { // performance int perf_runs; - int64_t perf_process_time_us; - int64_t perf_real_time_us; + int64_t perf_time_proc_us; + int64_t perf_time_real_us; }; struct ggml_init_params { @@ -308,10 +308,10 @@ struct ggml_init_params { }; void ggml_time_init(void); // call this once at the beginning of the program -int64_t ggml_real_time_ms(void); -int64_t ggml_real_time_us(void); -int64_t ggml_process_time_ms(void); -int64_t ggml_process_time_us(void); +int64_t ggml_time_real_ms(void); +int64_t ggml_time_real_us(void); +int64_t ggml_time_proc_ms(void); +int64_t ggml_time_proc_us(void); void ggml_print_object (const struct ggml_object * obj); void ggml_print_objects(const struct ggml_context * ctx); diff --git a/whisper.cpp b/whisper.cpp index 034b12c..57731e4 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -467,18 +467,18 @@ struct whisper_decoder { }; struct whisper_context { - int64_t t_real_load_us = 0; - int64_t t_process_load_us = 0; - int64_t t_real_mel_us = 0; - int64_t t_process_mel_us = 0; - int64_t t_real_sample_us = 0; - int64_t t_process_sample_us = 0; - int64_t t_real_encode_us = 0; - int64_t t_process_encode_us = 0; - int64_t t_real_decode_us = 0; - int64_t t_process_decode_us = 0; - int64_t t_real_start_us = 0; - int64_t t_process_start_us = 0; + int64_t t_load_real_us = 0; + int64_t t_load_proc_us = 0; + int64_t t_mel_real_us = 0; + int64_t t_mel_proc_us = 0; + int64_t t_sample_real_us = 0; + int64_t t_sample_proc_us = 0; + int64_t t_encode_real_us = 0; + int64_t t_encode_proc_us = 0; + 
int64_t t_decode_real_us = 0; + int64_t t_decode_proc_us = 0; + int64_t t_start_real_us = 0; + int64_t t_start_proc_us = 0; ggml_type wtype; // weight type (FP32 or FP16) @@ -603,11 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) { static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) { fprintf(stderr, "%s: loading model\n", __func__); - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); - wctx.t_real_start_us = t_real_start_us; - wctx.t_process_start_us = t_process_start_us; + wctx.t_start_real_us = t_start_real_us; + wctx.t_start_proc_us = t_start_proc_us; auto & model = wctx.model; auto & vocab = wctx.vocab; @@ -1216,8 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con wctx.rng = std::mt19937(0); - wctx.t_real_load_us = ggml_real_time_us() - t_real_start_us; - wctx.t_process_load_us = ggml_process_time_us() - t_process_start_us; + wctx.t_load_real_us = ggml_time_real_us() - t_start_real_us; + wctx.t_load_proc_us = ggml_time_proc_us() - t_start_proc_us; return true; } @@ -1235,8 +1235,8 @@ static bool whisper_encode( whisper_context & wctx, const int mel_offset, const int n_threads) { - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); const auto & model = wctx.model; const auto & mel_inp = wctx.mel; @@ -1629,8 +1629,8 @@ static bool whisper_encode( ggml_free(ctx0); - wctx.t_real_encode_us += ggml_real_time_us() - t_real_start_us; - wctx.t_process_encode_us += ggml_process_time_us() - t_process_start_us; + wctx.t_encode_real_us += ggml_time_real_us() - t_start_real_us; + wctx.t_encode_proc_us += ggml_time_proc_us() - 
t_start_proc_us; return true; } @@ -1652,8 +1652,8 @@ static bool whisper_decode( const int n_tokens, const int n_past, const int n_threads) { - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); const auto & model = wctx.model; const auto & hparams = model.hparams; @@ -2004,8 +2004,8 @@ static bool whisper_decode( ggml_free(ctx0); - wctx.t_real_decode_us += ggml_real_time_us() - t_real_start_us; - wctx.t_process_decode_us += ggml_process_time_us() - t_process_start_us; + wctx.t_decode_real_us += ggml_time_real_us() - t_start_real_us; + wctx.t_decode_proc_us += ggml_time_proc_us() - t_start_proc_us; return true; } @@ -2120,8 +2120,8 @@ static bool log_mel_spectrogram( const whisper_filters & filters, const bool speed_up, whisper_mel & mel) { - const int64_t t_real_start_us = ggml_real_time_us(); - const int64_t t_process_start_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); // Hanning window std::vector hann; @@ -2230,8 +2230,8 @@ static bool log_mel_spectrogram( mel.data[i] = (mel.data[i] + 4.0)/4.0; } - wctx.t_real_mel_us = ggml_real_time_us() - t_real_start_us; - wctx.t_process_mel_us = ggml_process_time_us() - t_process_start_us; + wctx.t_mel_real_us = ggml_time_real_us() - t_start_real_us; + wctx.t_mel_proc_us = ggml_time_proc_us() - t_start_proc_us; return true; } @@ -2657,29 +2657,25 @@ whisper_token whisper_token_transcribe(void) { } void whisper_print_timings(struct whisper_context * ctx) { - const int64_t t_real_end_us = ggml_real_time_us(); - const int64_t t_process_end_us = ggml_process_time_us(); + const int64_t t_real_end_us = ggml_time_real_us(); + const int64_t t_proc_end_us = ggml_time_proc_us(); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms (real %8.2f ms)\n", 
__func__, ctx->t_process_load_us/1000.0f, ctx->t_real_load_us/1000.0f); - fprintf(stderr, "%s: mel time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_mel_us/1000.0f, ctx->t_real_mel_us/1000.0f); - fprintf(stderr, "%s: sample time = %8.2f ms (real %8.2f ms)\n", __func__, ctx->t_process_sample_us/1000.0f, ctx->t_real_sample_us/1000.0f); - fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__, - ctx->t_process_encode_us/1000.0f, ctx->t_process_encode_us/1000.0f/ctx->model.hparams.n_audio_layer, - ctx->t_real_encode_us/1000.0f, ctx->t_real_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); - fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer)\n", __func__, - ctx->t_process_decode_us/1000.0f, ctx->t_process_decode_us/1000.0f/ctx->model.hparams.n_text_layer, - ctx->t_real_decode_us/1000.0f, ctx->t_real_decode_us/1000.0f/ctx->model.hparams.n_text_layer); - fprintf(stderr, "%s: total time = %8.2f ms (real %8.2f ms)\n", __func__, (t_process_end_us - ctx->t_process_start_us)/1000.f, (t_real_end_us - ctx->t_real_start_us)/1000.0f); + fprintf(stderr, "%s: load time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_load_real_us/1000.0f, ctx->t_load_proc_us/1000.0f); + fprintf(stderr, "%s: mel time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_mel_real_us/1000.0f, ctx->t_mel_proc_us/1000.0f); + fprintf(stderr, "%s: sample time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_sample_real_us/1000.0f, ctx->t_sample_proc_us/1000.0f); + fprintf(stderr, "%s: encode time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_encode_real_us/1000.0f, ctx->t_encode_proc_us/1000.0f); + fprintf(stderr, "%s: decode time = %8.2f ms (proc %8.2f ms)\n", __func__, ctx->t_decode_real_us/1000.0f, ctx->t_decode_proc_us/1000.0f); + fprintf(stderr, "%s: total time = %8.2f ms (proc %8.2f ms)\n", __func__, (t_real_end_us - ctx->t_start_real_us)/1000.f, (t_proc_end_us - 
ctx->t_start_proc_us)/1000.0f); } void whisper_reset_timings(struct whisper_context * ctx) { - ctx->t_real_sample_us = 0; - ctx->t_process_sample_us = 0; - ctx->t_real_encode_us = 0; - ctx->t_process_encode_us = 0; - ctx->t_real_decode_us = 0; - ctx->t_process_decode_us = 0; + ctx->t_sample_real_us = 0; + ctx->t_sample_proc_us = 0; + ctx->t_encode_real_us = 0; + ctx->t_encode_proc_us = 0; + ctx->t_decode_real_us = 0; + ctx->t_decode_proc_us = 0; } const char * whisper_print_system_info(void) { @@ -3478,8 +3474,8 @@ int whisper_full( } { - const int64_t t_real_start_sample_us = ggml_real_time_us(); - const int64_t t_process_start_sample_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur); @@ -3498,14 +3494,14 @@ int whisper_full( memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0])); } - ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; - ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; + ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us; + ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us; } } for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) { - const int64_t t_real_start_sample_us = ggml_real_time_us(); - const int64_t t_process_start_sample_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); // store the KV caches of all decoders when doing beam-search if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) { @@ -3698,8 +3694,8 @@ int whisper_full( } } - ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; - ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; + ctx->t_sample_real_us += 
ggml_time_real_us() - t_start_real_us; + ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us; // obtain logits for the next token for (int j = 0; j < n_decoders_cur; ++j) { @@ -3720,15 +3716,15 @@ int whisper_full( } { - const int64_t t_real_start_sample_us = ggml_real_time_us(); - const int64_t t_process_start_sample_us = ggml_process_time_us(); + const int64_t t_start_real_us = ggml_time_real_us(); + const int64_t t_start_proc_us = ggml_time_proc_us(); whisper_process_logits(*ctx, params, decoder, t_cur); ++decoder.kv_self.n; - ctx->t_real_sample_us += ggml_real_time_us() - t_real_start_sample_us; - ctx->t_process_sample_us += ggml_process_time_us() - t_process_start_sample_us; + ctx->t_sample_real_us += ggml_time_real_us() - t_start_real_us; + ctx->t_sample_proc_us += ggml_time_proc_us() - t_start_proc_us; } } } @@ -4021,14 +4017,14 @@ int whisper_full_parallel( } } - ctx->t_real_mel_us += ctxs[i].t_real_mel_us; - ctx->t_process_mel_us += ctxs[i].t_process_mel_us; - ctx->t_real_sample_us += ctxs[i].t_real_sample_us; - ctx->t_process_sample_us += ctxs[i].t_process_sample_us; - ctx->t_real_encode_us += ctxs[i].t_real_encode_us; - ctx->t_process_encode_us += ctxs[i].t_process_encode_us; - ctx->t_real_decode_us += ctxs[i].t_real_decode_us; - ctx->t_process_decode_us += ctxs[i].t_process_decode_us; + ctx->t_mel_real_us += ctxs[i].t_mel_real_us; + ctx->t_mel_proc_us += ctxs[i].t_mel_proc_us; + ctx->t_sample_real_us += ctxs[i].t_sample_real_us; + ctx->t_sample_proc_us += ctxs[i].t_sample_proc_us; + ctx->t_encode_real_us += ctxs[i].t_encode_real_us; + ctx->t_encode_proc_us += ctxs[i].t_encode_proc_us; + ctx->t_decode_real_us += ctxs[i].t_decode_real_us; + ctx->t_decode_proc_us += ctxs[i].t_decode_proc_us; kv_cache_free(ctx->kv_cross); @@ -4038,14 +4034,14 @@ int whisper_full_parallel( } // average the timings - ctx->t_real_mel_us /= n_processors; - ctx->t_process_mel_us /= n_processors; - ctx->t_real_sample_us /= n_processors; - ctx->t_process_sample_us /= 
n_processors; - ctx->t_real_encode_us /= n_processors; - ctx->t_process_encode_us /= n_processors; - ctx->t_real_decode_us /= n_processors; - ctx->t_process_decode_us /= n_processors; + ctx->t_mel_real_us /= n_processors; + ctx->t_mel_proc_us /= n_processors; + ctx->t_sample_real_us /= n_processors; + ctx->t_sample_proc_us /= n_processors; + ctx->t_encode_real_us /= n_processors; + ctx->t_encode_proc_us /= n_processors; + ctx->t_decode_real_us /= n_processors; + ctx->t_decode_proc_us /= n_processors; // print information about the audio boundaries fprintf(stderr, "\n");