From 7aa1174315e93de4d0fc3fb2024304c66fe1d21c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 18 Jan 2023 21:00:41 +0200
Subject: [PATCH] bench : fix Windows linkage by moving ggml benches in
 whisper lib ..

---
 examples/bench/bench.cpp | 139 ++------------------------------------
 whisper.cpp              | 140 +++++++++++++++++++++++++++++++++++++++
 whisper.h                |   7 ++
 3 files changed, 151 insertions(+), 135 deletions(-)

diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp
index 5f99774..49daaa0 100644
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@@ -1,11 +1,8 @@
-#include "ggml.h"
 #include "whisper.h"
 
 #include <cstdio>
-#include <cstring>
 #include <string>
 #include <thread>
-#include <vector>
 
 // command-line parameters
 struct whisper_params {
@@ -53,7 +50,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "\n");
 }
 
-int bench_whisper_encoder(const whisper_params & params) {
+int whisper_bench_encoder(const whisper_params & params) {
     // whisper init
 
     struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -96,132 +93,6 @@ int bench_whisper_encoder(const whisper_params & params) {
     return 0;
 }
 
-int bench_memcpy(const whisper_params & params) {
-    size_t n = 50;
-    size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
-
-    // 1 GB array
-    const size_t size = arr*1024llu*1024llu;
-
-    char * src = (char *) malloc(size);
-    char * dst = (char *) malloc(size);
-
-    for (size_t i = 0; i < size; i++) src[i] = i;
-
-    memcpy(dst, src, size); // heat-up
-
-    double tsum = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        const int64_t t0 = ggml_time_us();
-
-        memcpy(dst, src, size);
-
-        const int64_t t1 = ggml_time_us();
-
-        tsum += (t1 - t0)*1e-6;
-
-        src[0] = rand();
-    }
-
-    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
-
-    // needed to prevent the compile from optimizing the memcpy away
-    {
-        double sum = 0.0;
-
-        for (size_t i = 0; i < size; i++) sum += dst[i];
-
-        fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
-    }
-
-    free(src);
-    free(dst);
-
-    return 0;
-}
-
-int bench_ggml_mul_mat(const whisper_params & params) {
-    const int n_max = 128;
-
-    const std::vector<size_t> sizes = {
-        64, 128, 256, 512, 1024, 2048, 4096,
-    };
-
-    const size_t N_max = sizes.back();
-
-    // a: N*N*sizeof(float)
-    // b: N*N*sizeof(float)
-    // c: N*N*sizeof(float)
-    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
-
-    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
-
-    for (int j = 0; j < (int) sizes.size(); j++) {
-        int n_fp16 = 0;
-        int n_fp32 = 0;
-
-        // GFLOPS/s
-        double s_fp16 = 0.0;
-        double s_fp32 = 0.0;
-
-        const size_t N = sizes[j];
-
-        for (int k = 0; k < 2; ++k) {
-            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
-            double & s = k == 0 ? s_fp16 : s_fp32;
-            int & n = k == 0 ? n_fp16 : n_fp32;
-
-            struct ggml_init_params gparams = {
-                /*.mem_size   =*/ buf.size(),
-                /*.mem_buffer =*/ buf.data(),
-            };
-
-            struct ggml_context * ctx0 = ggml_init(gparams);
-
-            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
-            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
-
-            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
-
-            struct ggml_cgraph gf = ggml_build_forward(c);
-
-            gf.n_threads = params.n_threads;
-
-            double tsum = 0.0;
-
-            // heat-up
-            ggml_graph_compute(ctx0, &gf);
-
-            for (int i = 0; i < n_max; ++i) {
-                const int64_t t0 = ggml_time_us();
-
-                ggml_graph_compute(ctx0, &gf);
-
-                const int64_t t1 = ggml_time_us();
-
-                tsum += (t1 - t0)*1e-6;
-                n++;
-
-                if (tsum > 1.0 && n >= 3) {
-                    break;
-                }
-            }
-
-            ggml_free(ctx0);
-
-            s = ((2.0*N*N*N*n)/tsum)*1e-9;
-        }
-
-        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
-                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
-    }
-
-    return 0;
-}
-
 int main(int argc, char ** argv) {
     whisper_params params;
 
@@ -229,14 +100,12 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    ggml_time_init();
-
     int ret = -1;
 
    switch (params.what) {
-        case 0: ret = bench_whisper_encoder(params); break;
-        case 1: ret = bench_memcpy(params); break;
-        case 2: ret = bench_ggml_mul_mat(params); break;
+        case 0: ret = whisper_bench_encoder(params); break;
+        case 1: ret = whisper_bench_memcpy(params.n_threads); break;
+        case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
         default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
     }
 
diff --git a/whisper.cpp b/whisper.cpp
index 5aa3be1..f408c79 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -3801,6 +3801,7 @@ int whisper_full(
 
                 if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
                     const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
+
                     if (!text.empty()) {
                         const auto tt0 = params.speed_up ? 2*t0 : t0;
                         const auto tt1 = params.speed_up ? 2*t1 : t1;
@@ -4059,6 +4060,145 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
 
 // =================================================================================================
 
+//
+// Temporary interface needed for exposing ggml interface
+// Will be removed in the future when ggml becomes a separate library
+//
+
+WHISPER_API int whisper_bench_memcpy(int n_threads) {
+    ggml_time_init();
+
+    size_t n = 50;
+    size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
+
+    // 1 GB array
+    const size_t size = arr*1024llu*1024llu;
+
+    char * src = (char *) malloc(size);
+    char * dst = (char *) malloc(size);
+
+    for (size_t i = 0; i < size; i++) src[i] = i;
+
+    memcpy(dst, src, size); // heat-up
+
+    double tsum = 0.0;
+
+    for (size_t i = 0; i < n; i++) {
+        const int64_t t0 = ggml_time_us();
+
+        memcpy(dst, src, size);
+
+        const int64_t t1 = ggml_time_us();
+
+        tsum += (t1 - t0)*1e-6;
+
+        src[0] = rand();
+    }
+
+    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+
+    // needed to prevent the compile from optimizing the memcpy away
+    {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < size; i++) sum += dst[i];
+
+        fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
+    }
+
+    free(src);
+    free(dst);
+
+    return 0;
+}
+
+WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
+    ggml_time_init();
+
+    const int n_max = 128;
+
+    const std::vector<size_t> sizes = {
+        64, 128, 256, 512, 1024, 2048, 4096,
+    };
+
+    const size_t N_max = sizes.back();
+
+    // a: N*N*sizeof(float)
+    // b: N*N*sizeof(float)
+    // c: N*N*sizeof(float)
+    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+
+    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_fp16 = 0;
+        int n_fp32 = 0;
+
+        // GFLOPS/s
+        double s_fp16 = 0.0;
+        double s_fp32 = 0.0;
+
+        const size_t N = sizes[j];
+
+        for (int k = 0; k < 2; ++k) {
+            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+            double & s = k == 0 ? s_fp16 : s_fp32;
+            int & n = k == 0 ? n_fp16 : n_fp32;
+
+            struct ggml_init_params gparams = {
+                /*.mem_size   =*/ buf.size(),
+                /*.mem_buffer =*/ buf.data(),
+            };
+
+            struct ggml_context * ctx0 = ggml_init(gparams);
+
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
+            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+
+            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+
+            struct ggml_cgraph gf = ggml_build_forward(c);
+
+            gf.n_threads = n_threads;
+
+            double tsum = 0.0;
+
+            // heat-up
+            ggml_graph_compute(ctx0, &gf);
+
+            for (int i = 0; i < n_max; ++i) {
+                const int64_t t0 = ggml_time_us();
+
+                ggml_graph_compute(ctx0, &gf);
+
+                const int64_t t1 = ggml_time_us();
+
+                tsum += (t1 - t0)*1e-6;
+                n++;
+
+                if (tsum > 1.0 && n >= 3) {
+                    break;
+                }
+            }
+
+            ggml_free(ctx0);
+
+            s = ((2.0*N*N*N*n)/tsum)*1e-9;
+        }
+
+        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+    }
+
+    return 0;
+}
+
+// =================================================================================================
+
+// =================================================================================================
+
 //
 // Experimental stuff below
 //
diff --git a/whisper.h b/whisper.h
index 59231e9..51a1888 100644
--- a/whisper.h
+++ b/whisper.h
@@ -350,6 +350,13 @@ extern "C" {
     // Get the probability of the specified token in the specified segment.
     WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
 
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Temporary helpers needed for exposing ggml interface
+
+    WHISPER_API int whisper_bench_memcpy(int n_threads);
+    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
+
 #ifdef __cplusplus
 }
 #endif