diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp
index 5f99774..49daaa0 100644
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@@ -1,11 +1,8 @@
-#include "ggml.h"
 #include "whisper.h"
 
 #include <cstdio>
-#include <cstring>
 #include <string>
 #include <thread>
-#include <vector>
 
 // command-line parameters
 struct whisper_params {
@@ -53,7 +50,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "\n");
 }
 
-int bench_whisper_encoder(const whisper_params & params) {
+int whisper_bench_encoder(const whisper_params & params) {
     // whisper init
 
     struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -96,132 +93,6 @@ int bench_whisper_encoder(const whisper_params & params) {
     return 0;
 }
 
-int bench_memcpy(const whisper_params & params) {
-    size_t n = 50;
-    size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
-
-    // 1 GB array
-    const size_t size = arr*1024llu*1024llu;
-
-    char * src = (char *) malloc(size);
-    char * dst = (char *) malloc(size);
-
-    for (size_t i = 0; i < size; i++) src[i] = i;
-
-    memcpy(dst, src, size); // heat-up
-
-    double tsum = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        const int64_t t0 = ggml_time_us();
-
-        memcpy(dst, src, size);
-
-        const int64_t t1 = ggml_time_us();
-
-        tsum += (t1 - t0)*1e-6;
-
-        src[0] = rand();
-    }
-
-    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
-
-    // needed to prevent the compile from optimizing the memcpy away
-    {
-        double sum = 0.0;
-
-        for (size_t i = 0; i < size; i++) sum += dst[i];
-
-        fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
-    }
-
-    free(src);
-    free(dst);
-
-    return 0;
-}
-
-int bench_ggml_mul_mat(const whisper_params & params) {
-    const int n_max = 128;
-
-    const std::vector<size_t> sizes = {
-        64, 128, 256, 512, 1024, 2048, 4096,
-    };
-
-    const size_t N_max = sizes.back();
-
-    // a: N*N*sizeof(float)
-    // b: N*N*sizeof(float)
-    // c: N*N*sizeof(float)
-    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
-
-    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
-
-    for (int j = 0; j < (int) sizes.size(); j++) {
-        int n_fp16 = 0;
-        int n_fp32 = 0;
-
-        // GFLOPS/s
-        double s_fp16 = 0.0;
-        double s_fp32 = 0.0;
-
-        const size_t N = sizes[j];
-
-        for (int k = 0; k < 2; ++k) {
-            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
-            double & s = k == 0 ? s_fp16 : s_fp32;
-            int    & n = k == 0 ? n_fp16 : n_fp32;
-
-            struct ggml_init_params gparams = {
-                /*.mem_size   =*/ buf.size(),
-                /*.mem_buffer =*/ buf.data(),
-            };
-
-            struct ggml_context * ctx0 = ggml_init(gparams);
-
-            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
-            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
-
-            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
-
-            struct ggml_cgraph gf = ggml_build_forward(c);
-
-            gf.n_threads = params.n_threads;
-
-            double tsum = 0.0;
-
-            // heat-up
-            ggml_graph_compute(ctx0, &gf);
-
-            for (int i = 0; i < n_max; ++i) {
-                const int64_t t0 = ggml_time_us();
-
-                ggml_graph_compute(ctx0, &gf);
-
-                const int64_t t1 = ggml_time_us();
-
-                tsum += (t1 - t0)*1e-6;
-                n++;
-
-                if (tsum > 1.0 && n >= 3) {
-                    break;
-                }
-            }
-
-            ggml_free(ctx0);
-
-            s = ((2.0*N*N*N*n)/tsum)*1e-9;
-        }
-
-        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
-                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
-    }
-
-    return 0;
-}
-
 int main(int argc, char ** argv) {
     whisper_params params;
 
@@ -229,14 +100,12 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    ggml_time_init();
-
     int ret = -1;
 
     switch (params.what) {
-        case 0: ret = bench_whisper_encoder(params); break;
-        case 1: ret = bench_memcpy(params); break;
-        case 2: ret = bench_ggml_mul_mat(params); break;
+        case 0: ret = whisper_bench_encoder(params); break;
+        case 1: ret = whisper_bench_memcpy(params.n_threads); break;
+        case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
         default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
     }
 
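Note: after this change the bench example only parses arguments and dispatches; the memcpy and matrix-multiplication benchmarks themselves now sit behind the public whisper API. A minimal standalone driver along the following lines could exercise the same two entry points (the thread-count choice and the return-value handling here are illustrative assumptions, not part of the patch):

#include "whisper.h"

#include <algorithm>
#include <cstdio>
#include <thread>

int main() {
    // cap the illustrative thread count at the number of hardware threads
    const int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));

    fprintf(stderr, "running whisper benchmarks with %d threads\n", n_threads);

    // both entry points return 0 on success, like the local helpers they replace
    int ret = 0;
    ret |= whisper_bench_memcpy      (n_threads);
    ret |= whisper_bench_ggml_mul_mat(n_threads);

    return ret;
}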
diff --git a/whisper.cpp b/whisper.cpp
index 5aa3be1..f408c79 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -3801,6 +3801,7 @@ int whisper_full(
 
                 if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
                     const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
+
                     if (!text.empty()) {
                         const auto tt0 = params.speed_up ? 2*t0 : t0;
                         const auto tt1 = params.speed_up ? 2*t1 : t1;
@@ -4059,6 +4060,145 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
 
 // =================================================================================================
 
+//
+// Temporary interface needed for exposing ggml interface
+// Will be removed in the future when ggml becomes a separate library
+//
+
+WHISPER_API int whisper_bench_memcpy(int n_threads) {
+    ggml_time_init();
+
+    size_t n = 50;
+    size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
+
+    // 1 GB array
+    const size_t size = arr*1024llu*1024llu;
+
+    char * src = (char *) malloc(size);
+    char * dst = (char *) malloc(size);
+
+    for (size_t i = 0; i < size; i++) src[i] = i;
+
+    memcpy(dst, src, size); // heat-up
+
+    double tsum = 0.0;
+
+    for (size_t i = 0; i < n; i++) {
+        const int64_t t0 = ggml_time_us();
+
+        memcpy(dst, src, size);
+
+        const int64_t t1 = ggml_time_us();
+
+        tsum += (t1 - t0)*1e-6;
+
+        src[0] = rand();
+    }
+
+    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+
+    // needed to prevent the compile from optimizing the memcpy away
+    {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < size; i++) sum += dst[i];
+
+        fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
+    }
+
+    free(src);
+    free(dst);
+
+    return 0;
+}
+
+WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
+    ggml_time_init();
+
+    const int n_max = 128;
+
+    const std::vector<size_t> sizes = {
+        64, 128, 256, 512, 1024, 2048, 4096,
+    };
+
+    const size_t N_max = sizes.back();
+
+    // a: N*N*sizeof(float)
+    // b: N*N*sizeof(float)
+    // c: N*N*sizeof(float)
+    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+
+    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_fp16 = 0;
+        int n_fp32 = 0;
+
+        // GFLOPS/s
+        double s_fp16 = 0.0;
+        double s_fp32 = 0.0;
+
+        const size_t N = sizes[j];
+
+        for (int k = 0; k < 2; ++k) {
+            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+            double & s = k == 0 ? s_fp16 : s_fp32;
+            int    & n = k == 0 ? n_fp16 : n_fp32;
+
+            struct ggml_init_params gparams = {
+                /*.mem_size   =*/ buf.size(),
+                /*.mem_buffer =*/ buf.data(),
+            };
+
+            struct ggml_context * ctx0 = ggml_init(gparams);
+
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
+            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+
+            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+
+            struct ggml_cgraph gf = ggml_build_forward(c);
+
+            gf.n_threads = n_threads;
+
+            double tsum = 0.0;
+
+            // heat-up
+            ggml_graph_compute(ctx0, &gf);
+
+            for (int i = 0; i < n_max; ++i) {
+                const int64_t t0 = ggml_time_us();
+
+                ggml_graph_compute(ctx0, &gf);
+
+                const int64_t t1 = ggml_time_us();
+
+                tsum += (t1 - t0)*1e-6;
+                n++;
+
+                if (tsum > 1.0 && n >= 3) {
+                    break;
+                }
+            }
+
+            ggml_free(ctx0);
+
+            s = ((2.0*N*N*N*n)/tsum)*1e-9;
+        }
+
+        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+    }
+
+    return 0;
+}
+
+// =================================================================================================
+
+// =================================================================================================
+
 //
 // Experimental stuff below
 //
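For context on the numbers printed above: whisper_bench_ggml_mul_mat counts each N x N by N x N product as roughly 2*N*N*N floating point operations (one multiply and one add per accumulated term), so the reported GFLOPS follow directly from the run count and the accumulated time. A small sketch of that accounting, with made-up run counts and timings rather than real measurements:

#include <cstdio>

int main() {
    const double N    = 1024.0; // one of the benchmarked matrix sizes
    const double n    = 10.0;   // hypothetical number of timed graph evaluations
    const double tsum = 0.5;    // hypothetical total time spent in those runs, in seconds

    // same expression as the benchmark: s = ((2.0*N*N*N*n)/tsum)*1e-9
    const double s = ((2.0*N*N*N*n)/tsum)*1e-9;

    printf("ggml_mul_mat: %5.0f x %5.0f: %8.1f GFLOPS (%3.0f runs)\n", N, N, s, n); // ~42.9 GFLOPS here
    return 0;
}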
"ok" : "error"); + } + + free(src); + free(dst); + + return 0; +} + +WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) { + ggml_time_init(); + + const int n_max = 128; + + const std::vector sizes = { + 64, 128, 256, 512, 1024, 2048, 4096, + }; + + const size_t N_max = sizes.back(); + + // a: N*N*sizeof(float) + // b: N*N*sizeof(float) + // c: N*N*sizeof(float) + // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) + std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*256); + + for (size_t i = 0; i < buf.size(); i++) buf[i] = i; + + for (int j = 0; j < (int) sizes.size(); j++) { + int n_fp16 = 0; + int n_fp32 = 0; + + // GFLOPS/s + double s_fp16 = 0.0; + double s_fp32 = 0.0; + + const size_t N = sizes[j]; + + for (int k = 0; k < 2; ++k) { + const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32; + + double & s = k == 0 ? s_fp16 : s_fp32; + int & n = k == 0 ? n_fp16 : n_fp32; + + struct ggml_init_params gparams = { + /*.mem_size =*/ buf.size(), + /*.mem_buffer =*/ buf.data(), + }; + + struct ggml_context * ctx0 = ggml_init(gparams); + + struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N); + struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N); + + struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b); + + struct ggml_cgraph gf = ggml_build_forward(c); + + gf.n_threads = n_threads; + + double tsum = 0.0; + + // heat-up + ggml_graph_compute(ctx0, &gf); + + for (int i = 0; i < n_max; ++i) { + const int64_t t0 = ggml_time_us(); + + ggml_graph_compute(ctx0, &gf); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0)*1e-6; + n++; + + if (tsum > 1.0 && n >= 3) { + break; + } + } + + ggml_free(ctx0); + + s = ((2.0*N*N*N*n)/tsum)*1e-9; + } + + fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); + } + + return 0; +} + +// ================================================================================================= + +// ================================================================================================= + // // Experimental stuff below // diff --git a/whisper.h b/whisper.h index 59231e9..51a1888 100644 --- a/whisper.h +++ b/whisper.h @@ -350,6 +350,13 @@ extern "C" { // Get the probability of the specified token in the specified segment. WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token); + //////////////////////////////////////////////////////////////////////////// + + // Temporary helpers needed for exposing ggml interface + + WHISPER_API int whisper_bench_memcpy(int n_threads); + WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads); + #ifdef __cplusplus } #endif