From fbd513b813ea42a500ba92be3dcfea0b6b6a4fa3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Oct 2022 18:31:49 +0300 Subject: [PATCH] Add OpenBLAS support Supported via CMake - just add: cmake .. -DWHISPER_SUPPORT_OPENBLAS=ON On Ubuntu, you have to install the library like this: apt install libopenblas-dev Unfortunately, I don't observe any benefit compared to the original AVX2 + FP16 implementation. Maybe I'm missing something. --- CMakeLists.txt | 19 ++++++++++- ggml.c | 86 +++++++++++++++++++++++++------------------------- 2 files changed, 61 insertions(+), 44 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 447c8b9..cb03af9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,8 +41,13 @@ option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STAND option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF) +if (APPLE) + option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) +else() + option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) +endif() + option(WHISPER_PERF "whisper: enable perf timings" OFF) -option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) # sanitizers @@ -86,6 +91,18 @@ if (APPLE AND NOT WHISPER_NO_ACCELERATE) endif() endif() +if (WHISPER_SUPPORT_OPENBLAS) + find_library(OPENBLAS_LIB openblas) + if (OPENBLAS_LIB) + message(STATUS "OpenBLAS found") + + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${OPENBLAS_LIB}) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + # compiler flags if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) diff --git a/ggml.c b/ggml.c index 3a36802..e8384ed 100644 --- a/ggml.c +++ b/ggml.c @@ -76,6 +76,8 @@ typedef void* thread_ret_t; #ifdef GGML_USE_ACCELERATE #include <Accelerate/Accelerate.h> +#elif GGML_USE_OPENBLAS +#include <cblas.h> #endif // floating point type used to accumulate sums @@ -4055,46 +4057,44 @@ void ggml_compute_forward_mul_mat_f32( // nb00 
< nb01 - src0 is transposed // compute by src0 columns -//#ifdef GGML_USE_ACCELERATE -// if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { -// GGML_ASSERT(ggml_is_contiguous(src0)); -// GGML_ASSERT(nb10 == sizeof(float)); -// -// if (params->ith != 0) return; -// -// if (params->type == GGML_TASK_INIT) { -// return; -// } -// -// if (params->type == GGML_TASK_FINALIZE) { -// return; -// } -// -// float * const wdata = params->wdata; -// -// for (int i03 = 0; i03 < ne03; i03++) { -// for (int i02 = 0; i02 < ne02; i02++) { -// const float * x = (float *) (src0->data); -// const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); -// -// float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -// -// // zT = y * xT -// { -// cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, -// ne11, ne01, ne10, -// 1.0f, y, ne10, -// x, ne10, -// 0.0f, d, ne01); -// } -// } -// } -// -// //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); -// -// return; -// } -//#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->ith != 0) return; + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + const float * x = (float *) (src0->data); + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01); + } + } + } + + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif if (params->type == 
GGML_TASK_INIT) { if (nb01 >= nb00) { @@ -4301,7 +4301,7 @@ void ggml_compute_forward_mul_mat_f16_f32( // nb00 < nb01 - src0 is transposed // compute by src0 columns -#ifdef GGML_USE_ACCELERATE +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -6857,7 +6857,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#ifdef GGML_USE_ACCELERATE +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]); } else { @@ -8074,7 +8074,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) return 1; #else return 0;