From a62170c6562e7b75615ea350ab72f5c1e5b8d74e Mon Sep 17 00:00:00 2001
From: Abitofevrything <54505189+abitofevrything@users.noreply.github.com>
Date: Fri, 6 Jan 2023 17:45:59 +0100
Subject: [PATCH] ggml : add SSE3 and fp16 conversion lookup table (#368)

* Improves WASM performance: On MacBook M1 Pro, I observe 25% faster using Firefox and 35% faster using Chrome

* Add support for SSE3 SIMD

* Add SSE3 to system information

* Add Imath support for fp16-fp32 conversions

* Add Imath to system information

* Wrap Imath calls to avoid static function warnings

* Drop Imath; Add lookup table for f16 -> f32 conversions

* Remove TODO comments

* Update SSE3 to new macro arguments

* Correct updated macro definitions

* Prefer static inline where possible

* ggml : static inlines + add public f16 <-> f32 conversions

Co-authored-by: Georgi Gerganov
---
 Makefile    |   4 ++
 ggml.c      | 162 +++++++++++++++++++++++++++++++++++++++++++++-------
 ggml.h      |   1 +
 whisper.cpp |   1 +
 4 files changed, 147 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 0fc1768..a9d205a 100644
--- a/Makefile
+++ b/Makefile
@@ -84,6 +84,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	ifneq (,$(findstring f16c,$(F16C_M)))
 		CFLAGS += -mf16c
 	endif
+	SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
+	ifneq (,$(findstring sse3,$(SSE3_M)))
+		CFLAGS += -msse3
+	endif
 else ifeq ($(UNAME_S),Haiku)
 	AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
 	ifneq (,$(findstring avx,$(AVX1_M)))
diff --git a/ggml.c b/ggml.c
index ac37b0b..70728fa 100644
--- a/ggml.c
+++ b/ggml.c
@@ -124,13 +124,8 @@ typedef double ggml_float;
 //
 #include <arm_neon.h>
 
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
 
 #define GGML_FP16_TO_FP32(x) (x)
 #define GGML_FP32_TO_FP16(x) (x)
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #endif
 
 #ifdef __F16C__
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}
 
-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
 #else
@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
     return fp32.as_bits;
 }
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
     const uint32_t w = (uint32_t) h << 16;
     const uint32_t sign = w & UINT32_C(0x80000000);
     const uint32_t two_w = w + w;
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
     return fp32_from_bits(result);
 }
 
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
     const float scale_to_inf = 0x1.0p+112f;
     const float scale_to_zero = 0x1.0p-110f;
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #endif // __F16C__
 
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
 // precomputed exp table for f16 (128 KB)
 static ggml_fp16_t table_exp_f16[1 << 16];
 
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
 //
 // timing
 //
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_MUL    GGML_F16x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
 
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD _mm_add_ps
+#define GGML_F32x4_MUL _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    } \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4          __m128
+#define GGML_F32Cx4_ZERO     _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)  _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)  __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA      GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD      _mm_add_ps
+#define GGML_F32Cx4_MUL      _mm_mul_ps
+#define GGML_F32Cx4_REDUCE   GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx4
+#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
@@ -1269,7 +1381,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
-        // initialize GELU and EXP tables
+        // initialize GELU, EXP and F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
@@ -1277,7 +1389,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             for (int i = 0; i < (1 << 16); ++i) {
                 uint16_t ui = i;
                 memcpy(&ii, &ui, sizeof(ii));
-                const float f = GGML_FP16_TO_FP32(ii);
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
                 table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));
             }
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
 #endif
 }
 
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;
diff --git a/ggml.h b/ggml.h
index d066db9..f3c9e5a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
 #ifdef __cplusplus
diff --git a/whisper.cpp b/whisper.cpp
index b8c3acc..e8d9f0c 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
     s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
     s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
 
     return s.c_str();
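
Editor's note (not part of the patch): the key observation behind table_f32_f16 is that a half-precision value has only 2^16 possible bit patterns, so every conversion result can be precomputed once at startup and each later f16 -> f32 conversion becomes a single array load. Below is a minimal standalone sketch of that idea, under the following assumptions: the helper names (fp16_bits_to_fp32, init_table, lookup_fp16_to_fp32) are illustrative and are not ggml identifiers, and the bit manipulation is a simplified variant of the software fallback, not the exact code from ggml.c.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Software fp16 -> fp32 conversion: rebuild the float bit pattern field by field.
    static float fp16_bits_to_fp32(uint16_t h) {
        const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        const uint32_t exp  = (h >> 10) & 0x1F;
        const uint32_t man  =  h        & 0x3FF;

        uint32_t bits;
        if (exp == 0) {
            // zero / subnormal: value is man * 2^-24; let the FPU normalize it
            float f = man * (1.0f / 16777216.0f);
            memcpy(&bits, &f, sizeof(bits));
            bits |= sign;
        } else if (exp == 31) {
            bits = sign | 0x7F800000u | (man << 13);          // inf / NaN
        } else {
            bits = sign | ((exp + 112) << 23) | (man << 13);  // rebias 15 -> 127
        }

        float out;
        memcpy(&out, &bits, sizeof(out));
        return out;
    }

    // 256 KB table: one fp32 value per possible fp16 bit pattern.
    static float table_f32_f16[1 << 16];

    static void init_table(void) {
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            table_f32_f16[i] = fp16_bits_to_fp32((uint16_t) i);
        }
    }

    // After initialization, conversion is just an indexed load.
    static inline float lookup_fp16_to_fp32(uint16_t h) {
        return table_f32_f16[h];
    }

    int main(void) {
        init_table();
        // 0x3C00 is 1.0 in IEEE 754 half precision, 0xC000 is -2.0
        printf("%f %f\n", lookup_fp16_to_fp32(0x3C00), lookup_fp16_to_fp32(0xC000));
        return 0;
    }

The trade-off is memory for per-element work: the 256 KB table is filled once in ggml_init (alongside the GELU and EXP tables), while the reverse fp32 -> fp16 direction cannot use the same trick (2^32 entries) and keeps the computed path. The other addition, the SSE3 GGML_F32x4_REDUCE tail, relies on _mm_hadd_ps; a small sketch of that horizontal sum is below, assuming SSE3 and compilation with -msse3 (hsum_ps_sse3 is an illustrative name, not a ggml symbol).

    #include <pmmintrin.h>  // SSE3: _mm_hadd_ps

    // Horizontal sum of one __m128: two hadd steps collapse {a,b,c,d} to a+b+c+d.
    static inline float hsum_ps_sse3(__m128 v) {
        const __m128 t0 = _mm_hadd_ps(v, v);        // {a+b, c+d, a+b, c+d}
        return _mm_cvtss_f32(_mm_hadd_ps(t0, t0));  // a+b+c+d in lane 0
    }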