From aa198f617e310e37e16483d34a4375c3794d4b78 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 24 Dec 2022 10:16:42 +0200
Subject: [PATCH] ggml : generic reduce for all register sizes + comments

---
 ggml.c | 367 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 223 insertions(+), 144 deletions(-)

diff --git a/ggml.c b/ggml.c
index ba03cf9..232ae28 100644
--- a/ggml.c
+++ b/ggml.c
@@ -316,33 +316,28 @@ int64_t ggml_cycles_per_ms(void) {
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 //
-// fundamental operations
+// simd mappings
 //
 
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for a new architecture requires defining the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+//   number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+//   number of elements that fit in a single register
+//
 
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+
 #define GGML_SIMD
 
+// F32 NEON
+
 #define GGML_F32_STEP 16
-#define GGML_F32_EPS 4
-#define GGML_F32_ARR 4
+#define GGML_F32_EPR 4
 
 #define GGML_F32x4 float32x4_t
 #define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
@@ -353,16 +348,27 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32x4_ADD vaddq_f32
 #define GGML_F32x4_MUL vmulq_f32
 #if defined(__ARM_FEATURE_QRDMX)
-#define GGML_F32x4_REDUCE4(res, x) res = vaddvq_f32(vaddq_f32(vaddq_f32(x[0], x[1]), vaddq_f32(x[2], x[3])))
+    #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #else
-#define GGML_F32x4_REDUCE4(res, x) \
-{ \
-    x[0] = vaddq_f32(x[0], x[1]); \
-    x[2] = vaddq_f32(x[2], x[3]); \
-    x[0] = vaddq_f32(x[0], x[2]); \
-    res = vgetq_lane_f32(x[0], 0) + vgetq_lane_f32(x[0], 1) + vgetq_lane_f32(x[0], 2) + vgetq_lane_f32(x[0], 3); \
-}
+    #define GGML_F32x4_REDUCE_ONE(x) \
+    (vgetq_lane_f32(x, 0) + \
+     vgetq_lane_f32(x, 1) + \
+     vgetq_lane_f32(x, 2) + \
+     vgetq_lane_f32(x, 3))
 #endif
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
+    } \
+    res = GGML_F32x4_REDUCE_ONE(x[0]); \
+}
 
 #define GGML_F32_VEC GGML_F32x4
 #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
@@ -372,12 +378,13 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32_VEC_FMA GGML_F32x4_FMA
 #define GGML_F32_VEC_ADD GGML_F32x4_ADD
 #define GGML_F32_VEC_MUL GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE4
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 NEON
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     #define GGML_F16_STEP 32
-    #define GGML_F16_EPS 8
-    #define GGML_F16_ARR 4
+    #define GGML_F16_EPR 8
 
     #define GGML_F16x8 float16x8_t
     #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
@@ -387,15 +394,20 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
     #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
     #define GGML_F16x8_ADD vaddq_f16
     #define GGML_F16x8_MUL vmulq_f16
-    #define GGML_F16x8_REDUCE4(res, x) \
-    { \
-        x[0] = vaddq_f16(x[0], x[1]); \
-        x[2] = vaddq_f16(x[2], x[3]); \
-        x[0] = vaddq_f16(x[0], x[2]); \
-        float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-        float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-        t0 = vaddq_f32 (t0, t1); \
-        res = vaddvq_f32(t0); \
+    #define GGML_F16x8_REDUCE(res, x) \
+    { \
+        for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
+            x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
+        } \
+        for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
+            x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
+        } \
+        for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
+            x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
+        } \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
+        res = vaddvq_f32(vaddq_f32(t0, t1)); \
     }
 
     #define GGML_F16_VEC GGML_F16x8
@@ -406,40 +418,43 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
     #define GGML_F16_VEC_FMA GGML_F16x8_FMA
     #define GGML_F16_VEC_ADD GGML_F16x8_ADD
     #define GGML_F16_VEC_MUL GGML_F16x8_MUL
-    #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE4
+    #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
 #else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
    #define GGML_F16_STEP 16
-    #define GGML_F16_EPS 4
-    #define GGML_F16_ARR 4
-
-    #define GGML_F16x4 float32x4_t
-    #define GGML_F16x4_ZERO vdupq_n_f32(0.0f)
-    #define GGML_F16x4_SET1(x) vdupq_n_f32(x)
-    #define GGML_F16x4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
-    #define GGML_F16x4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
-    #define GGML_F16x4_FMA(a, b, c) vfmaq_f32(a, b, c)
-    #define GGML_F16x4_ADD vaddq_f32
-    #define GGML_F16x4_MUL vmulq_f32
-    #define GGML_F16x4_REDUCE4 GGML_F32x4_REDUCE4
-
-    #define GGML_F16_VEC GGML_F16x4
-    #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
-    #define GGML_F16_VEC_SET1 GGML_F16x4_SET1
-    #define GGML_F16_VEC_LOAD GGML_F16x4_LOAD
-    #define GGML_F16_VEC_STORE GGML_F16x4_STORE
-    #define GGML_F16_VEC_FMA GGML_F16x4_FMA
-    #define GGML_F16_VEC_ADD GGML_F16x4_ADD
-    #define GGML_F16_VEC_MUL GGML_F16x4_MUL
-    #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE4
+    #define GGML_F16_EPR 4
+
+    #define GGML_F32Cx4 float32x4_t
+    #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
+    #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
+    #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
+    #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
+    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define GGML_F32Cx4_ADD vaddq_f32
+    #define GGML_F32Cx4_MUL vmulq_f32
+    #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+    #define GGML_F16_VEC GGML_F32Cx4
+    #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+    #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+    #define GGML_F16_VEC_LOAD GGML_F32Cx4_LOAD
+    #define GGML_F16_VEC_STORE GGML_F32Cx4_STORE
+    #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+    #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+    #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+    #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 #endif
 
 #elif defined(__AVX__)
 
 #define GGML_SIMD
 
+// F32 AVX
+
 #define GGML_F32_STEP 32
-#define GGML_F32_EPS 8
-#define GGML_F32_ARR 4
+#define GGML_F32_EPR 8
 
 #define GGML_F32x8 __m256
 #define GGML_F32x8_ZERO _mm256_setzero_ps()
@@ -453,13 +468,21 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #endif
 #define GGML_F32x8_ADD _mm256_add_ps
 #define GGML_F32x8_MUL _mm256_mul_ps
-#define GGML_F32x8_REDUCE4(res, x) \
-{ \
-    x[0] = _mm256_add_ps(x[0], x[1]); \
-    x[2] = _mm256_add_ps(x[2], x[3]); \
-    x[0] = _mm256_add_ps(x[0], x[2]); \
-    const __m128 t0 = _mm_add_ps(_mm256_extractf128_ps(x[0], 0), _mm256_extractf128_ps(x[0], 1)); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(t0, t0), t0)); \
+#define GGML_F32x8_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
+    } \
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
+                                 _mm256_extractf128_ps(x[0], 1)); \
+    const __m128 t1 = _mm_hadd_ps(t0, t0); \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
 }
 // TODO: is this optimal ?
 
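A sketch of what the generic reduce does, in plain C with scalars standing in for the vector registers (the helper name reduce_sketch is illustrative, not part of the patch): each pass halves the number of live accumulators, so any GGML_F32_ARR in {1, 2, 4, 8} collapses into x[0] before the final per-register horizontal add (REDUCE_ONE on NEON, the hadd sequence on AVX). Because the loop bounds are compile-time constants, the passes unroll, and for GGML_F32_ARR == 4 they become exactly the three adds the old REDUCE4 spelled out by hand:

    static inline float reduce_sketch(float x[], const int arr) {
        for (int i = 0; i < arr/2; ++i) { x[2*i] += x[2*i+1]; } // x[0]+=x[1], x[2]+=x[3], ...
        for (int i = 0; i < arr/4; ++i) { x[4*i] += x[4*i+2]; } // x[0]+=x[2], ...
        for (int i = 0; i < arr/8; ++i) { x[8*i] += x[8*i+4]; } // only runs when arr == 8
        return x[0]; // stands in for the horizontal add of the surviving register
    }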
@@ -471,40 +494,45 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32_VEC_FMA GGML_F32x8_FMA
 #define GGML_F32_VEC_ADD GGML_F32x8_ADD
 #define GGML_F32_VEC_MUL GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE4
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
 
 #define GGML_F16_STEP 32
-#define GGML_F16_EPS 8
-#define GGML_F16_ARR 4
-
-#define GGML_F16x8 __m256
-#define GGML_F16x8_ZERO _mm256_setzero_ps()
-#define GGML_F16x8_SET1(x) _mm256_set1_ps(x)
-#define GGML_F16x8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
-#define GGML_F16x8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
-#define GGML_F16x8_FMA GGML_F32x8_FMA
-#define GGML_F16x8_ADD _mm256_add_ps
-#define GGML_F16x8_MUL _mm256_mul_ps
-#define GGML_F16x8_REDUCE4 GGML_F32x8_REDUCE4
-
-#define GGML_F16_VEC GGML_F16x8
-#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
-#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
-#define GGML_F16_VEC_LOAD GGML_F16x8_LOAD
-#define GGML_F16_VEC_STORE GGML_F16x8_STORE
-#define GGML_F16_VEC_FMA GGML_F16x8_FMA
-#define GGML_F16_VEC_ADD GGML_F16x8_ADD
-#define GGML_F16_VEC_MUL GGML_F16x8_MUL
-#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE4
+#define GGML_F16_EPR 8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
+
+#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#define GGML_F32Cx8_FMA GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD _mm256_add_ps
+#define GGML_F32Cx8_MUL _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx8
+#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD GGML_F32Cx8_LOAD
+#define GGML_F16_VEC_STORE GGML_F32Cx8_STORE
+#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
 
 #elif defined(__POWER9_VECTOR__)
 
 // TODO: uncomment this when it works
 //#define GGML_SIMD
 
+// F32 POWER9
+
 #define GGML_F32_STEP 32
-#define GGML_F32_EPS 8
-#define GGML_F32_ARR 4
+#define GGML_F32_EPR 8
 
 // TODO: not tested !!
 #define GGML_F32x4 __vector float
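The F32Cx8 mappings above (like the F32Cx4 fallback on NEON) keep all arithmetic in F32 and touch F16 only at the memory boundary: loads widen with _mm256_cvtph_ps, stores narrow with _mm256_cvtps_ph. A hypothetical usage sketch, assuming GGML_SIMD is defined and y holds one full register of GGML_F16_EPR elements (the helper name f16_scale_sketch is illustrative, not part of the patch):

    static void f16_scale_sketch(ggml_fp16_t * y, const float v) {
        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); // broadcast v, kept as F32
        GGML_F16_VEC ay = GGML_F16_VEC_LOAD(y); // F16 -> F32 on load
        ay = GGML_F16_VEC_MUL(ay, vx);          // multiply in F32
        GGML_F16_VEC_STORE(y, ay);              // F32 -> F16 on store
    }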
@@ -515,15 +543,21 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
 #define GGML_F32x4_ADD vec_add
 #define GGML_F32x4_MUL vec_mul
-#define GGML_F32x4_REDUCE4(res, x) \
-{ \
-    x[0] = vec_add(x[0], x[1]); \
-    x[2] = vec_add(x[2], x[3]); \
-    x[0] = vec_add(x[0], x[2]); \
-    res = vec_extract(x[0], 0) + \
-          vec_extract(x[0], 1) + \
-          vec_extract(x[0], 2) + \
-          vec_extract(x[0], 3); \
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = vec_add(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = vec_add(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = vec_add(x[8*i], x[8*i+4]); \
+    } \
+    res = vec_extract(x[0], 0) + \
+          vec_extract(x[0], 1) + \
+          vec_extract(x[0], 2) + \
+          vec_extract(x[0], 3); \
 }
 
 #define GGML_F32_VEC GGML_F32x4
@@ -534,17 +568,20 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32_VEC_FMA GGML_F32x4_FMA
 #define GGML_F32_VEC_ADD GGML_F32x4_ADD
 #define GGML_F32_VEC_MUL GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE4
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
 
-// TODO: implement F16 ..
+// F16 POWER9
+// TODO: implement here
+// ...
 
 #elif defined(__wasm_simd128__)
 
 #define GGML_SIMD
 
+// F32 WASM
+
 #define GGML_F32_STEP 16
-#define GGML_F32_EPS 4
-#define GGML_F32_ARR 4
+#define GGML_F32_EPR 4
 
 #define GGML_F32x4 v128_t
 #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
@@ -554,15 +591,21 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
 #define GGML_F32x4_ADD wasm_f32x4_add
 #define GGML_F32x4_MUL wasm_f32x4_mul
-#define GGML_F32x4_REDUCE4(res, x) \
-{ \
-    x[0] = wasm_f32x4_add(x[0], x[1]); \
-    x[2] = wasm_f32x4_add(x[2], x[3]); \
-    x[0] = wasm_f32x4_add(x[0], x[2]); \
-    res = wasm_f32x4_extract_lane(x[0], 0) + \
-          wasm_f32x4_extract_lane(x[0], 1) + \
-          wasm_f32x4_extract_lane(x[0], 2) + \
-          wasm_f32x4_extract_lane(x[0], 3); \
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    } \
+    res = wasm_f32x4_extract_lane(x[0], 0) + \
+          wasm_f32x4_extract_lane(x[0], 1) + \
+          wasm_f32x4_extract_lane(x[0], 2) + \
+          wasm_f32x4_extract_lane(x[0], 3); \
 }
 
 #define GGML_F32_VEC GGML_F32x4
@@ -573,11 +616,12 @@ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, co
 #define GGML_F32_VEC_FMA GGML_F32x4_FMA
 #define GGML_F32_VEC_ADD GGML_F32x4_ADD
 #define GGML_F32_VEC_MUL GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE4
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 WASM
 
 #define GGML_F16_STEP 16
-#define GGML_F16_EPS 4
-#define GGML_F16_ARR 4
+#define GGML_F16_EPR 4
 
 inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
     float tmp[4];
@@ -609,15 +653,21 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_FMA GGML_F32x4_FMA
 #define GGML_F16x4_ADD wasm_f32x4_add
 #define GGML_F16x4_MUL wasm_f32x4_mul
-#define GGML_F16x4_REDUCE4(res, x) \
-{ \
-    x[0] = wasm_f32x4_add(x[0], x[1]); \
-    x[2] = wasm_f32x4_add(x[2], x[3]); \
-    x[0] = wasm_f32x4_add(x[0], x[2]); \
-    res = wasm_f32x4_extract_lane(x[0], 0) + \
-          wasm_f32x4_extract_lane(x[0], 1) + \
-          wasm_f32x4_extract_lane(x[0], 2) + \
-          wasm_f32x4_extract_lane(x[0], 3); \
+#define GGML_F16x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
+        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
+        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
+        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    } \
+    res = wasm_f32x4_extract_lane(x[0], 0) + \
+          wasm_f32x4_extract_lane(x[0], 1) + \
+          wasm_f32x4_extract_lane(x[0], 2) + \
+          wasm_f32x4_extract_lane(x[0], 3); \
 }
 
 #define GGML_F16_VEC GGML_F16x4
@@ -628,10 +678,39 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_FMA GGML_F16x4_FMA
 #define GGML_F16_VEC_ADD GGML_F16x4_ADD
 #define GGML_F16_VEC_MUL GGML_F16x4_MUL
-#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE4
+#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
 
 #endif
 
+// GGML_F32_ARR / GGML_F16_ARR
+//   number of registers to use per step
+#ifdef GGML_SIMD
+#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
+#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
+#endif
+
+//
+// fundamental operations
+//
+
+inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
+inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
+
 inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
     ggml_float sumf = 0.0;
 
@@ -645,8 +724,8 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
 
     for (int i = 0; i < np; i += GGML_F32_STEP) {
         for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPS);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPS);
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
         }
 
@@ -682,8 +761,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 
     for (int i = 0; i < np; i += GGML_F16_STEP) {
         for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPS);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPS);
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
 
             sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
         }
@@ -769,11 +848,11 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 
     for (int i = 0; i < np; i += GGML_F32_STEP) {
         for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPS);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPS);
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
             ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
 
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPS, ay[j]);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
         }
     }
 
@@ -800,11 +879,11 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
 
     for (int i = 0; i < np; i += GGML_F16_STEP) {
         for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPS);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPS);
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
 
             ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
 
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPS, ay[j]);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay[j]);
         }
     }
 
@@ -890,10 +969,10 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
     for (int i = 0; i < np; i += GGML_F32_STEP) {
         for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPS);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
             ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
 
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPS, ay[j]);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
         }
     }
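Taken together, the renamed constants describe one loop shape: each outer iteration consumes GGML_F32_STEP elements using GGML_F32_ARR registers of GGML_F32_EPR lanes each, GGML_F32_VEC_REDUCE folds the accumulators into a scalar, and a plain loop handles the remainder. A sketch of ggml_vec_dot_f32 assembled from the hunks above; the np mask and the leftover loop are assumed from the surrounding file rather than shown in this diff:

    inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
        ggml_float sumf = 0.0;

    #ifdef GGML_SIMD
        const int np = (n & ~(GGML_F32_STEP - 1)); // assumed: largest multiple of STEP <= n

        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

        GGML_F32_VEC ax[GGML_F32_ARR];
        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
            }
        }

        // fold sum[0..GGML_F32_ARR-1] into a single scalar
        GGML_F32_VEC_REDUCE(sumf, sum);

        // leftovers
        for (int i = np; i < n; ++i) {
            sumf += x[i]*y[i];
        }
    #else
        // scalar fallback
        for (int i = 0; i < n; ++i) {
            sumf += x[i]*y[i];
        }
    #endif

        *s = sumf;
    }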