@@ -680,6 +680,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
#define GGML_F16_VEC_MUL    GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

// TODO: 32?
#define GGML_F32_STEP 32
#define GGML_F32_EPR  4
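// (each 128-bit SSE register holds 4 floats; a "step" presumably covers
// GGML_F32_STEP elements spread across GGML_F32_STEP/GGML_F32_EPR registers)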

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
#if defined(__FMA__)
    // TODO: Does this work?
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
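// Both variants compute a + b*c: _mm_fmadd_ps(b, c, a) is a single fused
// multiply-add (FMA3), the fallback uses a separate multiply and add.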
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x)                  \
{                                                  \
    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);     \
    }                                              \
    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);     \
    }                                              \
    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);     \
    }                                              \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);     \
    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));      \
}
// TODO: is this optimal ?
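// The macro first folds the GGML_F32_ARR accumulator registers pairwise into
// x[0] (a tree reduction), then collapses the 4 lanes of x[0] to a scalar with
// two horizontal adds (SSE3 _mm_hadd_ps).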

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
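//
// Sketch of how the GGML_F32_VEC macros are meant to compose into a dot
// product (illustrative only; assumes GGML_F32_ARR == GGML_F32_STEP/GGML_F32_EPR
// and covers only the part of n that is a multiple of GGML_F32_STEP):
//
//     float sumf = 0.0f;
//     GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
//     GGML_F32_VEC ax[GGML_F32_ARR];
//     GGML_F32_VEC ay[GGML_F32_ARR];
//
//     const int np = (n & ~(GGML_F32_STEP - 1));
//     for (int i = 0; i < np; i += GGML_F32_STEP) {
//         for (int j = 0; j < GGML_F32_ARR; j++) {
//             ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//             ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//             sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
//         }
//     }
//     GGML_F32_VEC_REDUCE(sumf, sum);
//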

// F16 SSE

// TODO: 32?
#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
    float tmp[4];

    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_FP32_TO_FP16(arr[0]);
    x[1] = GGML_FP32_TO_FP16(arr[1]);
    x[2] = GGML_FP32_TO_FP16(arr[2]);
    x[3] = GGML_FP32_TO_FP16(arr[3]);
}
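// Plain SSE3 has no native FP16 support, so these helpers convert four
// half-precision values to/from FP32 one element at a time through a small
// stack buffer; the GGML_F32Cx4 macros below route F16 data through them.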

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC        GGML_F32Cx4
#define GGML_F16_VEC_ZERO   GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1   GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD   GGML_F32Cx4_LOAD
#define GGML_F16_VEC_STORE  GGML_F32Cx4_STORE
#define GGML_F16_VEC_FMA    GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD    GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL    GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
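//   (presumably the number of registers used per step, i.e.
//   GGML_F32_STEP/GGML_F32_EPR and GGML_F16_STEP/GGML_F16_EPR)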