Add support for SSE3 SIMD

pull/368/head
Abitofevrything 3 years ago
parent 68daf6e487
commit c4d8664603

@ -81,6 +81,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
else ifeq ($(UNAME_S),Haiku)
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
ifneq (,$(findstring avx,$(AVX1_M)))

@ -680,6 +680,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
#elif defined(__SSE3__)
#define GGML_SIMD
// F32 SSE
// TODO: 32?
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4
#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD _mm_loadu_ps
#define GGML_F32x4_STORE _mm_storeu_ps
#if defined(__FMA__)
// TODO: Does this work?
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
} \
for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
} \
for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
} \
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
// F16 SSE
// TODO: 32?
#define GGML_F16_STEP 32
#define GGML_F16_EPR 4
inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);
return _mm_loadu_ps(tmp);
}
inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
float arr[4];
_mm_storeu_ps(arr, y);
x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
}
#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD _mm_add_ps
#define GGML_F32Cx4_MUL _mm_mul_ps
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD GGML_F32Cx4_LOAD
#define GGML_F16_VEC_STORE GGML_F32Cx4_STORE
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
// GGML_F32_ARR / GGML_F16_ARR

Loading…
Cancel
Save