ggml : enable f32 SIMD for POWER9 ppc64le

pull/366/head
Thomas Fitzsimmons 3 years ago
parent cdbe556f81
commit eb87ee58e7

@@ -526,39 +526,31 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
#elif defined(__POWER9_VECTOR__)
-// TODO: uncomment this when it works
-//#define GGML_SIMD
+#define GGML_SIMD
// F32 POWER9
#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
+#define GGML_F32_EPR  4
-// TODO: not tested !!
-#define GGML_F32x4              __vector float
-#define GGML_F32x4_ZERO         (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
-#define GGML_F32x4_SET1(x)      (__vector float){x, x, x, x}
-#define GGML_F32x4_LOAD         vec_vsx_ld
-#define GGML_F32x4_STORE        vec_vsx_st
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \
-    }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \
-    }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \
-    }                                          \
-    res = vec_extract(x[0], 0) +               \
-          vec_extract(x[0], 1) +               \
-          vec_extract(x[0], 2) +               \
-          vec_extract(x[0], 3);                \
-}
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(sumf, sum)                        \
+    sum[0] = vec_add(sum[0], sum[1]);                       \
+    sum[2] = vec_add(sum[2], sum[3]);                       \
+    sum[4] = vec_add(sum[4], sum[5]);                       \
+    sum[6] = vec_add(sum[6], sum[7]);                       \
+    sum[0] = vec_add(sum[0], sum[2]);                       \
+    sum[4] = vec_add(sum[4], sum[6]);                       \
+    sum[0] = vec_add(sum[0], sum[4]);                       \
+    sumf = vec_extract(sum[0], 0) + vec_extract(sum[0], 1)  \
+         + vec_extract(sum[0], 2) + vec_extract(sum[0], 3);
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
@@ -571,12 +563,13 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
// F16 POWER9
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-#define GGML_F16_VEC  vector float
-#define GGML_F16_ARR  (GGML_F16_STEP/GGML_F16_EPR)
-#define GGML_F16_VEC_ZERO 0.0f
-#define GGML_F16_VEC_SET1 vec_splats
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// 1. Use vec_xl, not vec_ld, in case the load address is not aligned.
// 2. The load index is doubled because we're loading eight two-byte
// ggml_fp16_ts then converting them to four four-byte fp32s.
@@ -587,17 +580,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
  if (i & 0x1)                                            \
    vec_xst(vec_pack_to_short_fp32(r[i], r[i & ~0x1]),    \
            (i & ~0x1) * GGML_F16_EPR * 2, p)
-#define GGML_F16_VEC_FMA(s, a, b) vec_madd(a, b, s)
-#define GGML_F16_VEC_REDUCE(sumf, sum)                      \
-    sum[0] = vec_add(sum[0], sum[1]);                       \
-    sum[2] = vec_add(sum[2], sum[3]);                       \
-    sum[4] = vec_add(sum[4], sum[5]);                       \
-    sum[6] = vec_add(sum[6], sum[7]);                       \
-    sum[0] = vec_add(sum[0], sum[2]);                       \
-    sum[4] = vec_add(sum[4], sum[6]);                       \
-    sum[0] = vec_add(sum[0], sum[4]);                       \
-    sumf = vec_extract(sum[0], 0) + vec_extract(sum[0], 1)  \
-         + vec_extract(sum[0], 2) + vec_extract(sum[0], 3);
#elif defined(__wasm_simd128__)
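
Note (not part of the patch): the F16 comments and store macro kept above describe widening eight 2-byte ggml_fp16_t values into 4-byte floats and narrowing them back with POWER9 built-ins. A minimal standalone sketch of that round trip, assuming gcc on ppc64le with -mcpu=power9; the function names are illustrative, and which four values land in the "low" versus "high" result should be verified against the GCC element-ordering rules for the target.

// Standalone illustration, not ggml code.
#include <altivec.h>
#include <stdint.h>

typedef uint16_t ggml_fp16_t;   // fp16 storage type, matching ggml's typedef

// Load eight 2-byte fp16 values (vec_xl tolerates unaligned addresses) and
// widen them into two vectors of four 4-byte floats.
static void load_f16x8_as_f32x4x2(const ggml_fp16_t *p,
                                  vector float *lo, vector float *hi) {
    vector unsigned short h = vec_xl(0, p);     // 8 x fp16 bit patterns
    *lo = vec_extract_fp32_from_shortl(h);      // four of them, widened
    *hi = vec_extract_fp32_from_shorth(h);      // the other four, widened
}

// Narrow two vectors of four floats back to eight fp16 values and store them.
// The (hi, lo) argument order mirrors the (odd, even) register pairing in the
// store macro above; check the half ordering on your target.
static void store_f32x4x2_as_f16x8(ggml_fp16_t *p,
                                   vector float lo, vector float hi) {
    vector unsigned short h = vec_pack_to_short_fp32(hi, lo);
    vec_xst(h, 0, p);
}
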
@@ -776,7 +758,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
    ggml_float sumf = 0.0;
-#if defined(GGML_SIMD) || defined(__POWER9_VECTOR__)
+#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));
    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -841,7 +823,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
}
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
-#if defined(GGML_SIMD) || defined(__POWER9_VECTOR__)
+#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));
    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

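Note (not part of the patch): with GGML_SIMD now defined for POWER9, ggml_vec_dot_f32 and ggml_vec_dot_f16 go through the generic GGML_F32_STEP / GGML_F32_EPR loop built from the VSX macros added above. A minimal standalone sketch of the pattern those macros expand to, assuming gcc on ppc64le with -mcpu=power9; dot_f32_vsx and the local F32_* names are illustrative, not ggml symbols.

// Standalone illustration, not ggml code.
#include <altivec.h>
#include <stddef.h>

#define F32_EPR  4                   // floats per VSX register (GGML_F32_EPR)
#define F32_STEP 32                  // floats per unrolled step (GGML_F32_STEP)
#define F32_ARR  (F32_STEP/F32_EPR)  // accumulators per step (GGML_F32_ARR)

static float dot_f32_vsx(const float *x, const float *y, size_t n) {
    vector float sum[F32_ARR];
    for (int j = 0; j < F32_ARR; ++j) {
        sum[j] = vec_splats(0.0f);                            // GGML_F32x4_SET1(0)
    }

    const size_t np = n & ~(size_t)(F32_STEP - 1);            // full SIMD portion
    for (size_t i = 0; i < np; i += F32_STEP) {
        for (int j = 0; j < F32_ARR; ++j) {
            vector float ax = vec_xl(0, x + i + j*F32_EPR);   // GGML_F32x4_LOAD
            vector float ay = vec_xl(0, y + i + j*F32_EPR);
            sum[j] = vec_madd(ax, ay, sum[j]);                // GGML_F32x4_FMA
        }
    }

    // GGML_F32x4_REDUCE: fold the eight accumulators, then sum the four lanes.
    sum[0] = vec_add(sum[0], sum[1]);
    sum[2] = vec_add(sum[2], sum[3]);
    sum[4] = vec_add(sum[4], sum[5]);
    sum[6] = vec_add(sum[6], sum[7]);
    sum[0] = vec_add(sum[0], sum[2]);
    sum[4] = vec_add(sum[4], sum[6]);
    sum[0] = vec_add(sum[0], sum[4]);
    float res = vec_extract(sum[0], 0) + vec_extract(sum[0], 1)
              + vec_extract(sum[0], 2) + vec_extract(sum[0], 3);

    for (size_t i = np; i < n; ++i) {                         // scalar leftovers
        res += x[i]*y[i];
    }
    return res;
}

Keeping eight independent accumulators (GGML_F32_STEP / GGML_F32_EPR) lets the fused multiply-adds pipeline instead of serializing on a single register, which is the same reason the other SIMD paths in ggml use a STEP larger than EPR.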