@@ -124,13 +124,8 @@ typedef double ggml_float;
 //
 #include <arm_neon.h>
 
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
 
 #define GGML_FP16_TO_FP32(x) (x)
 #define GGML_FP32_TO_FP16(x) (x)
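On this path the conversion genuinely is the identity: ggml_fp16_t is typedef'd to the hardware __fp16 type on ARM NEON, so the compiler emits the convert instructions wherever a half value meets a float. A minimal standalone illustration (my sketch, not part of the patch; needs an AArch64 compiler):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
        __fp16 h = 3.140625f;  // exactly representable in fp16
        float  f = h;          // implicit fp16 -> fp32, no helper needed
        __fp16 r = f;          // implicit fp32 -> fp16
        printf("%f %f\n", (double) f, (double) r);
        return 0;
    }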
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #endif
 
 #ifdef __F16C__
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}
 
-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
 #else
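For context, _cvtsh_ss and _cvtss_sh are the scalar F16C intrinsics; the second argument of _cvtss_sh selects the rounding mode, with 0 meaning round-to-nearest-even. A standalone sketch of what the macros expand to (illustrative only; compile with -mf16c):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        unsigned short h = _cvtss_sh(1.5f, 0); // fp32 -> fp16 bit pattern
        float          f = _cvtsh_ss(h);       // fp16 bit pattern -> fp32
        printf("bits=0x%04x value=%f\n", h, (double) f);
        return 0;
    }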
@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
     return fp32.as_bits;
 }
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
     const uint32_t w = (uint32_t) h << 16;
     const uint32_t sign = w & UINT32_C(0x80000000);
     const uint32_t two_w = w + w;
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
     return fp32_from_bits(result);
 }
 
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
     const float scale_to_inf = 0x1.0p+112f;
     const float scale_to_zero = 0x1.0p-110f;
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #endif // __F16C__
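A useful sanity check for the portable bit-twiddling path: every non-NaN fp16 bit pattern should survive fp16 -> fp32 -> fp16 unchanged, since the fp16 -> fp32 direction is exact. A hedged test sketch, assuming the two ggml_compute_* helpers from this patch are in scope:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            const uint16_t h = (uint16_t) i;
            const float    f = ggml_compute_fp16_to_fp32(h);
            const uint16_t r = ggml_compute_fp32_to_fp16(f);
            if (r != h && f == f) { // f == f filters out NaNs, whose payload may change
                printf("round-trip mismatch at 0x%04x\n", h);
                return 1;
            }
        }
        puts("all 65536 patterns round-trip");
        return 0;
    }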
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
 // precomputed exp table for f16 (128 KB)
 static ggml_fp16_t table_exp_f16[1 << 16];
 
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
 //
 // timing
 //
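The new table_f32_f16 trades 256 KB of memory for constant-time conversion: there are only 65536 possible fp16 bit patterns, so every result can be precomputed once at startup. A self-contained sketch of the idea (illustrative names, not the ggml API; the scalar converter here is my own, any correct one works):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static float table[1 << 16]; // 256 KB, indexed by the raw fp16 bits

    // Scalar fp16 -> fp32; handles normals, subnormals, zeros, inf and NaN.
    static float fp16_bits_to_f32(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t man  = h & 0x3FF;
        uint32_t bits;
        if (exp == 0) {
            if (man == 0) {
                bits = sign;                                  // +/- zero
            } else {                                          // subnormal: renormalize
                exp = 127 - 15 + 1;
                while ((man & 0x400) == 0) { man <<= 1; --exp; }
                man &= 0x3FF;
                bits = sign | (exp << 23) | (man << 13);
            }
        } else if (exp == 31) {
            bits = sign | 0x7F800000u | (man << 13);          // inf / NaN
        } else {
            bits = sign | ((exp - 15 + 127) << 23) | (man << 13);
        }
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main(void) {
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            table[i] = fp16_bits_to_f32((uint16_t) i);        // one-time cost
        }
        // After initialization, a conversion is a single indexed load:
        printf("%f\n", (double) table[0x3C00]);               // 0x3C00 == 1.0
        return 0;
    }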
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_MUL             GGML_F16x4_MUL
 #define GGML_F16_VEC_REDUCE          GGML_F16x4_REDUCE
 
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD     _mm_add_ps
+#define GGML_F32x4_MUL     _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);     \
+    }                                              \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);     \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));      \
+}
+// TODO: is this optimal?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
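To see how these macros get consumed: GGML_F32_EPR floats fit in one register, GGML_F32_STEP floats are processed per unrolled iteration, and GGML_F32_ARR (= STEP/EPR = 8) independent accumulators are folded pairwise by the REDUCE macro before the final two horizontal adds. A simplified standalone sketch of that pattern in plain SSE3 (my illustration of the shape of the loop, not the exact ggml code; compile with -msse3):

    #include <pmmintrin.h> // SSE3, provides _mm_hadd_ps

    static float dot_f32(const float * a, const float * b, int n) {
        enum { EPR = 4, ARR = 8, STEP = EPR * ARR }; // mirrors GGML_F32_EPR/ARR/STEP

        __m128 sum[ARR];
        for (int k = 0; k < ARR; ++k) {
            sum[k] = _mm_setzero_ps();
        }

        int i = 0;
        for (; i + STEP <= n; i += STEP) {
            for (int k = 0; k < ARR; ++k) {
                const __m128 x = _mm_loadu_ps(a + i + k*EPR);
                const __m128 y = _mm_loadu_ps(b + i + k*EPR);
                sum[k] = _mm_add_ps(sum[k], _mm_mul_ps(x, y)); // non-FMA fallback
            }
        }

        // pairwise tree reduction, as in GGML_F32x4_REDUCE
        for (int k = 0; k < ARR/2; ++k) sum[2*k] = _mm_add_ps(sum[2*k], sum[2*k + 1]);
        for (int k = 0; k < ARR/4; ++k) sum[4*k] = _mm_add_ps(sum[4*k], sum[4*k + 2]);
        sum[0] = _mm_add_ps(sum[0], sum[4]);

        const __m128 t = _mm_hadd_ps(sum[0], sum[0]);
        float res = _mm_cvtss_f32(_mm_hadd_ps(t, t));

        for (; i < n; ++i) {
            res += a[i]*b[i]; // scalar tail for n not divisible by STEP
        }
        return res;
    }

The multiple accumulators exist to hide the latency of the adds; the non-FMA GGML_F32x4_FMA fallback computes the same value apart from one extra rounding step per multiply-add.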
@@ -1269,7 +1381,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
-        // initialize GELU and EXP tables
+        // initialize GELU, EXP and F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -1277,7 +1389,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             for (int i = 0; i < (1 << 16); ++i) {
                 uint16_t ui = i;
                 memcpy(&ii, &ui, sizeof(ii));
-                const float f = GGML_FP16_TO_FP32(ii);
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
                 table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));
             }
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
 #endif
 }
 
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;
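Downstream code can report or branch on the new query at runtime; a hedged usage sketch (assumes ggml_cpu_has_sse3 is declared in ggml.h next to the existing ggml_cpu_has_* functions, as the others are):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("SSE3 = %d\n", ggml_cpu_has_sse3()); // 1 if built with -msse3
        printf("VSX  = %d\n", ggml_cpu_has_vsx());
        return 0;
    }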