diff --git a/tests/test-ggml.cpp b/tests/test-ggml.cpp
new file mode 100755
index 0000000..550a1a8
--- /dev/null
+++ b/tests/test-ggml.cpp
@@ -0,0 +1,1166 @@
+// ggml SIMD/AVX test
+// based on https://gist.github.com/belltailjp/4653695
+// belltailjp/simd.cpp
+//
+// https://github.com/ggerganov/whisper.cpp
+//
+// gcc -O3 -std=c++11 -mavx -mf16c -o test-ggml test-ggml.cpp -lstdc++ -lm
+// gcc -O3 -std=c++11 -mavx -mf16c -mavx2 -mfma -o test-ggml test-ggml.cpp -lstdc++ -lm
+// gcc -Ofast -std=c++11 -mavx -mf16c -o test-ggml test-ggml.cpp -lstdc++ -lm
+// gcc -Ofast -std=c++11 -mavx -mf16c -mavx2 -mfma -o test-ggml test-ggml.cpp -lstdc++ -lm
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <cfloat>
+#include <string>
+#include <chrono>
+#include <random>
+#include <algorithm>
+
+#include <stdint.h>
+#include <immintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// from ggml.h
+typedef uint16_t ggml_fp16_t;
+
+// from ggml.c
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+typedef double ggml_float;
+
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32 = { w };
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32 = { f };
+    return fp32.as_bits;
+}
+
+inline float ggml_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+inline ggml_fp16_t ggml_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ?
UINT16_C(0x7E00) : nonsign); +} + + +#ifdef __F16C__ +inline float ggml_fp16_to_fp32_f16c(ggml_fp16_t h) { + return _cvtsh_ss(h); +} +inline ggml_fp16_t ggml_fp32_to_fp16_f16c(float f) { + return _cvtss_sh(f, 0); +} +#endif + + +//inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +//inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +//inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +//inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +//inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +#ifdef __AVX2__ +void ggml_vec_add_f32_avx2 (const int n, float * z, const float * x, const float * y) +{ + // AVX 256-bit + const int n32 = (n & ~31); + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_add_ps(x0, y0); + y1 = _mm256_add_ps(x1, y1); + y2 = _mm256_add_ps(x2, y2); + y3 = _mm256_add_ps(x3, y3); + + _mm256_storeu_ps(z + i + 0, y0); + _mm256_storeu_ps(z + i + 8, y1); + _mm256_storeu_ps(z + i + 16, y2); + _mm256_storeu_ps(z + i + 24, y3); + } + // leftovers + for (int i = n32; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} +#endif + +void ggml_vec_add_f32_avx (const int n, float * z, const float * x, const float * y) +{ + // AVX 256-bit + const int n32 = (n & ~31); + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_add_ps(x0, y0); + y1 = _mm256_add_ps(x1, y1); + y2 = _mm256_add_ps(x2, y2); + y3 = _mm256_add_ps(x3, y3); + + _mm256_storeu_ps(z + i + 0, y0); + _mm256_storeu_ps(z + i + 8, y1); + _mm256_storeu_ps(z + i + 16, y2); + _mm256_storeu_ps(z + i + 24, y3); + } + // leftovers + for (int i = n32; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +void ggml_vec_add_f32_normal (const int n, float * z, const float * x, const float * y) +{ + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +//inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +//inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +//inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +//inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +//inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +//inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 
-x[i]; } +//inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +//inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + + +// inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) +#ifdef __AVX2__ +void ggml_vec_dot_f32_avx2(const int n, float *s, const float *x, const float *y) +{ + ggml_float sumf = 0.0; + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + sum0 = _mm256_fmadd_ps(x0, y0, sum0); + sum1 = _mm256_fmadd_ps(x1, y1, sum1); + sum2 = _mm256_fmadd_ps(x2, y2, sum2); + sum3 = _mm256_fmadd_ps(x3, y3, sum3); + } + + sum0 = _mm256_add_ps(sum0, sum1); + sum2 = _mm256_add_ps(sum2, sum3); + sum0 = _mm256_add_ps(sum0, sum2); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + sumf += x[i]*y[i]; + } + *s = sumf; +} +#endif + +void ggml_vec_dot_f32_avx(const int n, float *s, const float *x, const float *y) +{ + ggml_float sumf = 0.0; + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3); + } + + sum0 = _mm256_add_ps(sum0, sum1); + sum2 = _mm256_add_ps(sum2, sum3); + sum0 = _mm256_add_ps(sum0, sum2); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + sumf += x[i]*y[i]; + } + *s = sumf; +} + +void ggml_vec_dot_f32_normal(const int n, float *s, const float *x, const float *y) +{ + ggml_float sumf = 0.0; + // scalar + for (int i = 0; i < n; ++i) { + sumf += x[i]*y[i]; + } + *s = sumf; +} + + +// inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) +#ifdef __AVX2__ +void ggml_vec_dot_f16_avx2(const int n, float *s, ggml_fp16_t *x, ggml_fp16_t *y) +{ + ggml_float sumf = 
0.0; + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + sum0 = _mm256_fmadd_ps(x0, y0, sum0); + sum1 = _mm256_fmadd_ps(x1, y1, sum1); + sum2 = _mm256_fmadd_ps(x2, y2, sum2); + sum3 = _mm256_fmadd_ps(x3, y3, sum3); + } + + const __m256 sum01 = _mm256_add_ps(sum0, sum1); + const __m256 sum23 = _mm256_add_ps(sum2, sum3); + const __m256 sum0123 = _mm256_add_ps(sum01, sum23); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + //GGML_ASSERT(false); + sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]); + } + *s = sumf; +} +#endif + +void ggml_vec_dot_f16_avx(const int n, float *s, ggml_fp16_t *x, ggml_fp16_t *y) +{ + ggml_float sumf = 0.0; + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3); + } + + const __m256 sum01 = _mm256_add_ps(sum0, sum1); + const __m256 sum23 = _mm256_add_ps(sum2, sum3); + const __m256 sum0123 = _mm256_add_ps(sum01, sum23); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + //GGML_ASSERT(false); + sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]); + } + *s = sumf; +} + +void ggml_vec_dot_f16_normal(const int n, float *s, ggml_fp16_t *x, ggml_fp16_t *y) +{ + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]); + } + *s = sumf; +} + + +// inline static void ggml_vec_mad_f32(const int n, float * restrict y, const 
float * restrict x, const float v) +#ifdef __AVX2__ +void ggml_vec_mad_f32_avx2(const int n, float *y, const float *x, const float v) +{ + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v4 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_fmadd_ps(x0, v4, y0); + y1 = _mm256_fmadd_ps(x1, v4, y1); + y2 = _mm256_fmadd_ps(x2, v4, y2); + y3 = _mm256_fmadd_ps(x3, v4, y3); + + _mm256_storeu_ps(y + i + 0, y0); + _mm256_storeu_ps(y + i + 8, y1); + _mm256_storeu_ps(y + i + 16, y2); + _mm256_storeu_ps(y + i + 24, y3); + } + + // leftovers + for (int i = n32; i < n; ++i) { + y[i] += x[i]*v; + } +} +#endif + +void ggml_vec_mad_f32_avx(const int n, float *y, const float *x, const float v) +{ + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v4 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0); + y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1); + y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2); + y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3); + + _mm256_storeu_ps(y + i + 0, y0); + _mm256_storeu_ps(y + i + 8, y1); + _mm256_storeu_ps(y + i + 16, y2); + _mm256_storeu_ps(y + i + 24, y3); + } + + // leftovers + for (int i = n32; i < n; ++i) { + y[i] += x[i]*v; + } +} + +void ggml_vec_mad_f32_normal(const int n, float *y, const float *x, const float v) +{ + // scalar + for (int i = 0; i < n; ++i) { + y[i] += x[i]*v; + } +} + + +// inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) +#ifdef __AVX2__ +void ggml_vec_mad_f16_avx2(const int n, ggml_fp16_t *y, ggml_fp16_t *x, const float v) +{ + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v8 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_fmadd_ps(x0, v8, y0); + y1 = _mm256_fmadd_ps(x1, v8, y1); + y2 = _mm256_fmadd_ps(x2, v8, y2); + y3 = _mm256_fmadd_ps(x3, v8, y3); + + _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0)); + _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0)); + _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0)); + _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0)); + } + + // leftovers + for (int i = n32; i < n; ++i) { + GGML_ASSERT(false); + y[i] = 
ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v); + } +} +#endif + +void ggml_vec_mad_f16_avx(const int n, ggml_fp16_t *y, ggml_fp16_t *x, const float v) +{ + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v8 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_add_ps(_mm256_mul_ps(x0, v8), y0); + y1 = _mm256_add_ps(_mm256_mul_ps(x1, v8), y1); + y2 = _mm256_add_ps(_mm256_mul_ps(x2, v8), y2); + y3 = _mm256_add_ps(_mm256_mul_ps(x3, v8), y3); + + _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0)); + _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0)); + _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0)); + _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0)); + } + + // leftovers + for (int i = n32; i < n; ++i) { + //GGML_ASSERT(false); + y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v); + } +} + +void ggml_vec_mad_f16_normal(const int n, ggml_fp16_t *y, ggml_fp16_t *x, const float v) +{ + for (int i = 0; i < n; ++i) { + y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v); + } +} + + +//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +#ifdef __AVX__ +inline static void ggml_vec_scale_f32_avx(const int n, float * y, const float v) +{ + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v4 = _mm256_set1_ps(v); + + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_mul_ps(y0, v4); + y1 = _mm256_mul_ps(y1, v4); + y2 = _mm256_mul_ps(y2, v4); + y3 = _mm256_mul_ps(y3, v4); + + _mm256_storeu_ps(y + i + 0, y0); + _mm256_storeu_ps(y + i + 8, y1); + _mm256_storeu_ps(y + i + 16, y2); + _mm256_storeu_ps(y + i + 24, y3); + } + + // leftovers + for (int i = n32; i < n; ++i) { + y[i] *= v; + } +} +#endif + +inline static void ggml_vec_scale_f32_normal(const int n, float * y, const float v) +{ + for (int i = 0; i < n; ++i) y[i] *= v; +} + +//inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrt(*s); } +//inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +//inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrt(x[i]); } +//inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +//inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? 
-1.f : 0.f); }
+//inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+//inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+
+//inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
+//inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
+
+// NOTE: main() below calls ggml_vec_scale_f32_avx2() when __AVX2__ is defined, but only
+// the AVX variant is defined above. Scaling uses no FMA, so this added wrapper (an
+// assumption, not taken from upstream ggml) simply reuses the AVX implementation.
+#ifdef __AVX2__
+inline static void ggml_vec_scale_f32_avx2(const int n, float * y, const float v)
+{
+    ggml_vec_scale_f32_avx(n, y, v);
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+// stop watch class
+class stopwatch {
+public:
+    std::chrono::system_clock::time_point start;
+    std::chrono::system_clock::time_point now;
+
+    stopwatch()
+    {
+        start = std::chrono::system_clock::now();
+    }
+
+    // elapsed time since construction, in milliseconds
+    double elapsed_ms()
+    {
+        now = std::chrono::system_clock::now();
+        double elapsed = static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(now - start).count() / 1000.0);
+        return elapsed;
+    }
+};
+
+
+// copy from z to Z
+void copy(const int n, float *Z, const float *z)
+{
+    for (int i = 0; i < n; ++i)
+    {
+        Z[i] = z[i];
+    }
+}
+
+// converted copy from z to Z (FP32 -> FP16)
+void copy_fp32_to_fp16(const int n, ggml_fp16_t *Z, const float *z)
+{
+    for (int i = 0; i < n; ++i)
+    {
+        Z[i] = ggml_fp32_to_fp16(z[i]);
+    }
+}
+
+// run t() repeatedly for ~ms milliseconds and return the average time per call in msec
+template <typename T>
+double calc_for_ggml(T t, unsigned ms)
+{
+    stopwatch sw;
+    int cnt = 0;
+    volatile float sum = 0; // prevent the loop from being optimized away
+    while (sw.elapsed_ms() < ms)
+    {
+        try
+        {
+            t();
+            ++cnt;
+        }
+        catch (const char *s)
+        {
+            printf("catch: %s\n", s);
+            return 0;
+        }
+    }
+#if 1
+    return sw.elapsed_ms() / cnt;
+#else
+    return cnt;
+#endif
+}
+
+// compare: run both versions and report elements that differ
+template <typename T1, typename T2>
+bool compare(const char *func, T1 t1, T2 t2, const int n, const float *z, const float *w)
+{
+    t1();
+    t2();
+
+    int max_diff = 5;
+    int n_diff = 0;
+    for (int i = 0; i < n; ++i)
+    {
+        if (std::fabs(z[i] - w[i]) > FLT_EPSILON)
+        {
+            printf("%s: different: i:%d z:%lf, w:%lf\n", func, i, z[i], w[i]);
+            ++n_diff;
+            if (n_diff == max_diff)
+            {
+                break;
+            }
+        }
+    }
+
+    if (n_diff == 0)
+    {
+        printf("%s: same:%lf\n", func, z[0]);
+        return true;
+    }
+
+    return false;
+}
+
+template <typename T1, typename T2>
+bool compare(const char *func, T1 t1, T2 t2, const int n, const ggml_fp16_t *Z, const ggml_fp16_t *W)
+{
+    t1();
+    t2();
+
+    int max_diff = 5;
+    int n_diff = 0;
+    for (int i = 0; i < n; ++i)
+    {
+        if (Z[i] != W[i])
+        {
+            printf("%s: different: i:%d Z:%u, W:%u\n", func, i, Z[i], W[i]);
+            ++n_diff;
+            if (n_diff == max_diff)
+            {
+                break;
+            }
+        }
+    }
+
+    if (n_diff == 0)
+    {
+        printf("%s: same:%u\n", func, Z[0]);
+        return true;
+    }
+
+    return false;
+}
+
+
+int main(int argc, char** argv)
+{
+    const unsigned len_begin = 8;
+    const unsigned len_end   = 512 * 1024;
+    const unsigned len_fact  = 2;
+    //const unsigned run_ms = 250;
+    const unsigned run_ms = 500;
+
+    // options
+    int verbose = 0;
+    long noalign = 0; // do not align data to 32-byte addresses
+    int nocomp = 0;   // skip comparing results
+    int noperf = 0;   // skip the performance test
+
+    if (argc > 1)
+    {
+        for (int i = 1; i < argc; i++)
+        {
+            printf("arg:%d %s\n", i, argv[i]);
+            if (strcmp(argv[i], "-h") == 0)
+            {
+                printf("test-ggml: test performance and compare results of the ggml AVX/AVX2 code\n");
+                printf("usage: test-ggml [-h][-v][-na|--noalign][-l|--useload][-np|--noperf][-nc|--nocompare]\n");
+                printf(" -h ... this message\n");
+                printf(" -v ... verbose message\n");
+                printf(" -na|--noalign ... use data that is not 32-byte aligned\n");
+                printf(" -np|--noperf ... no performance test\n");
+                printf(" -nc|--nocompare ... no comparison of normal results with AVX results\n");
+                return 0;
+            }
+            if (strcmp(argv[i], "-v") == 0)
+            {
+                verbose++;
+            }
+            if ((strcmp(argv[i], "-na") == 0) || (strcmp(argv[i], "--noalign") == 0))
+            {
+                noalign = 1;
+            }
+            if ((strcmp(argv[i], "-np") == 0) || (strcmp(argv[i], "--noperf") == 0))
+            {
+                noperf = 1;
+            }
+            if ((strcmp(argv[i], "-nc") == 0) || (strcmp(argv[i], "--nocompare") == 0))
+            {
+                nocomp = 1;
+            }
+        }
+    }
+
+    // random input data
+    std::mt19937 rng;
+    std::uniform_real_distribution<> dst(-1, 1);
+
+    // label
+    printf("# noalign:%s\n", noalign ? "Y" : "N");
+    if (!noperf)
+    {
+#ifdef __AVX2__
+        printf("func\tlen\tnormal\tavx\tavx2\tnormal/avx\tnormal/avx2\n");
+#else
+        printf("func\tlen\tnormal\tavx\tnormal/avx\n");
+#endif
+    }
+
+    for (unsigned len = len_begin; len <= len_end; len *= len_fact)
+    {
+        // float buffers (over-allocated so they can be aligned manually below)
+        float *p1 = new __attribute__((aligned(32))) float[len + 9];
+        float *p2 = new __attribute__((aligned(32))) float[len + 9];
+        float *p3 = new __attribute__((aligned(32))) float[len + 9];
+        float *p4 = new __attribute__((aligned(32))) float[len + 9];
+        float *x = p1;
+        float *y = p2;
+        float *z = p3;
+        float *w = p4; // for comparison with the normal result
+        while (reinterpret_cast<uintptr_t>(x) % 32) ++x;
+        while (reinterpret_cast<uintptr_t>(y) % 32) ++y;
+        while (reinterpret_cast<uintptr_t>(z) % 32) ++z;
+        while (reinterpret_cast<uintptr_t>(w) % 32) ++w;
+        // unaligned
+        if (noalign != 0)
+        {
+            x = reinterpret_cast<float *>(reinterpret_cast<uintptr_t>(x) + 1);
+            y = reinterpret_cast<float *>(reinterpret_cast<uintptr_t>(y) + 1);
+            z = reinterpret_cast<float *>(reinterpret_cast<uintptr_t>(z) + 1);
+            w = reinterpret_cast<float *>(reinterpret_cast<uintptr_t>(w) + 1);
+        }
+        if (verbose > 0) printf("# float x:%p y:%p z:%p\n", x, y, z);
+
+        std::generate(x, x + len, [&rng, &dst](){ return dst(rng); });
+        std::generate(y, y + len, [&rng, &dst](){ return dst(rng); });
+
+
+        // fp16 buffers
+        ggml_fp16_t *p5 = new __attribute__((aligned(32))) ggml_fp16_t[len + 9];
+        ggml_fp16_t *p6 = new __attribute__((aligned(32))) ggml_fp16_t[len + 9];
+        ggml_fp16_t *p7 = new __attribute__((aligned(32))) ggml_fp16_t[len + 9];
+        ggml_fp16_t *p8 = new __attribute__((aligned(32))) ggml_fp16_t[len + 9];
+        ggml_fp16_t *X = p5;
+        ggml_fp16_t *Y = p6;
+        ggml_fp16_t *Z = p7;
+        ggml_fp16_t *W = p8; // for comparison with the normal result
+        while (reinterpret_cast<uintptr_t>(X) % 32) ++X;
+        while (reinterpret_cast<uintptr_t>(Y) % 32) ++Y;
+        while (reinterpret_cast<uintptr_t>(Z) % 32) ++Z;
+        while (reinterpret_cast<uintptr_t>(W) % 32) ++W;
+        // unaligned
+        if (noalign != 0)
+        {
+            X = reinterpret_cast<ggml_fp16_t *>(reinterpret_cast<uintptr_t>(X) + 1);
+            Y = reinterpret_cast<ggml_fp16_t *>(reinterpret_cast<uintptr_t>(Y) + 1);
+            Z = reinterpret_cast<ggml_fp16_t *>(reinterpret_cast<uintptr_t>(Z) + 1);
+            W = reinterpret_cast<ggml_fp16_t *>(reinterpret_cast<uintptr_t>(W) + 1);
+        }
+        if (verbose > 0) printf("# ggml_fp16_t X:%p Y:%p Z:%p\n", X, Y, Z);
+
+        copy_fp32_to_fp16(len, X, x);
+        copy_fp32_to_fp16(len, Y, y);
+
+        float v = y[0];
+        float s;
+        float S;
+        int n = len;
+
+        double tnormal = 0;
+        double tavx = 0;
+#ifdef __F16C__
+        double tf16c = 0;
+#endif
+#ifdef __AVX2__
+        double tavx2 = 0;
+#endif
+
+        std::string func;
+
+        // inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y)
+        func = "ggml_vec_dot_f32";
+        if (nocomp == 0)
+        {
+            compare(func.c_str(), [n, &S, x, y](){ return ggml_vec_dot_f32_normal(n, &S, x, y); }, [n, &s, x, y](){ return ggml_vec_dot_f32_avx(n, &s, x, y); }, 1, &s, &S);
+#ifdef __AVX2__
+            compare(func.c_str(), [n, &S, x, y](){ return ggml_vec_dot_f32_normal(n, &S, x, y); }, [n, &s, x, y](){ return ggml_vec_dot_f32_avx2(n, &s, x, y); }, 1, &s, &S);
+#endif
+        }
+        if (noperf == 0)
+        {
+            if (verbose > 0) printf("# start calc normal\n");
+            tnormal = calc_for_ggml([n, &S, x, y](){ return ggml_vec_dot_f32_normal(n, &S, x, y); }, run_ms);
+            if (verbose > 0) printf("# start calc avx\n");
+            tavx    = calc_for_ggml([n, &s, x, y](){ return ggml_vec_dot_f32_avx   (n, &s, x, y); }, run_ms);
+#ifdef __AVX2__
+            if (verbose > 0) printf("# start calc avx2\n");
+            tavx2   = calc_for_ggml([n, &s, x, y](){ return ggml_vec_dot_f32_avx2  (n, &s, x, y); }, run_ms);
+#endif
+#ifdef __AVX2__
+            printf("%s\t%d\t%lf\t%lf\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tavx2, tnormal/tavx, tnormal/tavx2);
+#else
+            printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tnormal/tavx);
+#endif
+        }
+
+        // inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y)
+        func = "ggml_vec_dot_f16";
+        if (nocomp == 0)
+        {
+            compare(func.c_str(), [n, &S, X, Y](){ return ggml_vec_dot_f16_normal(n, &S, X, Y); }, [n, &s, X, Y](){ return ggml_vec_dot_f16_avx(n, &s, X, Y); }, 1, &s, &S);
+#ifdef __AVX2__
+            compare(func.c_str(), [n, &S, X, Y](){ return ggml_vec_dot_f16_normal(n, &S, X, Y); }, [n, &s, X, Y](){ return ggml_vec_dot_f16_avx2(n, &s, X, Y); }, 1, &s, &S);
+#endif
+        }
+        if (noperf == 0)
+        {
+            if (verbose > 0) printf("# start calc normal\n");
+            tnormal = calc_for_ggml([n, &S, X, Y](){ return ggml_vec_dot_f16_normal(n, &S, X, Y); }, run_ms);
+            if (verbose > 0) printf("# start calc avx\n");
+            tavx    = calc_for_ggml([n, &s, X, Y](){ return ggml_vec_dot_f16_avx   (n, &s, X, Y); }, run_ms);
+#ifdef __AVX2__
+            if (verbose > 0) printf("# start calc avx2\n");
+            tavx2   = calc_for_ggml([n, &s, X, Y](){ return ggml_vec_dot_f16_avx2  (n, &s, X, Y); }, run_ms);
+#endif
+#ifdef __AVX2__
+            printf("%s\t%d\t%lf\t%lf\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tavx2, tnormal/tavx, tnormal/tavx2);
+#else
+            printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tnormal/tavx);
+#endif
+        }
+
+        // inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v)
+        func = "ggml_vec_mad_f32";
+        copy(n, w, y);
+        copy(n, z, y);
+        if (nocomp == 0)
+        {
+            compare(func.c_str(), [n, w, x, v](){ return ggml_vec_mad_f32_normal(n, w, x, v); }, [n, z, x, v](){ return ggml_vec_mad_f32_avx(n, z, x, v); }, n, z, w);
+#ifdef __AVX2__
+            copy(n, w, y);
+            copy(n, z, y);
+            compare(func.c_str(), [n, w, x, v](){ return ggml_vec_mad_f32_normal(n, w, x, v); }, [n, z, x, v](){ return ggml_vec_mad_f32_avx2(n, z, x, v); }, n, z, w);
+#endif
+        }
+        if (noperf == 0)
+        {
+            if (verbose > 0) printf("# start calc normal\n");
+            tnormal = calc_for_ggml([n, w, x, v](){ return ggml_vec_mad_f32_normal(n, w, x, v); }, run_ms);
+            if (verbose > 0) printf("# start calc avx\n");
+            tavx    = calc_for_ggml([n, z, x, v](){ return ggml_vec_mad_f32_avx   (n, z, x, v); }, run_ms);
+#ifdef __AVX2__
+            if (verbose > 0) printf("# start calc avx2\n");
+            tavx2   = calc_for_ggml([n, z, x, v](){ return ggml_vec_mad_f32_avx2  (n, z, x, v); }, run_ms);
+#endif
+#ifdef __AVX2__
+            printf("%s\t%d\t%lf\t%lf\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tavx2, tnormal/tavx, tnormal/tavx2);
+#else
+            printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tnormal/tavx);
+#endif
+        }
+
+        // inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v)
+        func = "ggml_vec_mad_f16";
+        copy_fp32_to_fp16(n, W, y);
+        copy_fp32_to_fp16(n, Z, y);
+        if (nocomp == 0)
{ + compare(func.c_str(), [n, W, X, v](){ return ggml_vec_mad_f16_normal(n, W, X, v); }, [n, Z, X, v](){ return ggml_vec_mad_f16_avx(n, Z, X, v); }, n, Z, W); +#ifdef __AVX2__ + copy_fp32_to_fp16(n, W, y); + copy_fp32_to_fp16(n, Z, y); + compare(func.c_str(), [n, W, X, v](){ return ggml_vec_mad_f16_normal(n, W, X, v); }, [n, Z, X, v](){ return ggml_vec_mad_f16_avx2(n, Z, X, v); }, n, Z, W); +#endif + } + if (noperf == 0) + { + if (verbose > 0) printf("# start calc normal\n"); + //copy_fp32_to_fp16(n, W, y); + tnormal = calc_for_ggml([n, W, X, v](){ return ggml_vec_mad_f16_normal(n, W, X, v); }, run_ms); + if (verbose > 0) printf("# start calc avx\n"); + //copy_fp32_to_fp16(n, Z, y); + tavx = calc_for_ggml([n, Z, X, v](){ return ggml_vec_mad_f16_avx (n, Z, X, v); }, run_ms); +#ifdef __AVX2__ + if (verbose > 0) printf("# start calc avx2\n"); + //copy_fp32_to_fp16(n, Z, y); + tavx2 = calc_for_ggml([n, Z, X, v](){ return ggml_vec_mad_f16_avx2 (n, Z, X, v); }, run_ms); +#endif +#ifdef __AVX2__ + printf("%s\t%d\t%lf\t%lf\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tavx2, tnormal/tavx, tnormal/tavx2); +#else + printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tnormal/tavx); +#endif + } + + //inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } + func = "ggml_vec_add_f32"; + if (nocomp == 0) + { + compare(func.c_str(), [n, w, x, y](){ return ggml_vec_add_f32_normal(n, w, x, y); }, [n, z, x, y](){ return ggml_vec_add_f32_avx(n, z, x, y); }, n, z, w); +#ifdef __AVX2__ + compare(func.c_str(), [n, w, x, y](){ return ggml_vec_add_f32_normal(n, w, x, y); }, [n, z, x, y](){ return ggml_vec_add_f32_avx2(n, z, x, y); }, n, z, w); +#endif + } + if (noperf == 0) + { + if (verbose > 0) printf("# start calc normal\n"); + tnormal = calc_for_ggml([n, w, x, y](){ return ggml_vec_add_f32_normal(n, w, x, y); }, run_ms); + if (verbose > 0) printf("# start calc avx\n"); + tavx = calc_for_ggml([n, z, x, y](){ return ggml_vec_add_f32_avx (n, z, x, y); }, run_ms); +#ifdef __AVX2__ + if (verbose > 0) printf("# start calc avx2\n"); + tavx2 = calc_for_ggml([n, z, x, y](){ return ggml_vec_add_f32_avx2 (n, z, x, y); }, run_ms); +#endif +#ifdef __AVX2__ + printf("%s\t%d\t%lf\t%lf\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tavx2, tnormal/tavx, tnormal/tavx2); +#else + printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tnormal/tavx); +#endif + } + + //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } + func = "ggml_vec_scale_f32"; + copy(n, w, y); + copy(n, z, y); + if (nocomp == 0) + { + compare(func.c_str(), [n, w, v](){ return ggml_vec_scale_f32_normal(n, w, v); }, [n, z, v](){ return ggml_vec_scale_f32_avx(n, z, v); }, n, w, z); +#ifdef __AVX2__ + copy(n, w, y); + copy(n, z, y); + compare(func.c_str(), [n, w, v](){ return ggml_vec_scale_f32_normal(n, w, v); }, [n, z, v](){ return ggml_vec_scale_f32_avx2(n, z, v); }, n, w, z); +#endif + } + if (noperf == 0) + { + if (verbose > 0) printf("# start calc normal\n"); + tnormal = calc_for_ggml([n, w, v](){ return ggml_vec_scale_f32_normal(n, w, v); }, run_ms); + if (verbose > 0) printf("# start calc avx\n"); + tavx = calc_for_ggml([n, z, v](){ return ggml_vec_scale_f32_avx (n, z, v); }, run_ms); +#ifdef __AVX2__ + if (verbose > 0) printf("# start calc avx2\n"); + tavx2 = calc_for_ggml([n, z, v](){ return ggml_vec_scale_f32_avx2 (n, z, v); }, run_ms); +#endif +#ifdef __AVX2__ 
+ printf("%s\t%d\t%lf\t%lf\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tavx2, tnormal/tavx, tnormal/tavx2); +#else + printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tavx, tnormal/tavx); +#endif + } + + + //inline float ggml_fp16_to_fp32(ggml_fp16_t h) + //inline ggml_fp16_t ggml_fp32_to_fp16(float f) + func = "ggml_fp16_to_fp32"; + if (nocomp == 0) + { +#ifdef __F16C__ + compare(func.c_str(), [n, w, X](){ for (int i = 0; i < n; ++i) w[i] = ggml_fp16_to_fp32(X[i]); }, [n, z, X](){ for (int i = 0; i < n; ++i) z[i] = ggml_fp16_to_fp32_f16c(X[i]); }, n, z, w); +#endif + } + if (noperf == 0) + { + if (verbose > 0) printf("# start calc normal\n"); + tnormal = calc_for_ggml([n, w, X](){ for (int i = 0; i < n; ++i) w[i] = ggml_fp16_to_fp32(X[i]); }, run_ms); +#ifdef __F16C__ + if (verbose > 0) printf("# start calc f16c\n"); + tf16c = calc_for_ggml([n, z, X](){ for (int i = 0; i < n; ++i) z[i] = ggml_fp16_to_fp32_f16c(X[i]); }, run_ms); +#endif +#ifdef __F16C__ + printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tf16c, tnormal/tf16c); +#else + printf("%s\t%d\t%lf\n", func.c_str(), n, tnormal); +#endif + } + + //inline float ggml_fp16_to_fp32(ggml_fp16_t h) + //inline ggml_fp16_t ggml_fp32_to_fp16(float f) + func = "ggml_fp32_to_fp16"; + if (nocomp == 0) + { +#ifdef __F16C__ + compare(func.c_str(), [n, W, x](){ for (int i = 0; i < n; ++i) W[i] = ggml_fp32_to_fp16(x[i]); }, [n, Z, x](){ for (int i = 0; i < n; ++i) Z[i] = ggml_fp32_to_fp16_f16c(x[i]); }, n, Z, W); +#endif + } + if (noperf == 0) + { + if (verbose > 0) printf("# start calc normal\n"); + tnormal = calc_for_ggml([n, W, x](){ for (int i = 0; i < n; ++i) W[i] = ggml_fp32_to_fp16(x[i]); }, run_ms); +#ifdef __F16C__ + if (verbose > 0) printf("# start calc f16c\n"); + tf16c = calc_for_ggml([n, Z, x](){ for (int i = 0; i < n; ++i) Z[i] = ggml_fp32_to_fp16_f16c(x[i]); }, run_ms); +#endif +#ifdef __F16C__ + printf("%s\t%d\t%lf\t%lf\t%lf\n", func.c_str(), n, tnormal, tf16c, tnormal/tf16c); +#else + printf("%s\t%d\t%lf\n", func.c_str(), n, tnormal); +#endif + } + + delete[] p1; + delete[] p2; + delete[] p3; + delete[] p4; + delete[] p5; + delete[] p6; + delete[] p7; + delete[] p8; + } + + return 0; +} +