diff --git a/tests/test-mul-mat1.c b/tests/test-mul-mat1.c
index 9a84416..305accf 100644
--- a/tests/test-mul-mat1.c
+++ b/tests/test-mul-mat1.c
@@ -26,7 +26,7 @@ uint64_t get_time_us() {
 // naive implementation
 //
 
-void mul_mat_vec_f32_0(
+void mul_mat_f32_0(
     const float * restrict src0, // M x K
     const float * restrict src1, // N x K (transposed)
     float * dst,
@@ -42,7 +42,7 @@ void mul_mat_vec_f32_0(
     }
 }
 
-void mul_mat_vec_f16_0(
+void mul_mat_f16_0(
     const __fp16 * src0,
     const __fp16 * src1,
     float * dst,
@@ -108,7 +108,7 @@ void mul_mat_vec_f16_0(
 }
 
 // blocking with block size 32
-void mul_mat_vec_f16_1(
+void mul_mat_f16_1(
     const __fp16 * src0,
     const __fp16 * src1,
     float * dst,
@@ -180,7 +180,7 @@ void mul_mat_vec_f16_1(
 
 }
 
-void mul_mat_vec_f8_0(
+void mul_mat_f8_0(
     const uint8_t * src0,
     const uint8_t * src1,
     float * dst,
@@ -258,7 +258,7 @@ int main(int argc, const char ** argv) {
         method = atoi(argv[1]);
     }
 
-    const int nIter = 10000;
+    const int nIter = 1;
 
     const clock_t start = clock();
     const uint64_t start_us = get_time_us();
@@ -267,19 +267,19 @@ int main(int argc, const char ** argv) {
     double sum = 0.0f;
     for (int i = 0; i < nIter; i++) {
         if (method == 0) {
-            mul_mat_vec_f32_0(src0, src1, dst, M, N, K);
+            mul_mat_f32_0(src0, src1, dst, M, N, K);
         }
 
         if (method == 1) {
-            mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
+            mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
         }
 
         if (method == 2) {
-            mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
+            mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
         }
 
         if (method == 3) {
-            mul_mat_vec_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
+            mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
         }
 
         if (method == 4) {
diff --git a/tests/test-mul-mat2.c b/tests/test-mul-mat2.c
index bfc82cc..bb7dd8d 100644
--- a/tests/test-mul-mat2.c
+++ b/tests/test-mul-mat2.c
@@ -27,9 +27,21 @@ const int N = 1536;
 const int K = 1280;
 
 const int QK = 64;
-const int QB = 7;
+#define QB 7
 
-#define gq_t uint64_t
+//#define GGML_GQ_USE_FP16_SCALE
+
+#if defined(GGML_GQ_USE_FP16_SCALE)
+#define gq_scale_t ggml_fp16_t
+#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
+#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
+#else
+#define gq_scale_t float
+#define GGML_FP32_TO_GQ(x) (x)
+#define GGML_GQ_TO_FP32(x) (x)
+#endif
+
+#define gq_quant_t uint64_t
 #define gq_t_bits 64
 
 uint64_t get_time_us() {
@@ -42,7 +54,7 @@ uint64_t get_time_us() {
 // naive implementation
 //
 
-void mul_mat_vec_f32_naive(
+void mul_mat_f32_naive(
     const float * restrict src0, // M x K
     const float * restrict src1, // N x K (transposed)
     float * dst,
@@ -65,7 +77,7 @@ void quantize_1(const float * src, void * dst, int n, int k) {
     char * p0 = dst;
 
-    gq_t pp[QB];
+    gq_quant_t pp[QB];
 
     for (int j = 0; j < n; j++) {
         for (int i = 0; i < k/QK; i++) {
@@ -120,19 +132,19 @@ void quantize_1(const float * src, void * dst, int n, int k) {
                     const uint8_t q = (v - min)*id;
 
                     for (int b = 0; b < QB; b++) {
-                        pp[b] |= q & (1 << b) ? (1LL << l) : 0;
+                        pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
                     }
                 }
 
                 for (int b = 0; b < QB; b++) {
-                    memcpy(p0, &pp[b], sizeof(gq_t)); p0 += sizeof(gq_t);
+                    memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t);
                 }
             }
         }
     }
 }
 
-void mul_mat_vec_gq_1(
+void mul_mat_gq_1(
     const void * src0,
     const void * src1,
     float * dst,
@@ -145,15 +157,15 @@ void mul_mat_vec_gq_1(
     float s0[QB + 1];
     float s1[QB + 1];
 
-    gq_t m0[QB + 1];
-    gq_t m1[QB + 1];
+    gq_quant_t m0[QB + 1];
+    gq_quant_t m1[QB + 1];
 
     for (int ir0 = 0; ir0 < m; ir0++) {
         for (int ir1 = 0; ir1 < n; ir1++) {
            float sumf = 0.0;
 
-            const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK));
-            const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK));
+            const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
+            const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
 
            for (int i = 0; i < kp/QK; i++) {
                float min0, d0;
@@ -177,13 +189,13 @@ void mul_mat_vec_gq_1(
                    s1[b + 1] = d1*(1 << b);
                }
 
-                m0[0] = -1LL;
-                m1[0] = -1LL;
+                m0[0] = -1ULL;
+                m1[0] = -1ULL;
 
                for (int s = 0; s < QK/gq_t_bits; ++s) {
                    for (int b = 0; b < QB; b++) {
-                        memcpy(&m0[b + 1], pp0, sizeof(gq_t)); pp0 += sizeof(gq_t);
-                        memcpy(&m1[b + 1], pp1, sizeof(gq_t)); pp1 += sizeof(gq_t);
+                        memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t);
+                        memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t);
                    }
 
                    for (int q0 = 0; q0 < QB + 1; q0++) {
@@ -205,147 +217,175 @@
 // method 2
 //
 
-void quantize_2(const float * src, void * dst, int n, int k) {
-    char * p0 = dst;
+static inline int quantize_2_blocks_per_row(int k) {
+    return k/QK;
+}
 
-    gq_t pp[QB];
+static inline int quantize_2_quants_per_block() {
+    return QK/gq_t_bits;
+}
 
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < k/QK; i++) {
-            float min = FLT_MAX;
-            float max = -FLT_MAX;
+static inline int quantize_2_row_size(int k) {
+    const int nb = quantize_2_blocks_per_row(k);
+    const int nq = quantize_2_quants_per_block();
 
-            // find min/max
-#ifdef __ARM_NEON
-            {
-                float32x4_t minv = vdupq_n_f32(FLT_MAX);
-                float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
+    return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
+}
 
-                for (int l = 0; l < QK; l += 4) {
-                    float32x4_t v = vld1q_f32(src + j*k + i*QK + l);
-                    minv = vminq_f32(minv, v);
-                    maxv = vmaxq_f32(maxv, v);
-                }
+void quantize_2_row(const float * restrict src, void * restrict dst, int k) {
+    assert(k % QK == 0);
 
-                float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
-                float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
+    const int nb = quantize_2_blocks_per_row(k);
+    const int nq = quantize_2_quants_per_block();
 
-                min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
-                max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
+    gq_scale_t * restrict pm = (gq_scale_t *) (dst);
+    gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
+    gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
 
-                //printf("SIMD min/max: %f %f\n", min, max);
-            }
-#else
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j*k + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
+    gq_quant_t pp[QB];
 
-                //printf("NORM min/max: %f %f\n", min, max);
-            }
-#endif
+    for (int i = 0; i < nb; i++) {
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
 
-            const float d = (max - min) / ((1 << QB) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            memcpy(p0, &min, sizeof(float)); p0 += sizeof(float);
-            memcpy(p0, &d, sizeof(float)); p0 += sizeof(float);
+        for (int l = 0; l < QK; l++) {
+            const float v = src[i*QK + l];
+            if (v < min) min = v;
+            if (v > max) max = v;
+        }
 
-            //printf("min/max/d/id: %f %f %f %f\n", min, max, d, id);
+        const float d = (max - min) / ((1 << QB) - 1);
+        const float id = d ? 1.0/d : 0.0;
 
-            for (int s = 0; s < QK/gq_t_bits; ++s) {
-                memset(pp, 0, sizeof(pp));
+        pm[i] = GGML_FP32_TO_GQ(min);
+        pd[i] = GGML_FP32_TO_GQ(d);
 
-                for (int l = 0; l < gq_t_bits; l++) {
-                    const float v = src[j*k + i*QK + s*gq_t_bits + l];
-                    const uint8_t q = (v - min)*id;
+        for (int s = 0; s < nq; ++s) {
+            memset(pp, 0, sizeof(pp));
 
-                    for (int b = 0; b < QB; b++) {
-                        pp[b] |= q & (1 << b) ? (1LL << l) : 0;
-                    }
-                }
+            for (int l = 0; l < gq_t_bits; l++) {
+                const float v = src[i*QK + s*gq_t_bits + l];
+                const uint8_t q = (v - min)*id;
 
                 for (int b = 0; b < QB; b++) {
-                    memcpy(p0, &pp[b], sizeof(gq_t)); p0 += sizeof(gq_t);
+                    pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
                 }
             }
+
+            for (int b = 0; b < QB; b++) {
+                pb[i*nq*QB + s*QB + b] = pp[b];
+            }
         }
     }
 }
 
-void mul_mat_vec_gq_2(
-    const void * src0,
-    const void * src1,
-    float * dst,
-    int m, int n, int k) {
-    const int kp = k & ~(gq_t_bits - 1);
-
-    const char * restrict p0 = src0;
-    const char * restrict p1 = src1;
+// reimplementation of quantize_2 using quantize_2_row
+void quantize_2(const float * restrict src, char * restrict dst, int n, int k) {
+    assert(k % QK == 0);
 
-    float s0[QB + 1];
-    float s1[QB + 1];
+    for (int j = 0; j < n; j++) {
+        quantize_2_row(src + j*k, dst, k);
+        dst = (char *) dst + quantize_2_row_size(k);
+    }
+}
 
-    gq_t m0[QB + 1];
-    gq_t m1[QB + 1];
+void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
+    float sumf[(QB + 1)*(QB + 1)];
+    memset(sumf, 0, sizeof(sumf));
 
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            float sumf = 0.0;
+    const int nb = quantize_2_blocks_per_row(n);
+    const int nq = quantize_2_quants_per_block();
 
-            const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK));
-            const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK));
+    const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
+    const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
 
-            for (int i = 0; i < kp/QK; i++) {
-                float min0, d0;
-                memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float);
-                memcpy(&d0, pp0, sizeof(float)); pp0 += sizeof(float);
+    const gq_scale_t * restrict pd0 = pm0 + nb;
+    const gq_scale_t * restrict pd1 = pm1 + nb;
 
-                float min1, d1;
-                memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float);
-                memcpy(&d1, pp1, sizeof(float)); pp1 += sizeof(float);
-
-                //printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1);
+    const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
+    const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
 
 #if 1
-                // >>> General case for any QB
+    float s0[QB + 1];
+    float s1[QB + 1];
 
-                s0[0] = min0;
-                s1[0] = min1;
+    for (int i = 0; i < nb; i++) {
+        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
+        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
 
-                for (int b = 0; b < QB; b++) {
-                    s0[b + 1] = d0*(1 << b);
-                    s1[b + 1] = d1*(1 << b);
-                }
+        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
+        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
 
-                m0[0] = -1LL;
-                m1[0] = -1LL;
+        s0[0] = m0;
+        s1[0] = m1;
 
-                for (int s = 0; s < QK/gq_t_bits; ++s) {
-                    for (int b = 0; b < QB; b++) {
-                        memcpy(&m0[b + 1], pp0, sizeof(gq_t)); pp0 += sizeof(gq_t);
-                        memcpy(&m1[b + 1], pp1, sizeof(gq_t)); pp1 += sizeof(gq_t);
-                    }
+        for (int b = 0; b < QB; b++) {
+            s0[b + 1] = d0*(1 << b);
+            s1[b + 1] = d1*(1 << b);
+        }
 
-                    for (int q0 = 0; q0 < QB + 1; q0++) {
-                        for (int q1 = 0; q1 < QB + 1; q1++) {
-                            sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]);
-                        }
-                    }
+        for (int s = 0; s < nq; ++s) {
+            for (int q0 = 0; q0 < QB + 1; q0++) {
+                const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL;
+                for (int q1 = 0; q1 < QB + 1; q1++) {
+                    const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL;
+                    sumf[q0*(QB + 1) + q1] += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1);
                 }
+            }
+        }
+    }
+#else
+    // SIMD-ify with the assumptions:
+    // - nb is a multiple of 4
+    // - gq_scale_t is float
+    // - gq_quant_t is uint64_t
+    // - QB == 7
+    assert(nb % 4 == 0);
+
+#ifdef __ARM_NEON
 #else
+    // TODO
 #endif
-            }
-            dst[ir0*n + ir1] = sumf;
+#endif
+
+    for (int q0 = 0; q0 < QB + 1; q0++) {
+        for (int q1 = 1; q1 < QB + 1; q1++) {
+            sumf[q0*(QB + 1)] += sumf[q0*(QB + 1) + q1];
         }
     }
+
+    *s = sumf[0];
+    for (int q0 = 1; q0 < QB + 1; q0++) {
+        *s += sumf[q0*(QB + 1)];
+    }
+}
+
+// use vec_dot_gq_2 to compute the dot product of two rows
+void mul_mat_gq_2(
+    const void * src0,
+    const void * src1, // transposed
+    float * dst,
+    int m, int n, int k) {
+    assert(k % QK == 0);
+
+    const int nb = quantize_2_blocks_per_row(k);
+    const int nq = quantize_2_quants_per_block();
+
+    for (int ir0 = 0; ir0 < m; ir0++) {
+        for (int ir1 = 0; ir1 < n; ir1++) {
+            vec_dot_gq_2(k, dst + ir1, src0, src1);
+            src1 = (const char *) src1 + quantize_2_row_size(k);
+        }
+        src0 = (const char *) src0 + quantize_2_row_size(k);
+        src1 = (const char *) src1 - n*quantize_2_row_size(k);
+
+        dst = (float *) dst + n;
+    }
 }
 
 int main(int argc, const char ** argv) {
-    assert(sizeof(gq_t)*8 == gq_t_bits);
+    assert(sizeof(gq_quant_t)*8 == gq_t_bits);
 
     float * src0 = (float *)malloc(sizeof(float)*M*K);
     float * src1 = (float *)malloc(sizeof(float)*N*K);
@@ -359,12 +399,11 @@ int main(int argc, const char ** argv) {
         src1[i] = rand() / (float)RAND_MAX;
     }
 
-    void * src0_gq = calloc(1, (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*M);
-    void * src1_gq = calloc(1, (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*N);
+    void * src0_gq = calloc(1, quantize_2_row_size(K)*M);
+    void * src1_gq = calloc(1, quantize_2_row_size(K)*N);
 
     const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K;
-    const size_t sizegq = (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*M +
-                          (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*N;
+    const size_t sizegq = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N;
 
     printf("compression: %f\n", (float)sizegq/sizef16);
 
@@ -400,15 +439,15 @@ int main(int argc, const char ** argv) {
     double sum = 0.0f;
     for (int i = 0; i < nIter; i++) {
         if (method == 0) {
-            mul_mat_vec_f32_naive(src0, src1, dst, M, N, K);
+            mul_mat_f32_naive(src0, src1, dst, M, N, K);
        }
 
         if (method == 1) {
-            mul_mat_vec_gq_1(src0_gq, src1_gq, dst, M, N, K);
+            mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K);
        }
 
         if (method == 2) {
-            mul_mat_vec_gq_1(src0_gq, src1_gq, dst, M, N, K);
+            mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K);
        }
    }