tests : some more quantization experiments

pull/15/head
Georgi Gerganov 1 year ago
parent e0abac1be7
commit 73a7916d30
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

@ -26,7 +26,7 @@ uint64_t get_time_us() {
// naive implementation // naive implementation
// //
void mul_mat_vec_f32_0( void mul_mat_f32_0(
const float * restrict src0, // M x K const float * restrict src0, // M x K
const float * restrict src1, // N x K (transposed) const float * restrict src1, // N x K (transposed)
float * dst, float * dst,
@ -42,7 +42,7 @@ void mul_mat_vec_f32_0(
} }
} }
void mul_mat_vec_f16_0( void mul_mat_f16_0(
const __fp16 * src0, const __fp16 * src0,
const __fp16 * src1, const __fp16 * src1,
float * dst, float * dst,
@ -108,7 +108,7 @@ void mul_mat_vec_f16_0(
} }
// blocking with block size 32 // blocking with block size 32
void mul_mat_vec_f16_1( void mul_mat_f16_1(
const __fp16 * src0, const __fp16 * src0,
const __fp16 * src1, const __fp16 * src1,
float * dst, float * dst,
@ -180,7 +180,7 @@ void mul_mat_vec_f16_1(
} }
void mul_mat_vec_f8_0( void mul_mat_f8_0(
const uint8_t * src0, const uint8_t * src0,
const uint8_t * src1, const uint8_t * src1,
float * dst, float * dst,
@ -258,7 +258,7 @@ int main(int argc, const char ** argv) {
method = atoi(argv[1]); method = atoi(argv[1]);
} }
const int nIter = 10000; const int nIter = 1;
const clock_t start = clock(); const clock_t start = clock();
const uint64_t start_us = get_time_us(); const uint64_t start_us = get_time_us();
@ -267,19 +267,19 @@ int main(int argc, const char ** argv) {
double sum = 0.0f; double sum = 0.0f;
for (int i = 0; i < nIter; i++) { for (int i = 0; i < nIter; i++) {
if (method == 0) { if (method == 0) {
mul_mat_vec_f32_0(src0, src1, dst, M, N, K); mul_mat_f32_0(src0, src1, dst, M, N, K);
} }
if (method == 1) { if (method == 1) {
mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, M, N, K); mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
} }
if (method == 2) { if (method == 2) {
mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, M, N, K); mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
} }
if (method == 3) { if (method == 3) {
mul_mat_vec_f8_0(src0_fp8, src1_fp8, dst, M, N, K); mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
} }
if (method == 4) { if (method == 4) {

@ -27,9 +27,21 @@ const int N = 1536;
const int K = 1280; const int K = 1280;
const int QK = 64; const int QK = 64;
const int QB = 7; #define QB 7
#define gq_t uint64_t //#define GGML_GQ_USE_FP16_SCALE
#if defined(GGML_GQ_USE_FP16_SCALE)
#define gq_scale_t ggml_fp16_t
#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
#else
#define gq_scale_t float
#define GGML_FP32_TO_GQ(x) (x)
#define GGML_GQ_TO_FP32(x) (x)
#endif
#define gq_quant_t uint64_t
#define gq_t_bits 64 #define gq_t_bits 64
uint64_t get_time_us() { uint64_t get_time_us() {
@ -42,7 +54,7 @@ uint64_t get_time_us() {
// naive implementation // naive implementation
// //
void mul_mat_vec_f32_naive( void mul_mat_f32_naive(
const float * restrict src0, // M x K const float * restrict src0, // M x K
const float * restrict src1, // N x K (transposed) const float * restrict src1, // N x K (transposed)
float * dst, float * dst,
@ -65,7 +77,7 @@ void mul_mat_vec_f32_naive(
void quantize_1(const float * src, void * dst, int n, int k) { void quantize_1(const float * src, void * dst, int n, int k) {
char * p0 = dst; char * p0 = dst;
gq_t pp[QB]; gq_quant_t pp[QB];
for (int j = 0; j < n; j++) { for (int j = 0; j < n; j++) {
for (int i = 0; i < k/QK; i++) { for (int i = 0; i < k/QK; i++) {
@ -120,19 +132,19 @@ void quantize_1(const float * src, void * dst, int n, int k) {
const uint8_t q = (v - min)*id; const uint8_t q = (v - min)*id;
for (int b = 0; b < QB; b++) { for (int b = 0; b < QB; b++) {
pp[b] |= q & (1 << b) ? (1LL << l) : 0; pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
} }
} }
for (int b = 0; b < QB; b++) { for (int b = 0; b < QB; b++) {
memcpy(p0, &pp[b], sizeof(gq_t)); p0 += sizeof(gq_t); memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t);
} }
} }
} }
} }
} }
void mul_mat_vec_gq_1( void mul_mat_gq_1(
const void * src0, const void * src0,
const void * src1, const void * src1,
float * dst, float * dst,
@ -145,15 +157,15 @@ void mul_mat_vec_gq_1(
float s0[QB + 1]; float s0[QB + 1];
float s1[QB + 1]; float s1[QB + 1];
gq_t m0[QB + 1]; gq_quant_t m0[QB + 1];
gq_t m1[QB + 1]; gq_quant_t m1[QB + 1];
for (int ir0 = 0; ir0 < m; ir0++) { for (int ir0 = 0; ir0 < m; ir0++) {
for (int ir1 = 0; ir1 < n; ir1++) { for (int ir1 = 0; ir1 < n; ir1++) {
float sumf = 0.0; float sumf = 0.0;
const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK)); const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK)); const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
for (int i = 0; i < kp/QK; i++) { for (int i = 0; i < kp/QK; i++) {
float min0, d0; float min0, d0;
@ -177,13 +189,13 @@ void mul_mat_vec_gq_1(
s1[b + 1] = d1*(1 << b); s1[b + 1] = d1*(1 << b);
} }
m0[0] = -1LL; m0[0] = -1ULL;
m1[0] = -1LL; m1[0] = -1ULL;
for (int s = 0; s < QK/gq_t_bits; ++s) { for (int s = 0; s < QK/gq_t_bits; ++s) {
for (int b = 0; b < QB; b++) { for (int b = 0; b < QB; b++) {
memcpy(&m0[b + 1], pp0, sizeof(gq_t)); pp0 += sizeof(gq_t); memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t);
memcpy(&m1[b + 1], pp1, sizeof(gq_t)); pp1 += sizeof(gq_t); memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t);
} }
for (int q0 = 0; q0 < QB + 1; q0++) { for (int q0 = 0; q0 < QB + 1; q0++) {
@ -205,147 +217,175 @@ void mul_mat_vec_gq_1(
// method 2 // method 2
// //
void quantize_2(const float * src, void * dst, int n, int k) { static inline int quantize_2_blocks_per_row(int k) {
char * p0 = dst; return k/QK;
}
gq_t pp[QB]; static inline int quantize_2_quants_per_block() {
return QK/gq_t_bits;
}
for (int j = 0; j < n; j++) { static inline int quantize_2_row_size(int k) {
for (int i = 0; i < k/QK; i++) { const int nb = quantize_2_blocks_per_row(k);
float min = FLT_MAX; const int nq = quantize_2_quants_per_block();
float max = -FLT_MAX;
// find min/max return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
#ifdef __ARM_NEON }
{
float32x4_t minv = vdupq_n_f32(FLT_MAX);
float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
for (int l = 0; l < QK; l += 4) { void quantize_2_row(const float * restrict src, void * restrict dst, int k) {
float32x4_t v = vld1q_f32(src + j*k + i*QK + l); assert(k % QK == 0);
minv = vminq_f32(minv, v);
maxv = vmaxq_f32(maxv, v);
}
float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv)); const int nb = quantize_2_blocks_per_row(k);
float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv)); const int nq = quantize_2_quants_per_block();
min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1)); gq_scale_t * restrict pm = (gq_scale_t *) (dst);
max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1)); gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
//printf("SIMD min/max: %f %f\n", min, max); gq_quant_t pp[QB];
}
#else
{
for (int l = 0; l < QK; l++) {
const float v = src[j*k + i*QK + l];
if (v < min) min = v;
if (v > max) max = v;
}
//printf("NORM min/max: %f %f\n", min, max); for (int i = 0; i < nb; i++) {
} float min = FLT_MAX;
#endif float max = -FLT_MAX;
const float d = (max - min) / ((1 << QB) - 1); for (int l = 0; l < QK; l++) {
const float id = d ? 1.0/d : 0.0; const float v = src[i*QK + l];
if (v < min) min = v;
memcpy(p0, &min, sizeof(float)); p0 += sizeof(float); if (v > max) max = v;
memcpy(p0, &d, sizeof(float)); p0 += sizeof(float); }
//printf("min/max/d/id: %f %f %f %f\n", min, max, d, id); const float d = (max - min) / ((1 << QB) - 1);
const float id = d ? 1.0/d : 0.0;
for (int s = 0; s < QK/gq_t_bits; ++s) { pm[i] = GGML_FP32_TO_GQ(min);
memset(pp, 0, sizeof(pp)); pd[i] = GGML_FP32_TO_GQ(d);
for (int l = 0; l < gq_t_bits; l++) { for (int s = 0; s < nq; ++s) {
const float v = src[j*k + i*QK + s*gq_t_bits + l]; memset(pp, 0, sizeof(pp));
const uint8_t q = (v - min)*id;
for (int b = 0; b < QB; b++) { for (int l = 0; l < gq_t_bits; l++) {
pp[b] |= q & (1 << b) ? (1LL << l) : 0; const float v = src[i*QK + s*gq_t_bits + l];
} const uint8_t q = (v - min)*id;
}
for (int b = 0; b < QB; b++) { for (int b = 0; b < QB; b++) {
memcpy(p0, &pp[b], sizeof(gq_t)); p0 += sizeof(gq_t); pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
} }
} }
for (int b = 0; b < QB; b++) {
pb[i*nq*QB + s*QB + b] = pp[b];
}
} }
} }
} }
void mul_mat_vec_gq_2( // reimplementation of quantize_2 using quantize_2_row
const void * src0, void quantize_2(const float * restrict src, char * restrict dst, int n, int k) {
const void * src1, assert(k % QK == 0);
float * dst,
int m, int n, int k) {
const int kp = k & ~(gq_t_bits - 1);
const char * restrict p0 = src0;
const char * restrict p1 = src1;
float s0[QB + 1]; for (int j = 0; j < n; j++) {
float s1[QB + 1]; quantize_2_row(src + j*k, dst, k);
dst = (char *) dst + quantize_2_row_size(k);
}
}
gq_t m0[QB + 1]; void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
gq_t m1[QB + 1]; float sumf[(QB + 1)*(QB + 1)];
memset(sumf, 0, sizeof(sumf));
for (int ir0 = 0; ir0 < m; ir0++) { const int nb = quantize_2_blocks_per_row(n);
for (int ir1 = 0; ir1 < n; ir1++) { const int nq = quantize_2_quants_per_block();
float sumf = 0.0;
const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK)); const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(k/QK)); const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
for (int i = 0; i < kp/QK; i++) { const gq_scale_t * restrict pd0 = pm0 + nb;
float min0, d0; const gq_scale_t * restrict pd1 = pm1 + nb;
memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float);
memcpy(&d0, pp0, sizeof(float)); pp0 += sizeof(float);
float min1, d1; const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float); const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
memcpy(&d1, pp1, sizeof(float)); pp1 += sizeof(float);
//printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1);
#if 1 #if 1
// >>> General case for any QB float s0[QB + 1];
float s1[QB + 1];
s0[0] = min0; for (int i = 0; i < nb; i++) {
s1[0] = min1; const float m0 = GGML_GQ_TO_FP32(pm0[i]);
const float d0 = GGML_GQ_TO_FP32(pd0[i]);
for (int b = 0; b < QB; b++) { const float m1 = GGML_GQ_TO_FP32(pm1[i]);
s0[b + 1] = d0*(1 << b); const float d1 = GGML_GQ_TO_FP32(pd1[i]);
s1[b + 1] = d1*(1 << b);
}
m0[0] = -1LL; s0[0] = m0;
m1[0] = -1LL; s1[0] = m1;
for (int s = 0; s < QK/gq_t_bits; ++s) { for (int b = 0; b < QB; b++) {
for (int b = 0; b < QB; b++) { s0[b + 1] = d0*(1 << b);
memcpy(&m0[b + 1], pp0, sizeof(gq_t)); pp0 += sizeof(gq_t); s1[b + 1] = d1*(1 << b);
memcpy(&m1[b + 1], pp1, sizeof(gq_t)); pp1 += sizeof(gq_t); }
}
for (int q0 = 0; q0 < QB + 1; q0++) { for (int s = 0; s < nq; ++s) {
for (int q1 = 0; q1 < QB + 1; q1++) { for (int q0 = 0; q0 < QB + 1; q0++) {
sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]); const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL;
} for (int q1 = 0; q1 < QB + 1; q1++) {
} const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL;
sumf[q0*(QB + 1) + q1] += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1);
} }
}
}
}
#else
// SIMD-ify with the assumptions:
// - nb is a multiple of 4
// - gq_scale_t is float
// - gq_quant_t is uint64_t
// - QB == 7
assert(nb % 4 == 0);
#ifdef __ARM_NEON
#else #else
// TODO
#endif #endif
}
dst[ir0*n + ir1] = sumf; #endif
for (int q0 = 0; q0 < QB + 1; q0++) {
for (int q1 = 1; q1 < QB + 1; q1++) {
sumf[q0*(QB + 1)] += sumf[q0*(QB + 1) + q1];
} }
} }
*s = sumf[0];
for (int q0 = 1; q0 < QB + 1; q0++) {
*s += sumf[q0*(QB + 1)];
}
}
// use vec_dot_gq_2 to compute the dot product of two rows
void mul_mat_gq_2(
const void * src0,
const void * src1, // transposed
float * dst,
int m, int n, int k) {
assert(k % QK == 0);
const int nb = quantize_2_blocks_per_row(k);
const int nq = quantize_2_quants_per_block();
for (int ir0 = 0; ir0 < m; ir0++) {
for (int ir1 = 0; ir1 < n; ir1++) {
vec_dot_gq_2(k, dst + ir1, src0, src1);
src1 = (const char *) src1 + quantize_2_row_size(k);
}
src0 = (const char *) src0 + quantize_2_row_size(k);
src1 = (const char *) src1 - n*quantize_2_row_size(k);
dst = (float *) dst + n;
}
} }
int main(int argc, const char ** argv) { int main(int argc, const char ** argv) {
assert(sizeof(gq_t)*8 == gq_t_bits); assert(sizeof(gq_quant_t)*8 == gq_t_bits);
float * src0 = (float *)malloc(sizeof(float)*M*K); float * src0 = (float *)malloc(sizeof(float)*M*K);
float * src1 = (float *)malloc(sizeof(float)*N*K); float * src1 = (float *)malloc(sizeof(float)*N*K);
@ -359,12 +399,11 @@ int main(int argc, const char ** argv) {
src1[i] = rand() / (float)RAND_MAX; src1[i] = rand() / (float)RAND_MAX;
} }
void * src0_gq = calloc(1, (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*M); void * src0_gq = calloc(1, quantize_2_row_size(K)*M);
void * src1_gq = calloc(1, (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*N); void * src1_gq = calloc(1, quantize_2_row_size(K)*N);
const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K; const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K;
const size_t sizegq = (2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*M + const size_t sizegq = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N;
(2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_t))*(K/QK)*N;
printf("compression: %f\n", (float)sizegq/sizef16); printf("compression: %f\n", (float)sizegq/sizef16);
@ -400,15 +439,15 @@ int main(int argc, const char ** argv) {
double sum = 0.0f; double sum = 0.0f;
for (int i = 0; i < nIter; i++) { for (int i = 0; i < nIter; i++) {
if (method == 0) { if (method == 0) {
mul_mat_vec_f32_naive(src0, src1, dst, M, N, K); mul_mat_f32_naive(src0, src1, dst, M, N, K);
} }
if (method == 1) { if (method == 1) {
mul_mat_vec_gq_1(src0_gq, src1_gq, dst, M, N, K); mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K);
} }
if (method == 2) { if (method == 2) {
mul_mat_vec_gq_1(src0_gq, src1_gq, dst, M, N, K); mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K);
} }
} }

Loading…
Cancel
Save