diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp
index c9cfddd..ee9b493 100644
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@@ -12,112 +12,9 @@
 #include <vector>
 #include <regex>
 
+// TODO: move somewhere else
 #define QK 32
 
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pd = (float *) (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0])*id;
-                    const float v1 = (src[j + i*QK + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pm = (float *) (pdst + (j/k)*row_size);
-        float * pd = (float *) (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0] - min)*id;
-                    const float v1 = (src[j + i*QK + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
     int32_t n_vocab = 50257;
@@ -235,6 +132,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         std::vector<ggml_fp16_t> data_f16;
         std::vector<float>       data_f32;
 
+        std::vector<int64_t> hist_all(1 << 4, 0); // counts of each 4-bit value, summed over all quantized tensors
+
         while (true) {
             int32_t n_dims;
             int32_t length;
@@ -319,15 +218,16 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                 work.resize(nelements); // for quantization
 
                 size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0); // counts of each 4-bit value for this tensor
 
                 switch (type) {
                     case GGML_TYPE_Q4_0:
                         {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                         } break;
                     case GGML_TYPE_Q4_1:
                         {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                         } break;
                     default:
                         {
@@ -339,7 +239,15 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                 fout.write(reinterpret_cast<char *>(work.data()), cur_size);
                 total_size_new += cur_size;
 
-                printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (size_t i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (size_t i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
             } else {
                 printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
                 fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -351,6 +259,19 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 
         printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
         printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (size_t i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", sum_all > 0 ? hist_all[i] / (float)sum_all : 0.0f); // nothing quantized -> print 0, not NaN
+            }
+            printf("\n");
+        }
     }
 
     finp.close();
diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp
index 8597739..ff54fea 100644
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@@ -12,112 +12,9 @@
 #include <vector>
 #include <regex>
 
+// TODO: move somewhere else
 #define QK 32
 
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pd = (float *) (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0])*id;
-                    const float v1 = (src[j + i*QK + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pm = (float *) (pdst + (j/k)*row_size);
-        float * pd = (float *) (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0] - min)*id;
-                    const float v1 = (src[j + i*QK + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
 // default hparams (GPT-J 6B)
 struct gptj_hparams {
     int32_t n_vocab = 50400;
@@ -238,6 +135,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         std::vector<ggml_fp16_t> data_f16;
         std::vector<float>       data_f32;
 
+        std::vector<int64_t> hist_all(1 << 4, 0); // counts of each 4-bit value, summed over all quantized tensors
+
         while (true) {
             int32_t n_dims;
             int32_t length;
@@ -321,15 +220,16 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 work.resize(nelements); // for quantization
 
                 size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0); // counts of each 4-bit value for this tensor
 
                 switch (type) {
                     case GGML_TYPE_Q4_0:
                         {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                         } break;
                     case GGML_TYPE_Q4_1:
                         {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                         } break;
                     default:
                         {
@@ -341,7 +241,15 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 fout.write(reinterpret_cast<char *>(work.data()), cur_size);
                 total_size_new += cur_size;
 
-                printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (size_t i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (size_t i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
             } else {
                 printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
                 fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -353,6 +261,19 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 
         printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
         printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (size_t i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", sum_all > 0 ? hist_all[i] / (float)sum_all : 0.0f); // nothing quantized -> print 0, not NaN
+            }
+            printf("\n");
+        }
     }
 
     finp.close();
diff --git a/examples/utils.cpp b/examples/utils.cpp
index 30057b7..402a1fd 100644
--- a/examples/utils.cpp
+++ b/examples/utils.cpp
@@ -328,3 +328,116 @@ gpt_vocab::id gpt_sample_top_k_top_p(
 
     return logits_id[idx].second;
 }
+
+size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+    // values are packed two per byte, so a block must hold an even number of them
+    assert(qk > 0 && qk % 2 == 0);
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*qk/2);
+
+    char * pdst = (char *) dst;
+
+    for (int j = 0; j < n; j += k) {
+        // row layout: nb scales (float), then the packed quants
+        float * pd = (float *) (pdst + (j/k)*row_size);
+        uint8_t * pb = (uint8_t *) (pd + nb);
+
+        for (int i = 0; i < nb; i++) {
+            float amax = 0.0f; // absolute max
+
+            {
+                for (int l = 0; l < qk; l++) {
+                    const float v = src[j + i*qk + l];
+                    amax = std::max(amax, fabsf(v));
+                }
+
+                const float d = amax / ((1 << 3) - 1);
+                const float id = d ? 1.0f/d : 0.0f;
+
+                pd[i] = d;
+
+                for (int l = 0; l < qk; l += 2) {
+                    const float v0 = (src[j + i*qk + l + 0])*id;
+                    const float v1 = (src[j + i*qk + l + 1])*id;
+
+                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
+                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
+
+                    assert(vi0 < 16);
+                    assert(vi1 < 16);
+
+                    hist[vi0]++;
+                    hist[vi1]++;
+
+                    // store the pair directly in the output: a scratch array sized
+                    // by the runtime qk would be a VLA, which is not standard C++
+                    pb[i*qk/2 + l/2] = vi0 | (vi1 << 4);
+                }
+            }
+        }
+    }
+
+    return (n/k)*row_size;
+}
+
+size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+    // values are packed two per byte, so a block must hold an even number of them
+    assert(qk > 0 && qk % 2 == 0);
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
+
+    char * pdst = (char *) dst;
+
+    for (int j = 0; j < n; j += k) {
+        // row layout: nb minimums (float), nb deltas (float), then the packed quants
+        float * pm = (float *) (pdst + (j/k)*row_size);
+        float * pd = (float *) (pm + nb);
+        uint8_t * pb = (uint8_t *) (pd + nb);
+
+        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
+
+        for (int i = 0; i < nb; i++) {
+            float min = std::numeric_limits<float>::max();
+            // lowest(), not min(): min() is the smallest positive float, which
+            // would be reported as the maximum of a block of negative values
+            float max = std::numeric_limits<float>::lowest();
+
+            {
+                for (int l = 0; l < qk; l++) {
+                    const float v = src[j + i*qk + l];
+                    if (v < min) min = v;
+                    if (v > max) max = v;
+                }
+
+                const float d = (max - min) / ((1 << 4) - 1);
+                const float id = d ? 1.0f/d : 0.0f;
+
+                pm[i] = min;
+                pd[i] = d;
+
+                for (int l = 0; l < qk; l += 2) {
+                    const float v0 = (src[j + i*qk + l + 0] - min)*id;
+                    const float v1 = (src[j + i*qk + l + 1] - min)*id;
+
+                    const uint8_t vi0 = round(v0);
+                    const uint8_t vi1 = round(v1);
+
+                    assert(vi0 < 16);
+                    assert(vi1 < 16);
+
+                    hist[vi0]++;
+                    hist[vi1]++;
+
+                    // store the pair directly in the output (no VLA scratch buffer)
+                    pb[i*qk/2 + l/2] = vi0 | (vi1 << 4);
+                }
+            }
+        }
+    }
+
+    return (n/k)*row_size;
+}
diff --git a/examples/utils.h b/examples/utils.h
index d091d3d..f7d0dbc 100644
--- a/examples/utils.h
+++ b/examples/utils.h
@@ -82,3 +82,21 @@ gpt_vocab::id gpt_sample_top_k_top_p(
         double temp,
         std::mt19937 & rng);
 
+//
+// Quantization
+//
+
+// Both functions treat src as n/k rows of k floats and quantize every row in
+// blocks of qk values to 4 bits per value (k must be a multiple of qk and qk
+// must be even). hist must point to 16 counters; the counter of each emitted
+// 4-bit value is incremented (the counters are not cleared). The return value
+// is the number of bytes written to dst.
+
+// Q4_0: each block stores d = amax/7 (amax = largest absolute value) and the
+// values as round(v/d) + 8. Row layout: k/qk floats (d), then k/2 bytes.
+size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+
+// Q4_1: each block stores its minimum and d = (max - min)/15 and the values as
+// round((v - min)/d). Row layout: k/qk floats (min), k/qk floats (d), then
+// k/2 bytes.
+size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp
index eacd4f0..8fc292f 100644
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@@ -12,112 +12,9 @@
 #include <vector>
 #include <regex>
 
+// TODO: move somewhere else
 #define QK 32
 
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pd = (float *) (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0])*id;
-                    const float v1 = (src[j + i*QK + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pm = (float *) (pdst + (j/k)*row_size);
-        float * pd = (float *) (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0] - min)*id;
-                    const float v1 = (src[j + i*QK + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
 // default hparams (Whisper tiny)
 struct whisper_hparams {
     int32_t n_vocab = 51864;
@@ -276,6 +173,8 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
         std::vector<ggml_fp16_t> data_f16;
         std::vector<float>       data_f32;
 
+        std::vector<int64_t> hist_all(1 << 4, 0); // counts of each 4-bit value, summed over all quantized tensors
+
         while (true) {
             int32_t n_dims;
             int32_t length;
@@ -363,15 +262,16 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
                 work.resize(nelements); // for quantization
 
                 size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0); // counts of each 4-bit value for this tensor
 
                 switch (type) {
                     case GGML_TYPE_Q4_0:
                         {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                         } break;
                     case GGML_TYPE_Q4_1:
                         {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                         } break;
                     default:
                         {
@@ -383,7 +283,15 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
                 fout.write(reinterpret_cast<char *>(work.data()), cur_size);
                 total_size_new += cur_size;
 
-                printf("size = %8.3f MB -> %8.3f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                printf("size = %8.3f MB -> %8.3f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (size_t i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (size_t i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
             } else {
                 printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
                 fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -395,6 +303,19 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
 
         printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
         printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (size_t i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", sum_all > 0 ? hist_all[i] / (float)sum_all : 0.0f); // nothing quantized -> print 0, not NaN
+            }
+            printf("\n");
+        }
     }
 
     finp.close();