utils : print quantization histograms

2 years ago · 3adf02e311
parent 05e7d26ba4
commit 3adf02e311
5 changed files with 200 additions and 321 deletions
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@ -12,112 +12,9 @@
 #include <vector>
 #include <regex>

+// TODO: move somewhere else
 #define QK 32

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float   * pd = (float *)   (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0])*id;
-                    const float v1 = (src[j + i*QK + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float   * pm = (float *)   (pdst + (j/k)*row_size);
-        float   * pd = (float *)   (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0] - min)*id;
-                    const float v1 = (src[j + i*QK + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
    int32_t n_vocab = 50257;
@ -235,6 +132,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
        std::vector<ggml_fp16_t> data_f16;
        std::vector<float>       data_f32;

+        std::vector<int64_t> hist_all(1 << 4, 0);
+
        while (true) {
            int32_t n_dims;
            int32_t length;
@ -319,15 +218,16 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                work.resize(nelements); // for quantization

                size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0);

                switch (type) {
                    case GGML_TYPE_Q4_0:
                        {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                        } break;
                    case GGML_TYPE_Q4_1:
                        {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                        } break;
                    default:
                        {
@ -339,7 +239,15 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                fout.write(reinterpret_cast<char *>(work.data()), cur_size);
                total_size_new += cur_size;

-                printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
            } else {
                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@ -351,6 +259,19 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam

        printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
        printf("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (int i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (int i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", hist_all[i] / (float)sum_all);
+            }
+            printf("\n");
+        }
    }

    finp.close();
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@ -12,112 +12,9 @@
 #include <vector>
 #include <regex>

+// TODO: move somewhere else
 #define QK 32

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float   * pd = (float *)   (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0])*id;
-                    const float v1 = (src[j + i*QK + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float   * pm = (float *)   (pdst + (j/k)*row_size);
-        float   * pd = (float *)   (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0] - min)*id;
-                    const float v1 = (src[j + i*QK + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
 // default hparams (GPT-J 6B)
 struct gptj_hparams {
    int32_t n_vocab = 50400;
@ -238,6 +135,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
        std::vector<ggml_fp16_t> data_f16;
        std::vector<float>       data_f32;

+        std::vector<int64_t> hist_all(1 << 4, 0);
+
        while (true) {
            int32_t n_dims;
            int32_t length;
@ -321,15 +220,16 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                work.resize(nelements); // for quantization

                size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0);

                switch (type) {
                    case GGML_TYPE_Q4_0:
                        {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                        } break;
                    case GGML_TYPE_Q4_1:
                        {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                        } break;
                    default:
                        {
@ -341,7 +241,15 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                fout.write(reinterpret_cast<char *>(work.data()), cur_size);
                total_size_new += cur_size;

-                printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
            } else {
                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@ -353,6 +261,19 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam

        printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
        printf("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (int i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (int i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", hist_all[i] / (float)sum_all);
+            }
+            printf("\n");
+        }
    }

    finp.close();
--- a/examples/utils.cpp
+++ b/examples/utils.cpp
@ -328,3 +328,113 @@ gpt_vocab::id gpt_sample_top_k_top_p(

    return logits_id[idx].second;
 }
+
+// Q4_0: quantize n floats from src, in rows of k values split into blocks of qk, into dst.
+// Each row is nb = k/qk float scales followed by nb*qk/2 bytes of packed 4-bit values;
+// hist must have 16 entries and is incremented per emitted value. Returns bytes written.
+size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+    assert(qk > 0 && qk % 2 == 0); // values are packed two per byte
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*qk/2);
+
+    char * pdst = (char *) dst;
+
+    for (int j = 0; j < n; j += k) {
+        float   * pd = (float *)   (pdst + (j/k)*row_size);
+        uint8_t * pb = (uint8_t *) (pd + nb);
+
+        for (int i = 0; i < nb; i++) {
+            const float * x = src + j + i*qk; // current block
+            float amax = 0.0f; // absolute max
+            for (int l = 0; l < qk; l++) {
+                amax = std::max(amax, fabsf(x[l]));
+            }
+
+            const float d  = amax / ((1 << 3) - 1);
+            const float id = d ? 1.0f/d : 0.0f;
+
+            pd[i] = d;
+
+            // write the packed bytes directly; the former `uint8_t pp[qk/2]` scratch
+            // buffer was a variable-length array, which is not valid ISO C++
+            uint8_t * q = pb + i*qk/2;
+            for (int l = 0; l < qk; l += 2) {
+                const float v0 = x[l + 0]*id;
+                const float v1 = x[l + 1]*id;
+
+                const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
+                const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
+
+                assert(vi0 < 16);
+                assert(vi1 < 16);
+
+                hist[vi0]++;
+                hist[vi1]++;
+
+                q[l/2] = vi0 | (vi1 << 4); // low nibble = even index, high nibble = odd index
+            }
+        }
+    }
+
+    return (n/k)*row_size;
+}
+
+// Q4_1: quantize n floats from src, in rows of k values split into blocks of qk, into dst.
+// Each row is nb = k/qk float minimums, nb float steps, then nb*qk/2 bytes of packed 4-bit
+// values; hist must have 16 entries and is incremented per emitted value. Returns bytes written.
+size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+    assert(qk > 0 && qk % 2 == 0); // values are packed two per byte
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
+
+    char * pdst = (char *) dst;
+
+    for (int j = 0; j < n; j += k) {
+        float   * pm = (float *)   (pdst + (j/k)*row_size);
+        float   * pd = (float *)   (pm + nb);
+        uint8_t * pb = (uint8_t *) (pd + nb);
+
+        for (int i = 0; i < nb; i++) {
+            const float * x = src + j + i*qk; // current block
+            float min = std::numeric_limits<float>::max();
+            // lowest(), not min(): min() is the smallest POSITIVE float, which would
+            // be reported as the maximum of a block containing only negative values
+            float max = std::numeric_limits<float>::lowest();
+            for (int l = 0; l < qk; l++) {
+                if (x[l] < min) min = x[l];
+                if (x[l] > max) max = x[l];
+            }
+
+            const float d  = (max - min) / ((1 << 4) - 1);
+            const float id = d ? 1.0f/d : 0.0f;
+
+            pm[i] = min;
+            pd[i] = d;
+
+            // write the packed bytes directly; the former `uint8_t pp[qk/2]` scratch
+            // buffer was a variable-length array, which is not valid ISO C++
+            uint8_t * q = pb + i*qk/2;
+            for (int l = 0; l < qk; l += 2) {
+                const float v0 = (x[l + 0] - min)*id;
+                const float v1 = (x[l + 1] - min)*id;
+
+                const uint8_t vi0 = round(v0);
+                const uint8_t vi1 = round(v1);
+
+                assert(vi0 < 16);
+                assert(vi1 < 16);
+
+                hist[vi0]++;
+                hist[vi1]++;
+
+                q[l/2] = vi0 | (vi1 << 4); // low nibble = even index, high nibble = odd index
+            }
+        }
+    }
+
+    return (n/k)*row_size;
+}
--- a/examples/utils.h
+++ b/examples/utils.h
@ -82,3 +82,9 @@ gpt_vocab::id gpt_sample_top_k_top_p(
        double temp,
        std::mt19937 & rng);

+//
+// Quantization: pack rows of k floats into 4-bit values in blocks of qk; hist[16] counts the emitted values
+//
+
+size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); // per-block scale d = amax/7, stores round(x/d) + 8; returns bytes written
+size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); // per-block min and step d = (max - min)/15, stores round((x - min)/d); returns bytes written
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@ -12,112 +12,9 @@
 #include <vector>
 #include <regex>

+// TODO: move somewhere else
 #define QK 32

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float   * pd = (float *)   (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0])*id;
-                    const float v1 = (src[j + i*QK + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k) {
-    const int nb = k / QK;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*QK/2);
-
-    assert(k % QK == 0);
-
-    uint8_t pp[QK/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float   * pm = (float *)   (pdst + (j/k)*row_size);
-        float   * pd = (float *)   (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < QK; l += 2) {
-                    const float v0 = (src[j + i*QK + l + 0] - min)*id;
-                    const float v1 = (src[j + i*QK + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*QK/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
 // default hparams (Whisper tiny)
 struct whisper_hparams {
    int32_t n_vocab       = 51864;
@ -276,6 +173,8 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
        std::vector<ggml_fp16_t> data_f16;
        std::vector<float>       data_f32;

+        std::vector<int64_t> hist_all(1 << 4, 0);
+
        while (true) {
            int32_t n_dims;
            int32_t length;
@ -363,15 +262,16 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
                work.resize(nelements); // for quantization

                size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0);

                switch (type) {
                    case GGML_TYPE_Q4_0:
                        {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                        } break;
                    case GGML_TYPE_Q4_1:
                        {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
                        } break;
                    default:
                        {
@ -383,7 +283,15 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
                fout.write(reinterpret_cast<char *>(work.data()), cur_size);
                total_size_new += cur_size;

-                printf("size = %8.3f MB -> %8.3f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                printf("size = %8.3f MB -> %8.3f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
            } else {
                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@ -395,6 +303,19 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f

        printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
        printf("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (int i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (int i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", hist_all[i] / (float)sum_all);
+            }
+            printf("\n");
+        }
    }

    finp.close();