From 2fcbd28143c6f69895dd71411fe83d36c0665f11 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 26 Feb 2023 19:47:36 +0200
Subject: [PATCH] gpt : support quantisation of f16 models files

---
 examples/gpt-2/quantize.cpp   | 45 +++++++++++++++++++++++-----------
 examples/gpt-j/quantize.cpp   | 46 +++++++++++++++++++++++------------
 examples/whisper/quantize.cpp | 16 ++++++------
 3 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp
index 08895e7..c9cfddd 100644
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@@ -229,9 +229,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -260,14 +263,6 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
-        if (ftype != 0) {
-            fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-            return false;
-        }
-
-        data.resize(nelements);
-        finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-
         // regexes of tensor names to be quantized
         const std::vector<std::string> k_names = {
             "model/wte",
@@ -286,7 +281,29 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         }
 
         if (quantize) {
+            if (ftype != 0 && ftype != 1) {
+                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                return false;
+            }
+
+            if (ftype == 1) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
             ftype = itype;
+        } else {
+            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -306,11 +323,11 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
@@ -324,9 +341,9 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                 printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
             } else {
-                printf("\n");
-                fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-                total_size_new += nelements * sizeof(float);
+                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+                total_size_new += data_u8.size();
             }
 
             total_size_org += nelements * sizeof(float);
diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp
index 2bad404..8597739 100644
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@@ -232,9 +232,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
    while (true) {
         int32_t n_dims;
         int32_t length;
@@ -263,14 +266,6 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
-        if (ftype != 0) {
-            fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-            return false;
-        }
-
-        data.resize(nelements);
-        finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-
         // regexes of tensor names to be quantized
         const std::vector<std::string> k_names = {
             ".*weight",
         };
@@ -282,14 +277,35 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 quantize = true;
                 break;
             }
-
         }
 
         // quantize only 2D tensors
         quantize &= (n_dims == 2);
 
         if (quantize) {
+            if (ftype != 0 && ftype != 1) {
+                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                return false;
+            }
+
+            if (ftype == 1) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
             ftype = itype;
+        } else {
+            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -309,11 +325,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
@@ -327,9 +343,9 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
             } else {
-                printf("\n");
-                fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-                total_size_new += nelements * sizeof(float);
+                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+                total_size_new += data_u8.size();
             }
 
             total_size_org += nelements * sizeof(float);
diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp
index 3752b67..eacd4f0 100644
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@@ -270,11 +270,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
-    std::vector<uint8_t> data_u8;
+    std::vector<uint8_t>     data_u8;
     std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
 
     while (true) {
         int32_t n_dims;
@@ -333,13 +333,13 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             if (ftype == 1) {
                 data_f16.resize(nelements);
                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
-                data.resize(nelements);
+                data_f32.resize(nelements);
                 for (int i = 0; i < nelements; ++i) {
-                    data[i] = ggml_fp16_to_fp32(data_f16[i]);
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                 }
             } else {
-                data.resize(nelements);
-                finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
             }
 
             ftype = itype;
@@ -367,11 +367,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
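
Note (not part of the original patch): the change is the same in all three examples. When a tensor is stored as f16 (ftype == 1), its data is now read and widened to f32 before ggml_quantize_q4_0 / ggml_quantize_q4_1 are called; tensors that are not quantized are copied through as raw bytes (bpe = 4 for f32, 2 for f16). Below is a minimal standalone sketch of that read-and-convert step, assuming only ggml_fp16_t and ggml_fp16_to_fp32 from ggml.h; the helper name read_tensor_as_f32 is hypothetical and not part of ggml.

    // Illustrative sketch of the pattern introduced by this patch:
    // read nelements values of one tensor from `fin` and return them as f32,
    // converting from f16 when ftype == 1.
    #include <cstdint>
    #include <fstream>
    #include <vector>

    #include "ggml.h"

    static bool read_tensor_as_f32(std::ifstream & fin, int32_t ftype, size_t nelements, std::vector<float> & data_f32) {
        if (ftype == 1) {
            // f16 on disk: read the raw half-precision values, then widen to f32
            std::vector<ggml_fp16_t> data_f16(nelements);
            fin.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
            data_f32.resize(nelements);
            for (size_t i = 0; i < nelements; ++i) {
                data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
            }
        } else if (ftype == 0) {
            // f32 on disk: read directly
            data_f32.resize(nelements);
            fin.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
        } else {
            // already-quantized types are not accepted as quantization input
            return false;
        }
        return fin.good();
    }

The resulting f32 buffer is what the quantization routines consume, which is why the patch renames the working buffer to data_f32 and routes both the f32 and f16 load paths into it.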