From 2fcbd28143c6f69895dd71411fe83d36c0665f11 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 26 Feb 2023 19:47:36 +0200
Subject: [PATCH] gpt : support quantisation of f16 models files

---
 examples/gpt-2/quantize.cpp   | 45 +++++++++++++++++++++++-----------
 examples/gpt-j/quantize.cpp   | 46 +++++++++++++++++++++++------------
 examples/whisper/quantize.cpp | 16 ++++++------
 3 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp
index 08895e7..c9cfddd 100644
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@@ -229,9 +229,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -260,14 +263,6 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
-        if (ftype != 0) {
-            fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-            return false;
-        }
-
-        data.resize(nelements);
-        finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-
         // regexes of tensor names to be quantized
         const std::vector<std::string> k_names = {
             "model/wte",
@@ -286,7 +281,29 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         }
 
         if (quantize) {
+            if (ftype != 0 && ftype != 1) {
+                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                return false;
+            }
+
+            if (ftype == 1) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
             ftype = itype;
+        } else {
+            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -306,11 +323,11 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
@@ -324,9 +341,9 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                 printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
             } else {
-                printf("\n");
-                fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-                total_size_new += nelements * sizeof(float);
+                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+                total_size_new += data_u8.size();
             }
 
             total_size_org += nelements * sizeof(float);
diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp
index 2bad404..8597739 100644
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@@ -232,9 +232,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
    while (true) {
         int32_t n_dims;
         int32_t length;
@@ -263,14 +266,6 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
-        if (ftype != 0) {
-            fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-            return false;
-        }
-
-        data.resize(nelements);
-        finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-
         // regexes of tensor names to be quantized
         const std::vector<std::string> k_names = {
             ".*weight",
         };
@@ -282,14 +277,35 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 quantize = true;
                 break;
             }
-
         }
 
         // quantize only 2D tensors
         quantize &= (n_dims == 2);
 
         if (quantize) {
+            if (ftype != 0 && ftype != 1) {
+                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                return false;
+            }
+
+            if (ftype == 1) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
             ftype = itype;
+        } else {
+            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -309,11 +325,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
@@ -327,9 +343,9 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
             } else {
-                printf("\n");
-                fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-                total_size_new += nelements * sizeof(float);
+                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+                total_size_new += data_u8.size();
             }
 
             total_size_org += nelements * sizeof(float);
diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp
index 3752b67..eacd4f0 100644
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@@ -270,11 +270,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
-    std::vector<uint8_t> data_u8;
+    std::vector<uint8_t>     data_u8;
     std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
 
     while (true) {
         int32_t n_dims;
@@ -333,13 +333,13 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             if (ftype == 1) {
                 data_f16.resize(nelements);
                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
-                data.resize(nelements);
+                data_f32.resize(nelements);
                 for (int i = 0; i < nelements; ++i) {
-                    data[i] = ggml_fp16_to_fp32(data_f16[i]);
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                 }
             } else {
-                data.resize(nelements);
-                finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
             }
 
             ftype = itype;
@@ -367,11 +367,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
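
Note (not part of the original patch): the change is the same in all three examples. When a tensor is stored as f16 (ftype == 1), its data is now read and widened to f32 before ggml_quantize_q4_0 / ggml_quantize_q4_1 are called; tensors that are not quantized are copied through as raw bytes (bpe = 4 for f32, 2 for f16). Below is a minimal standalone sketch of that read-and-convert step, assuming only ggml_fp16_t and ggml_fp16_to_fp32 from ggml.h; the helper name read_tensor_as_f32 is hypothetical and not part of ggml.

    // Illustrative sketch of the pattern introduced by this patch:
    // read nelements values of one tensor from `fin` and return them as f32,
    // converting from f16 when ftype == 1.
    #include <cstdint>
    #include <fstream>
    #include <vector>

    #include "ggml.h"

    static bool read_tensor_as_f32(std::ifstream & fin, int32_t ftype, size_t nelements, std::vector<float> & data_f32) {
        if (ftype == 1) {
            // f16 on disk: read the raw half-precision values, then widen to f32
            std::vector<ggml_fp16_t> data_f16(nelements);
            fin.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
            data_f32.resize(nelements);
            for (size_t i = 0; i < nelements; ++i) {
                data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
            }
        } else if (ftype == 0) {
            // f32 on disk: read directly
            data_f32.resize(nelements);
            fin.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
        } else {
            // already-quantized types are not accepted as quantization input
            return false;
        }
        return fin.good();
    }

The resulting f32 buffer is what the quantization routines consume, which is why the patch renames the working buffer to data_f32 and routes both the f32 and f16 load paths into it.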