gpt : support quantisation of f16 model files

Georgi Gerganov 2 years ago
parent 10356cdcdd
commit 2fcbd28143
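This commit teaches the gpt-2 and gpt-j quantization examples to accept f16 model files in addition to f32 ones, and brings the whisper quantizer in line with the same naming. The pattern is the same in all three files: each tensor is read according to its stored ftype (0 = f32, 1 = f16), f16 data is widened with ggml_fp16_to_fp32() before being handed to ggml_quantize_q4_0() / ggml_quantize_q4_1(), and tensors that are left unquantized are copied through as raw bytes of the appropriate element size.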

@@ -229,9 +229,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> data;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
while (true) {
int32_t n_dims;
int32_t length;
@@ -260,14 +263,6 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
if (ftype != 0) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
return false;
}
data.resize(nelements);
finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
// regexes of tensor names to be quantized
const std::vector<std::string> k_names = {
"model/wte",
@@ -286,7 +281,29 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
}
if (quantize) {
if (ftype != 0 && ftype != 1) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
return false;
}
if (ftype == 1) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ftype = itype;
} else {
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -306,11 +323,11 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
} break;
default:
{
@@ -324,9 +341,9 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
} else {
printf("\n");
fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
total_size_new += nelements * sizeof(float);
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
total_size_new += data_u8.size();
}
total_size_org += nelements * sizeof(float);
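As a standalone illustration of the conversion-then-quantize pattern added above, here is a minimal sketch. The helper name quantize_f16_tensor_q4_0 is invented for this example and does not exist in the repo; the 4-argument ggml_quantize_q4_0() call simply mirrors the usage in the diff.

    #include <cstdint>
    #include <vector>

    #include "ggml.h"

    // Hypothetical helper (not part of this commit) showing the pattern it adds:
    // widen an f16 tensor to f32, then quantize the f32 data to Q4_0.
    static size_t quantize_f16_tensor_q4_0(
            const std::vector<ggml_fp16_t> & src_f16,  // tensor data as stored on disk (ftype == 1)
            std::vector<uint8_t>           & dst,      // receives the quantized blocks
            int nelements,                             // total number of elements
            int ne0) {                                 // row length, ne[0] in the diff
        std::vector<float> src_f32(nelements);
        for (int i = 0; i < nelements; ++i) {
            src_f32[i] = ggml_fp16_to_fp32(src_f16[i]); // convert each half-precision value
        }

        // the f32 size is a safe upper bound for the quantized output
        dst.resize((size_t) nelements * sizeof(float));

        // returns the number of bytes written into dst, which the code above
        // accumulates into total_size_new
        return ggml_quantize_q4_0(src_f32.data(), dst.data(), nelements, ne0);
    }

Whether the work buffer is a float vector (as in the diff) or raw bytes is immaterial; only its byte capacity matters, since the quantizer writes packed blocks into it and reports how many bytes it produced.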

@@ -232,9 +232,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> data;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
while (true) {
int32_t n_dims;
int32_t length;
@@ -263,14 +266,6 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
if (ftype != 0) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
return false;
}
data.resize(nelements);
finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
// regexes of tensor names to be quantized
const std::vector<std::string> k_names = {
".*weight",
@@ -282,14 +277,35 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
quantize = true;
break;
}
}
// quantize only 2D tensors
quantize &= (n_dims == 2);
if (quantize) {
if (ftype != 0 && ftype != 1) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
return false;
}
if (ftype == 1) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ftype = itype;
} else {
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -309,11 +325,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
} break;
default:
{
@@ -327,9 +343,9 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
} else {
printf("\n");
fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
total_size_new += nelements * sizeof(float);
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
total_size_new += data_u8.size();
}
total_size_org += nelements * sizeof(float);
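The gptj_model_quantize changes mirror the gpt-2 ones. One detail worth isolating is the pass-through path for tensors that are not quantized: they are now copied verbatim, with the element size taken from the stored ftype instead of being assumed to be 4-byte floats. Below is a condensed sketch, stitched together from the read and write halves that sit in separate hunks above; the helper name copy_tensor_raw is made up for this example.

    #include <cstdint>
    #include <fstream>
    #include <vector>

    // Hypothetical helper condensing the pass-through path: tensors that are not
    // quantized are copied byte for byte, sized by the stored ftype.
    static size_t copy_tensor_raw(std::ifstream & finp, std::ofstream & fout,
                                  int nelements, int ftype) {
        const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); // f32 vs f16

        std::vector<uint8_t> data_u8((size_t) nelements * bpe);
        finp.read (reinterpret_cast<char *>(data_u8.data()), data_u8.size());
        fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());

        return data_u8.size(); // the caller adds this to total_size_new
    }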

@@ -270,11 +270,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> data;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
while (true) {
int32_t n_dims;
@@ -333,13 +333,13 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
if (ftype == 1) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data.resize(nelements);
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data[i] = ggml_fp16_to_fp32(data_f16[i]);
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data.resize(nelements);
finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ftype = itype;
@@ -367,11 +367,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
} break;
default:
{

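In whisper_model_quantize() the f16 read path already existed (the ftype == 1 branch above is unchanged context), so the whisper hunks only rename the intermediate f32 buffer from data to data_f32 and pass it to ggml_quantize_q4_0() / ggml_quantize_q4_1(), matching the naming now used by the gpt-2 and gpt-j examples.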