gpt : support quantisation of f16 model files

Georgi Gerganov 2 years ago
parent 10356cdcdd
commit 2fcbd28143

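The change shared by the gpt-2 and gpt-j quantizers below: the source model's tensors may now be stored as f32 (ftype 0) or f16 (ftype 1). f16 tensors are read into a ggml_fp16_t buffer and widened with ggml_fp16_to_fp32 before being handed to the quantizers, which only accept f32 input. A minimal sketch of that read path, written here as a hypothetical standalone helper (in the diff the same logic sits inline in each per-tensor loop):

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <vector>

#include "ggml.h"

// Hypothetical helper: read one tensor's data and return it as f32,
// regardless of whether it is stored as f32 (ftype 0) or f16 (ftype 1).
static bool read_tensor_as_f32(std::ifstream & finp, int32_t ftype, int32_t nelements,
                               std::vector<ggml_fp16_t> & data_f16,
                               std::vector<float> & data_f32) {
    if (ftype != 0 && ftype != 1) {
        fprintf(stderr, "unsupported ftype %d for integer quantization\n", ftype);
        return false;
    }

    if (ftype == 1) {
        // f16 on disk: read the raw halves, then widen each element to f32
        data_f16.resize(nelements);
        finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
        data_f32.resize(nelements);
        for (int i = 0; i < nelements; ++i) {
            data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
        }
    } else {
        // f32 on disk: read straight into the f32 buffer
        data_f32.resize(nelements);
        finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
    }

    return true;
}

Converting once into data_f32 keeps ggml_quantize_q4_0/ggml_quantize_q4_1 unaware of the on-disk element type.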
@@ -229,9 +229,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
+    std::vector<uint8_t> data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float> data_f32;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -260,14 +263,6 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
-        if (ftype != 0) {
-            fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-            return false;
-        }
-
-        data.resize(nelements);
-        finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-
         // regexes of tensor names to be quantized
         const std::vector<std::string> k_names = {
             "model/wte",
@@ -286,7 +281,29 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         }
 
         if (quantize) {
+            if (ftype != 0 && ftype != 1) {
+                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                return false;
+            }
+
+            if (ftype == 1) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
             ftype = itype;
+        } else {
+            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -306,11 +323,11 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
@@ -324,9 +341,9 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
         } else {
-            printf("\n");
-            fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-            total_size_new += nelements * sizeof(float);
+            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+            total_size_new += data_u8.size();
         }
 
         total_size_org += nelements * sizeof(float);
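The else branch added above also changes how tensors that are not quantized get copied: instead of always going through a float buffer, they are passed through verbatim, with the element width picked from the source ftype, so f16 tensors stay f16 in the output file and the reported size reflects the bytes actually written. A sketch of that pass-through, again as a hypothetical helper:

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <vector>

// Hypothetical helper: copy a non-quantized tensor's payload unchanged.
static void copy_tensor_raw(std::ifstream & finp, std::ofstream & fout,
                            int32_t ftype, int32_t nelements,
                            std::vector<uint8_t> & data_u8, size_t & total_size_new) {
    // ftype 0 = f32, ftype 1 = f16: keep the original element width
    const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);

    data_u8.resize(nelements*bpe);
    finp.read (reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
    fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());

    total_size_new += data_u8.size();
}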

@@ -232,9 +232,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
+    std::vector<uint8_t> data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float> data_f32;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -263,14 +266,6 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
-        if (ftype != 0) {
-            fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-            return false;
-        }
-
-        data.resize(nelements);
-        finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-
         // regexes of tensor names to be quantized
         const std::vector<std::string> k_names = {
             ".*weight",
@@ -282,14 +277,35 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                 quantize = true;
                 break;
             }
         }
 
         // quantize only 2D tensors
         quantize &= (n_dims == 2);
 
         if (quantize) {
+            if (ftype != 0 && ftype != 1) {
+                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                return false;
+            }
+
+            if (ftype == 1) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
             ftype = itype;
+        } else {
+            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -309,11 +325,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {
@@ -327,9 +343,9 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
         } else {
-            printf("\n");
-            fout.write(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
-            total_size_new += nelements * sizeof(float);
+            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+            total_size_new += data_u8.size();
        }
 
         total_size_org += nelements * sizeof(float);

@@ -270,11 +270,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
-    std::vector<float> data;
     std::vector<float> work;
 
     std::vector<uint8_t> data_u8;
     std::vector<ggml_fp16_t> data_f16;
+    std::vector<float> data_f32;
 
     while (true) {
         int32_t n_dims;
@@ -333,13 +333,13 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             if (ftype == 1) {
                 data_f16.resize(nelements);
                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
-                data.resize(nelements);
+                data_f32.resize(nelements);
                 for (int i = 0; i < nelements; ++i) {
-                    data[i] = ggml_fp16_to_fp32(data_f16[i]);
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                 }
             } else {
-                data.resize(nelements);
-                finp.read(reinterpret_cast<char *>(data.data()), nelements * sizeof(float));
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
             }
 
             ftype = itype;
@@ -367,11 +367,11 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data.data(), work.data(), nelements, ne[0]);
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0]);
                     } break;
                 default:
                     {

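In whisper's quantizer the f16 read path already existed, so the hunks above only rename data to data_f32 so that all three tools use the same buffer names. Downstream, the converted f32 data feeds the quantizers identically in all three; a simplified sketch of how that step might look (buffer sizing and the output write here are illustrative, not copied verbatim from the diff):

#include <cstdio>
#include <fstream>
#include <vector>

#include "ggml.h"

// Illustrative only: quantize one tensor's f32 data to q4_0 or q4_1
// and account for the quantized size in the output total.
static bool quantize_tensor(ggml_type type, std::vector<float> & data_f32,
                            std::vector<float> & work, int nelements, int ne0,
                            std::ofstream & fout, size_t & total_size_new) {
    work.resize(nelements); // the quantized blocks always fit in nelements floats

    size_t cur_size = 0;
    switch (type) {
        case GGML_TYPE_Q4_0:
            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne0);
            break;
        case GGML_TYPE_Q4_1:
            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne0);
            break;
        default:
            fprintf(stderr, "unsupported quantization type %d\n", (int) type);
            return false;
    }

    fout.write(reinterpret_cast<const char *>(work.data()), cur_size);
    total_size_new += cur_size;

    return true;
}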