From eb062bb012c4e131818dd757a6d3a757fdee3961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebasti=C3=A1n=20A?= Date: Sun, 12 Mar 2023 17:15:00 -0300 Subject: [PATCH] Windows fixes (#31) * Apply fixes suggested to build on windows Issue: https://github.com/ggerganov/llama.cpp/issues/22 * Remove unsupported VLAs * MSVC: Remove features that are only available on MSVC C++20. * Fix zero initialization of the other fields. * Change the use of vector for stack allocations. --- ggml.c | 20 ++++++++++---------- main.cpp | 12 +++++++----- quantize.cpp | 1 + utils.cpp | 16 ++++++++++++---- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/ggml.c b/ggml.c index 71c3028..fbd7b93 100644 --- a/ggml.c +++ b/ggml.c @@ -407,8 +407,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; - uint8_t * restrict pd = (uint8_t *) (y + 0*bs); - uint8_t * restrict pb = (uint8_t *) (y + 0*bs + sizeof(float)); + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); uint8_t pp[QK/2]; @@ -654,8 +654,8 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; - const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs); - const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float)); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); // scalar for (int i = 0; i < nb; i++) { @@ -1301,11 +1301,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void const size_t bs = sizeof(float) + QK/2; - const uint8_t * restrict pd0 = (const uint8_t *) (x + 0*bs); - const uint8_t * restrict pd1 = (const uint8_t *) (y + 0*bs); + const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pd1 = ((const uint8_t *)y + 
0*bs); - const uint8_t * restrict pb0 = (const uint8_t *) (x + 0*bs + sizeof(float)); - const uint8_t * restrict pb1 = (const uint8_t *) (y + 0*bs + sizeof(float)); + const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + sizeof(float)); float sumf = 0.0; @@ -1731,8 +1731,8 @@ inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * res const int nb = n / QK; const size_t bs = sizeof(float) + QK/2; - const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs); - const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float)); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); #if __ARM_NEON #if QK == 32 diff --git a/main.cpp b/main.cpp index f02b5dd..a11d755 100644 --- a/main.cpp +++ b/main.cpp @@ -209,8 +209,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // create the ggml context { struct ggml_init_params params = { - .mem_size = ctx_size, - .mem_buffer = NULL, + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, }; model.ctx = ggml_init(params); @@ -546,12 +546,13 @@ bool llama_eval( } struct ggml_init_params params = { - .mem_size = buf_size, - .mem_buffer = buf, + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, }; struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = { .n_threads = n_threads }; + ggml_cgraph gf = {}; + gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -733,6 +734,7 @@ bool llama_eval( } int main(int argc, char ** argv) { + ggml_time_init(); const int64_t t_main_start_us = ggml_time_us(); gpt_params params; diff --git a/quantize.cpp b/quantize.cpp index 0ae5373..14c7b27 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -289,6 +289,7 @@ bool llama_model_quantize(const 
std::string & fname_inp, const std::string & fna // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // int main(int argc, char ** argv) { + ggml_time_init(); if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = 2 - q4_0\n"); diff --git a/utils.cpp b/utils.cpp index 49023bd..58e7070 100644 --- a/utils.cpp +++ b/utils.cpp @@ -5,6 +5,12 @@ #include <fstream> #include <regex> + #if defined(_MSC_VER) || defined(__MINGW32__) + #include <malloc.h> // using malloc.h with MSC/MINGW + #elif !defined(__FreeBSD__) + #include <alloca.h> + #endif + bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { for (int i = 1; i < argc; i++) { std::string arg = argv[i]; @@ -472,7 +478,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t assert(k % qk == 0); - uint8_t pp[qk/2]; + const size_t pp_size = qk / 2; + uint8_t *pp = static_cast<uint8_t *>(alloca(pp_size)); char * pdst = (char *) dst; @@ -511,7 +518,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb, pp, sizeof(pp)); + memcpy(pb, pp, pp_size); pb += bs; } } @@ -526,7 +533,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t assert(k % qk == 0); - uint8_t pp[qk/2]; + const size_t pp_size = qk / 2; + uint8_t *pp = static_cast<uint8_t *>(alloca(pp_size)); char * pdst = (char *) dst; @@ -570,7 +578,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*qk/2, pp, sizeof(pp)); + memcpy(pb + i*qk/2, pp, pp_size); } } }