Merge branch 'master' of github.com:djthorpe/whisper.cpp

pull/384/head
David Thorpe 3 years ago
commit 8c5ae594ef

@@ -130,6 +130,7 @@ if (WHISPER_ALL_WARNINGS)
     -Wcast-qual \
     -Wstrict-prototypes \
     -Wpointer-arith \
+    -Wno-unused-function \
 ")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
     -Wall \
@@ -156,6 +157,7 @@ else()
 if (MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
 else()
     if (EMSCRIPTEN)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")

@@ -87,6 +87,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	ifneq (,$(findstring f16c,$(F16C_M)))
 		CFLAGS += -mf16c
 	endif
+	SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
+	ifneq (,$(findstring sse3,$(SSE3_M)))
+		CFLAGS += -msse3
+	endif
 else ifeq ($(UNAME_S),Haiku)
 	AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
 	ifneq (,$(findstring avx,$(AVX1_M)))

File diff suppressed because one or more lines are too long

@@ -9,7 +9,19 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/
 # On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
 ./command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0
+```
+
+https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
+
+Web version: [examples/command.wasm](/examples/command.wasm)
+
+## Guided mode
+
+"Guided mode" allows you to specify a list of commands (i.e. strings) and the transcription will be guided to classify your command into one from the list. This can be useful in situations where a device is listening only for a small subset of commands.
+
+Initial tests show that this approach might be extremely efficient in terms of performance, since it integrates very well with the "partial Encoder" idea from #137.
+
+```bash
 # Run in guided mode, the list of allowed commands is in commands.txt
 ./command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt
@@ -17,9 +29,8 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/
 ./command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
 ```
-https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
+https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9b8b-aeeb76bee969.mp4
 
-Web version: [examples/command.wasm](/examples/command.wasm)
 
 ## Building

@@ -33,8 +33,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
     --bind \
     -s USE_PTHREADS=1 \
     -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1600MB \
-    -s TOTAL_MEMORY=1600MB \
+    -s INITIAL_MEMORY=1800MB \
+    -s TOTAL_MEMORY=1800MB \
     -s FORCE_FILESYSTEM=1 \
     -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
     ${EXTRA_FLAGS} \

@@ -36,7 +36,7 @@ In order to run this demo efficiently, you need to have the following:
 - Latest Chrome or Firefox browser (Safari is not supported)
 - Run this on a desktop or laptop with modern CPU (a mobile phone will likely not be good enough)
 - Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
-- The web-page uses about 1.6GB of RAM
+- The web-page uses about 1.8GB of RAM
 
 Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
 Also, the prompting strategy can likely be improved to achieve better results.

ggml.c

@@ -79,8 +79,11 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
+/*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
+#define GGML_SOFT_MAX_UNROLL 4
+#define GGML_VEC_DOT_UNROLL  4
 
 #if UINTPTR_MAX == 0xFFFFFFFF
 #define GGML_MEM_ALIGN 4
@@ -124,13 +127,8 @@ typedef double ggml_float;
 //
 #include <arm_neon.h>
 
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
 
 #define GGML_FP16_TO_FP32(x) (x)
 #define GGML_FP32_TO_FP16(x) (x)
@@ -150,15 +148,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #endif
 
 #ifdef __F16C__
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}
-
-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
 #else
@@ -183,7 +175,7 @@ static inline uint32_t fp32_to_bits(float f) {
     return fp32.as_bits;
 }
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
     const uint32_t w = (uint32_t) h << 16;
     const uint32_t sign = w & UINT32_C(0x80000000);
     const uint32_t two_w = w + w;
@@ -206,7 +198,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
     return fp32_from_bits(result);
 }
 
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
     const float scale_to_inf  = 0x1.0p+112f;
     const float scale_to_zero = 0x1.0p-110f;
@@ -232,8 +224,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #endif // __F16C__
@@ -249,6 +241,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
 // precomputed exp table for f16 (128 KB)
 static ggml_fp16_t table_exp_f16[1 << 16];
 
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
 //
 // timing
 //
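
Note: the new `table_f32_f16` turns FP16→FP32 conversion into a single array load; since `ggml_fp16_t` is 16 bits wide, a `1 << 16` entry table covers every possible input bit pattern, and the table is filled once in `ggml_init()` (see the hunk further down that writes `table_f32_f16[i]`). A standalone sketch of the same idea, using a plain branch-based scalar converter instead of ggml's `GGML_COMPUTE_FP16_TO_FP32` (all names here are illustrative, not part of the patch):

```c
#include <stdint.h>
#include <string.h>

/* straightforward scalar half -> float converter (handles zero, subnormals, inf/NaN) */
static float fp16_bits_to_fp32(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    uint32_t exp  = (h >> 10) & 0x1Fu;
    uint32_t man  =  h        & 0x3FFu;
    uint32_t bits;

    if (exp == 0) {
        if (man == 0) {
            bits = sign;                                    /* +/- 0 */
        } else {
            int e = -1;                                     /* normalize a subnormal */
            do { man <<= 1; e++; } while ((man & 0x400u) == 0);
            bits = sign | (uint32_t)(127 - 15 - e) << 23 | (man & 0x3FFu) << 13;
        }
    } else if (exp == 31) {
        bits = sign | 0x7F800000u | man << 13;              /* inf / NaN */
    } else {
        bits = sign | (exp - 15 + 127) << 23 | man << 13;   /* normal number */
    }

    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

static float table_f32_f16[1 << 16];                        /* 256 KB: one float per bit pattern */

static void init_f16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_f32_f16[i] = fp16_bits_to_fp32((uint16_t) i);
    }
}

/* the fast path: conversion becomes one indexed load */
static inline float lookup_fp16_to_fp32(uint16_t h) {
    return table_f32_f16[h];
}
```
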
@@ -692,6 +712,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_MUL            GGML_F16x4_MUL
 #define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
 
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD     _mm_add_ps
+#define GGML_F32x4_MUL     _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);     \
+    }                                              \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);     \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));      \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx4
+#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
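
Note: the `GGML_F32x4_REDUCE` macro above folds `GGML_F32_ARR` (= `GGML_F32_STEP/GGML_F32_EPR` = 8) partial-sum registers pairwise and then horizontally adds the four lanes of the survivor. A scalar model of the same reduction, for readability only, assuming 8 accumulators of 4 lanes each as the definitions above imply:

```c
/* scalar model of GGML_F32x4_REDUCE: 8 accumulators x 4 lanes -> one float */
static float reduce_model(float x[8][4]) {
    for (int i = 0; i < 8/2; ++i)                           /* x[2i] += x[2i+1] */
        for (int l = 0; l < 4; ++l) x[2*i][l] += x[2*i+1][l];
    for (int i = 0; i < 8/4; ++i)                           /* x[4i] += x[4i+2] */
        for (int l = 0; l < 4; ++l) x[4*i][l] += x[4*i+2][l];
    for (int i = 0; i < 8/8; ++i)                           /* x[8i] += x[8i+4] */
        for (int l = 0; l < 4; ++l) x[8*i][l] += x[8*i+4][l];
    /* the two _mm_hadd_ps calls: horizontal add of the last 4 lanes */
    return x[0][0] + x[0][1] + x[0][2] + x[0][3];
}
```
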
@@ -796,6 +911,61 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     *s = sumf;
 }
 
+// compute GGML_VEC_DOT_UNROLL dot products at once
+// xs - x row stride in bytes
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
+
+    const ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL] = { xv };
+
+    for (int i = 1; i < GGML_VEC_DOT_UNROLL; ++i) {
+        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+            }
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]);
+        }
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]);
+        }
+    }
+#endif
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        s[i] = sumf[i];
+    }
+}
+
 inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
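
Note: the point of `ggml_vec_dot_f16_unroll` is to compute `GGML_VEC_DOT_UNROLL` (= 4) dot products against the same `y` in a single pass, so each `y[i]` is converted and loaded once instead of four times. A minimal f32 model of the same access pattern (hypothetical names, plain C instead of the F16 SIMD path):

```c
#include <stddef.h>
#include <stdio.h>

#define VEC_DOT_UNROLL 4

/* Scalar model of the unrolled dot product: VEC_DOT_UNROLL dot products that
 * share one y. xs is the row stride in bytes, exactly as in the patched code. */
static void vec_dot_unroll(int n, size_t xs, float *s, const void *xv, const float *y) {
    const float *x[VEC_DOT_UNROLL];
    for (int k = 0; k < VEC_DOT_UNROLL; ++k) {
        x[k] = (const float *) ((const char *) xv + k*xs);
        s[k] = 0.0f;
    }
    for (int i = 0; i < n; ++i) {
        for (int k = 0; k < VEC_DOT_UNROLL; ++k) {
            s[k] += x[k][i] * y[i];   /* y[i] is reused across all four rows */
        }
    }
}

int main(void) {
    float rows[4][3] = { {1, 0, 0}, {0, 1, 0}, {0, 0, 1}, {1, 1, 1} };
    float y[3] = {2, 3, 4};
    float s[4];
    vec_dot_unroll(3, sizeof(rows[0]), s, rows, y);
    printf("%g %g %g %g\n", s[0], s[1], s[2], s[3]);   /* prints: 2 3 4 9 */
    return 0;
}
```
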
@@ -927,7 +1097,30 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif
 
-inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
+inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += x[i];
+    }
+    *s = sum;
+#else
+    vDSP_sve(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    ggml_float max = -INFINITY;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+    }
+    *s = max;
+#else
+    vDSP_maxv(x, 1, s, n);
+#endif
+}
 
 inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
 
 //
@@ -1181,25 +1374,25 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
 
-bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_is_vector(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1208,7 +1401,7 @@ bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor *
         (t0->ne[3] == t1->ne[3]);
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1218,7 +1411,7 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1227,7 +1420,7 @@ bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1238,7 +1431,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
 }
 
 // check if t1 can be represented as a repeatition of t0
-bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1248,14 +1441,20 @@ bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t
         (t1->ne[3]%t0->ne[3] == 0);
 }
 
-int ggml_up32(int n) {
+static inline int ggml_up32(int n) {
     return (n + 31) & ~31;
 }
 
-int ggml_up64(int n) {
+static inline int ggml_up64(int n) {
     return (n + 63) & ~63;
 }
 
+static inline int ggml_up(int n, int m) {
+    // assert m is a power of 2
+    GGML_ASSERT((m & (m - 1)) == 0);
+    return (n + m - 1) & ~(m - 1);
+}
+
 // assert that pointer is aligned to GGML_MEM_ALIGN
 #define ggml_assert_aligned(ptr) \
     assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
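
Note: `ggml_up(n, m)` rounds `n` up to the next multiple of `m` for power-of-two `m`; it is what pads `M` up to `Mup` in the flash-attention changes further down. A couple of spot checks, as a standalone sketch:

```c
#include <assert.h>

static int up_pow2(int n, int m) {       /* same formula as ggml_up */
    assert((m & (m - 1)) == 0);          /* m must be a power of two */
    return (n + m - 1) & ~(m - 1);
}

int main(void) {
    assert(up_pow2(10, 4) == 12);
    assert(up_pow2(12, 4) == 12);        /* already a multiple: unchanged */
    assert(up_pow2( 1, 4) ==  4);        /* e.g. padding M up to a GGML_SOFT_MAX_UNROLL multiple */
    return 0;
}
```
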
@@ -1269,7 +1468,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
-        // initialize GELU and EXP tables
+        // initialize GELU, EXP and F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -1277,7 +1476,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             for (int i = 0; i < (1 << 16); ++i) {
                 uint16_t ui = i;
                 memcpy(&ii, &ui, sizeof(ii));
-                const float f = GGML_FP16_TO_FP32(ii);
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
                 table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));
             }
@@ -1292,7 +1491,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
             g_state = (struct ggml_state) {
-                /*.contexts =*/ { 0 },
+                /*.contexts =*/ { { 0 } },
             };
 
             for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4546,7 +4745,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // TODO: do not support transposed src1
     assert(nb10/2 == sizeof(ggml_fp16_t));
 
-    // parallelize by src0 rows using ggml_vec_dot_f32
+    // parallelize by src0 rows using ggml_vec_dot_f16
 
     // total rows in src0
     const int nr = ne01*ne02*ne03;
@@ -4574,13 +4773,13 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         const int i3 = i03;
 
         ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        ggml_fp16_t * src1_col = wdata + (i13*ne12*ne11 + i12*ne11 + 0)*ne00;
+        ggml_fp16_t * src1_col = wdata + (       0 + i12*ne11 + i13*ne12*ne11)*ne00;
 
         float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
 
-        for (int ic = 0; ic < ne11; ++ic) {
-            assert(ne00 % 32 == 0);
+        assert(ne00 % 32 == 0);
+
+        for (int ic = 0; ic < ne11; ++ic) {
             ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
         }
     }
@@ -4959,21 +5158,19 @@ static void ggml_compute_forward_soft_max_f32(
 #endif
 
         float max = -INFINITY;
-        for (int i = 0; i < nc; i++) {
-            max = MAX(max, p[i]);
-        }
+        ggml_vec_max_f32(nc, &max, p);
 
         ggml_float sum = 0.0;
 
-        uint16_t ss;
+        uint16_t scvt;
         for (int i = 0; i < nc; i++) {
             if (p[i] == -INFINITY) {
-                p[i] = 0.0;
+                p[i] = 0.0f;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
                 ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
-                memcpy(&ss, &s, sizeof(ss));
-                const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
+                memcpy(&scvt, &s, sizeof(scvt));
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                 sum += val;
                 p[i] = val;
             }
@@ -5685,6 +5882,8 @@ static void ggml_compute_forward_flash_attn_f32(
     const int P = nek1 - N;
     const int M = P + N;
 
+    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne1 == N);
     GGML_ASSERT(P >= 0);
@@ -5737,7 +5936,11 @@ static void ggml_compute_forward_flash_attn_f32(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float * S = (float *) params->wdata + ith*(M + CACHE_LINE_SIZE_F32);
+        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
+
+        for (int i = M; i < Mup; ++i) {
+            S[i] = -INFINITY;
+        }
 
         for (int ic = 0; ic < nek1; ++ic) {
             // k indices
@@ -5768,30 +5971,50 @@
         // softmax
         {
             float max = -INFINITY;
-            for (int i = 0; i < M; i++) {
-                max = MAX(max, S[i]);
-            }
-
-            ggml_float sum = 0.0;
-
-            uint16_t ss;
-            for (int i = 0; i < M; i++) {
-                if (S[i] == -INFINITY) {
-                    S[i] = 0.0;
-                } else {
-                    //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
-                    memcpy(&ss, &s, sizeof(ss));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
-                    sum += val;
-                    S[i] = val;
-                }
-            }
+            ggml_vec_max_f32(M, &max, S);
+
+            float sum = 0.0f;
+            {
+#ifndef GGML_USE_ACCELERATE
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
+
+                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
+                    float * SS = S + i;
+
+                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
+                        if (SS[j] == -INFINITY) {
+                            SS[j] = 0.0f;
+                        } else {
+                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
+                            memcpy(&scvt[j], &s, sizeof(uint16_t));
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            sump[j] += val;
+                            SS[j] = val;
+                        }
+                    }
+                }
+
+                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
+                    sum += sump[i];
+                }
+#else
+                vvexpf(S, S, &Mup);
+                ggml_vec_sum_f32(Mup, &sum, S);
+#endif
+            }
 
             assert(sum > 0.0f);
 
             sum = 1.0/sum;
             ggml_vec_scale_f32(M, S, sum);
+
+#ifndef NDEBUG
+            for (int i = 0; i < M; ++i) {
+                assert(!isnan(S[i]));
+                assert(!isinf(S[i]));
+            }
+#endif
         }
 
         for (int ic = 0; ic < nev1; ++ic) {
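
Note: the reason padding `S[M..Mup)` with `-INFINITY` is safe is that the unrolled loop always consumes a full `GGML_SOFT_MAX_UNROLL` block, and the padded slots are mapped to 0 before they can contribute to the sum. A small model of the same pattern using plain `expf` instead of the fp16 exp table (illustrative only, assuming an unroll factor of 4):

```c
#include <math.h>
#include <stdio.h>

#define UNROLL 4

/* softmax over m valid entries stored in a buffer padded up to mup (a multiple of UNROLL) */
static void softmax_padded(float *s, int m, int mup) {
    for (int i = m; i < mup; ++i) s[i] = -INFINITY;   /* padding, as in the patch */

    float max = -INFINITY;
    for (int i = 0; i < m; ++i) max = fmaxf(max, s[i]);

    float sump[UNROLL] = { 0.0f };
    for (int i = 0; i < mup; i += UNROLL) {           /* fixed-size inner block, easy to unroll */
        for (int j = 0; j < UNROLL; ++j) {
            if (s[i + j] == -INFINITY) {
                s[i + j] = 0.0f;                      /* padded (or masked) entries contribute 0 */
            } else {
                s[i + j] = expf(s[i + j] - max);
                sump[j] += s[i + j];
            }
        }
    }
    float sum = sump[0] + sump[1] + sump[2] + sump[3];
    for (int i = 0; i < m; ++i) s[i] /= sum;
}

int main(void) {
    float s[8] = { 0.5f, 1.0f, -0.5f, 2.0f, 0.0f };   /* 5 valid entries, padded to 8 */
    softmax_padded(s, 5, 8);
    for (int i = 0; i < 5; ++i) printf("%f\n", s[i]); /* same result as an unpadded softmax */
    return 0;
}
```
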
@@ -5866,6 +6089,8 @@ static void ggml_compute_forward_flash_attn_f16(
     const int P = nek1 - N;
     const int M = P + N;
 
+    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne1 == N);
     GGML_ASSERT(P >= 0);
@@ -5918,8 +6143,14 @@ static void ggml_compute_forward_flash_attn_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
+        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
+
+        for (int i = M; i < Mup; ++i) {
+            S[i] = -INFINITY;
+        }
 
+        // looks like unrolling here does not help
+#if 1
         for (int ic = 0; ic < nek1; ++ic) {
             // k indices
             const int ik3 = iq3;
@@ -5934,6 +6165,24 @@ static void ggml_compute_forward_flash_attn_f16(
                     (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
                     (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
         }
+#else
+        GGML_ASSERT(nek1 % GGML_VEC_DOT_UNROLL == 0);
+
+        for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
+            // k indices
+            const int ik3 = iq3;
+            const int ik2 = iq2;
+            const int ik1 = ic;
+
+            // S indices
+            const int i1 = ik1;
+
+            ggml_vec_dot_f16_unroll(neq0, nbk1,
+                    S + i1,
+                    ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+                    (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+        }
+#endif
 
         // scale
         ggml_vec_scale_f32(nek1, S, scale);
@@ -5949,30 +6198,50 @@ static void ggml_compute_forward_flash_attn_f16(
         // softmax
        {
             float max = -INFINITY;
-            for (int i = 0; i < M; i++) {
-                max = MAX(max, S[i]);
-            }
-
-            ggml_float sum = 0.0;
-
-            uint16_t ss;
-            for (int i = 0; i < M; i++) {
-                if (S[i] == -INFINITY) {
-                    S[i] = 0.0;
-                } else {
-                    //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
-                    memcpy(&ss, &s, sizeof(ss));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
-                    sum += val;
-                    S[i] = val;
-                }
-            }
+            ggml_vec_max_f32(M, &max, S);
+
+            float sum = 0.0f;
+            {
+#ifndef GGML_USE_ACCELERATE
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
+
+                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
+                    float * SS = S + i;
+
+                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
+                        if (SS[j] == -INFINITY) {
+                            SS[j] = 0.0f;
+                        } else {
+                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
+                            memcpy(&scvt[j], &s, sizeof(uint16_t));
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            sump[j] += val;
+                            SS[j] = val;
+                        }
+                    }
+                }
+
+                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
+                    sum += sump[i];
+                }
+#else
+                vvexpf(S, S, &Mup);
+                ggml_vec_sum_f32(Mup, &sum, S);
+#endif
+            }
 
             assert(sum > 0.0f);
 
             sum = 1.0/sum;
             ggml_vec_scale_f32(M, S, sum);
+
+#ifndef NDEBUG
+            for (int i = 0; i < M; ++i) {
+                assert(!isnan(S[i]));
+                assert(!isinf(S[i]));
+            }
+#endif
         }
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
@@ -5981,15 +6250,17 @@ static void ggml_compute_forward_flash_attn_f16(
             S16[i] = GGML_FP32_TO_FP16(S[i]);
         }
 
-        for (int ic = 0; ic < nev1; ++ic) {
+        GGML_ASSERT(nev1 % GGML_VEC_DOT_UNROLL == 0);
+
+        for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
             // dst indices
             const int i1 = iq1;
             const int i2 = iq2;
             const int i3 = iq3;
 
-            ggml_vec_dot_f16(nek1,
+            ggml_vec_dot_f16_unroll(nek1, nbv1,
                     (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                    (ggml_fp16_t *) ((char *) v->data + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
+                    ((char *) v->data + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
                     S16);
         }
     }
@@ -6871,7 +7142,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
 
         if (state->node) {
-            ggml_compute_forward(&state->params, state->node);
+            if (state->params.ith < state->params.nth) {
+                ggml_compute_forward(&state->params, state->node);
+            }
             state->node = NULL;
         } else {
             break;
@@ -6965,9 +7238,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_MUL_MAT:
                 {
-                    // TODO: use different scheduling for different matrix sizes
                     node->n_tasks = n_threads;
 
+                    // TODO: use different scheduling for different matrix sizes
+                    //const int nr0 = ggml_nrows(node->src0);
+                    //const int nr1 = ggml_nrows(node->src1);
+
+                    //node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
+                    //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
+
                     size_t cur = 0;
 
                     // TODO: better way to determine if the matrix is transposed
@@ -6978,6 +7257,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                            node->n_tasks = 1;
                             cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
                         } else {
                             cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
@@ -7053,14 +7333,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     size_t cur = 0;
 
+                    const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
+
                     if (node->src1->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
                     }
 
                     if (node->src1->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
                     }
 
                     work_size = MAX(work_size, cur);
@@ -7149,7 +7431,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             workers[j].params = (struct ggml_compute_params) {
                 .type  = GGML_TASK_COMPUTE,
                 .ith   = j + 1,
-                .nth   = n_threads,
+                .nth   = node->n_tasks,
                 .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                 .wdata = cgraph->work ? cgraph->work->data : NULL,
             };
@@ -7204,7 +7486,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             workers[j].params = (struct ggml_compute_params) {
                 .type  = GGML_TASK_FINALIZE,
                 .ith   = j + 1,
-                .nth   = n_threads,
+                .nth   = node->n_tasks,
                 .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                 .wdata = cgraph->work ? cgraph->work->data : NULL,
             };
@@ -8232,6 +8514,14 @@ int ggml_cpu_has_blas(void) {
 #endif
 }
 
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;

@@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
 #ifdef __cplusplus

@@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
     s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
     s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
 
     return s.c_str();
