diff --git a/Makefile b/Makefile
index 56c3793..20915e3 100644
--- a/Makefile
+++ b/Makefile
@@ -115,11 +115,15 @@ endif
 ifeq ($(UNAME_M),amd64)
     CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
-ifeq ($(UNAME_M),ppc64le)
+ifneq ($(filter ppc64%,$(UNAME_M)),)
     POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
     ifneq (,$(findstring POWER9,$(POWER9_M)))
         CFLAGS += -mpower9-vector
     endif
+    # Require C++23's std::byteswap for big-endian support.
+    ifeq ($(UNAME_M),ppc64)
+        CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
+    endif
 endif
 ifndef WHISPER_NO_ACCELERATE
     # Mac M1 - include Accelerate framework
diff --git a/ggml.c b/ggml.c
index 16f0f85..d2a5053 100644
--- a/ggml.c
+++ b/ggml.c
@@ -339,8 +339,12 @@ int64_t ggml_cycles_per_ms(void) {
 #if defined(__cpp_lib_hardware_interference_size)
 #define CACHE_LINE_SIZE hardware_destructive_interference_size
 #else
+#if defined(__POWER9_VECTOR__)
+#define CACHE_LINE_SIZE 128
+#else
 #define CACHE_LINE_SIZE 64
 #endif
+#endif
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
@@ -609,9 +613,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
   vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
   vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_F16_VEC_STORE(p, r, i) \
-  if (i & 0x1) \
-    vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)
+#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
+#define GGML_F16_VEC_STORE(p, r, i)                             \
+  if (i & 0x1)                                                  \
+    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
+                                   r[i - GGML_ENDIAN_BYTE(0)]), \
+            0, p - GGML_F16_EPR)
 
 #elif defined(__wasm_simd128__)
diff --git a/whisper.cpp b/whisper.cpp
index 81458a5..d12cc4a 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -17,6 +17,68 @@
 #include <regex>
 #include <random>
 
+#if defined(GGML_BIG_ENDIAN)
+#include <bit>
+
+template<typename T>
+static T byteswap(T value) {
+    return std::byteswap(value);
+}
+
+template<>
+float byteswap(float value) {
+    return std::bit_cast<float>(byteswap(std::bit_cast<uint32_t>(value)));
+}
+
+template<typename T>
+static void byteswap_tensor_data(ggml_tensor * tensor) {
+    T * datum = reinterpret_cast<T *>(tensor->data);
+    for (int i = 0; i < ggml_nelements(tensor); i++) {
+        datum[i] = byteswap(datum[i]);
+    }
+}
+
+static void byteswap_tensor(ggml_tensor * tensor) {
+    switch (tensor->type) {
+        case GGML_TYPE_I16: {
+            byteswap_tensor_data<int16_t>(tensor);
+            break;
+        }
+        case GGML_TYPE_F16: {
+            byteswap_tensor_data<ggml_fp16_t>(tensor);
+            break;
+        }
+        case GGML_TYPE_I32: {
+            byteswap_tensor_data<int32_t>(tensor);
+            break;
+        }
+        case GGML_TYPE_F32: {
+            byteswap_tensor_data<float>(tensor);
+            break;
+        }
+        default: { // GGML_TYPE_I8
+            break;
+        }
+    }
+}
+
+#define BYTESWAP_VALUE(d) d = byteswap(d)
+#define BYTESWAP_FILTERS(f)           \
+    do {                              \
+        for (auto & datum : f.data) { \
+            datum = byteswap(datum);  \
+        }                             \
+    } while (0)
+#define BYTESWAP_TENSOR(t)  \
+    do {                    \
+        byteswap_tensor(t); \
+    } while (0)
+#else
+#define BYTESWAP_VALUE(d) do {} while (0)
+#define BYTESWAP_FILTERS(f) do {} while (0)
+#define BYTESWAP_TENSOR(t) do {} while (0)
+#endif
+
 #define WHISPER_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -521,6 +583,7 @@ struct whisper_context {
 template<typename T>
 static void read_safe(whisper_model_loader * loader, T & dest) {
     loader->read(loader->context, &dest, sizeof(T));
+    BYTESWAP_VALUE(dest);
 }
 
 static bool kv_cache_init(
@@ -733,6 +796,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         filters.data.resize(filters.n_mel * filters.n_fft);
         loader->read(loader->context, filters.data.data(), filters.data.size() * sizeof(float));
+        BYTESWAP_FILTERS(filters);
     }
 
     // load vocab
@@ -1196,6 +1260,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             }
 
             loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
+            BYTESWAP_TENSOR(tensor);
 
             //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
             total_size += ggml_nbytes(tensor);
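
Note on the ggml.c hunk: GGML_F16_VEC_STORE packs two float vectors into one f16 vector, and the operand order expected by vec_pack_to_short_fp32 depends on host byte order. GGML_ENDIAN_BYTE decides that order by peeking at one byte of the 16-bit constant 1. Below is a minimal standalone sketch of the same trick (not part of the patch; endian_byte is a made-up name, written as a function rather than ggml.c's compound-literal macro so it is also valid C++, and it needs no VSX hardware):

#include <cstdint>
#include <cstdio>

// Mirrors GGML_ENDIAN_BYTE from the ggml.c hunk: byte 0 of the 16-bit
// value 1 is 1 on little-endian hosts and 0 on big-endian hosts.
static unsigned char endian_byte(int i) {
    static const uint16_t one = 1;
    return reinterpret_cast<const unsigned char *>(&one)[i];
}

int main() {
    const int i = 1; // an odd vector index, as tested by GGML_F16_VEC_STORE

    // Little-endian: endian_byte(1) == 0 and endian_byte(0) == 1, so the
    // pack order is (r[i], r[i - 1]) -- identical to the macro this patch
    // replaces. Big-endian: the indices flip, giving (r[i - 1], r[i]).
    std::printf("pack order: r[%d], r[%d]\n",
                i - endian_byte(1), i - endian_byte(0));
    return 0;
}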
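And a compile-and-run sketch of the byteswap helpers the whisper.cpp hunk adds (again not part of the patch; build with -std=c++23, which is exactly what the Makefile change turns on for ppc64). std::byteswap accepts only integer types, so the float case round-trips through std::bit_cast, as the template specialization in the patch does:

#include <bit>      // std::byteswap (C++23), std::bit_cast (C++20)
#include <cstdint>
#include <cstdio>

// Generic integer byteswap, as in the patch.
template<typename T>
static T byteswap(T value) {
    return std::byteswap(value);
}

// Floats have no integral byteswap, so reinterpret the bits as uint32_t,
// swap those, and cast back.
template<>
float byteswap(float value) {
    return std::bit_cast<float>(byteswap(std::bit_cast<uint32_t>(value)));
}

int main() {
    // Swapping twice must be the identity, or the loader cannot round-trip
    // little-endian model files on a big-endian host.
    const float x = 1.5f;
    std::printf("float round-trip ok: %s\n",
                byteswap(byteswap(x)) == x ? "yes" : "no");
    std::printf("0x1122 swaps to 0x%x\n", byteswap<uint16_t>(0x1122));
    return 0;
}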