Merge branch 'master' of github.com:ggerganov/whisper.cpp

3 years ago · b6740c090f
parent fe6670dc68 1944e7c33e
commit b6740c090f
22 changed files with 259 additions and 301 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -235,3 +235,33 @@ jobs:
              with:
                name: whisper-blas-bin-${{ matrix.arch }}
                path: build/bin/${{ matrix.build }}
+
+    emscripten:
+        runs-on: ubuntu-latest
+
+        strategy:
+            matrix:
+                build: [Release]
+
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1
+
+            - name: Dependencies
+              run: |
+                wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
+                tar -xvf master.tar.gz
+                emsdk-master/emsdk update
+                emsdk-master/emsdk install latest
+                emsdk-master/emsdk activate latest
+
+            - name: Configure
+              run: echo "tmp"
+
+            - name: Build
+              run: |
+                pushd emsdk-master
+                source ./emsdk_env.sh
+                popd
+                emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+                make
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,15 +1,16 @@
-cmake_minimum_required (VERSION 3.0)
+cmake_minimum_required (VERSION 3.19)

 project(whisper.cpp VERSION 1.0.4)

-set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
+# Add path to modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")

 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(WHISPER_STANDALONE ON)
-    include(cmake/GitVars.cmake)
-    include(cmake/BuildTypes.cmake)
+    include(GitVars)
+    include(BuildTypes)

    # configure project version
    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
@ -82,9 +83,6 @@ endif()

 # dependencies

-set(CMAKE_C_STANDARD   11)
-set(CMAKE_CXX_STANDARD 11)
-
 find_package(Threads REQUIRED)

 # on APPLE - include Accelerate framework
@ -190,6 +188,8 @@ add_library(${TARGET}
    whisper.cpp
    )

+include(DefaultTargetOptions)
+
 target_include_directories(${TARGET} PUBLIC
    .
    )
--- a/4
+++ b/4
@ -159,8 +159,8 @@ $(info I UNAME_M:  $(UNAME_M))
 $(info I CFLAGS:   $(CFLAGS))
 $(info I CXXFLAGS: $(CXXFLAGS))
 $(info I LDFLAGS:  $(LDFLAGS))
-$(info I CC:       $(CC) $(CCV))
-$(info I CXX:      $(CXX) $(CXXV))
+$(info I CC:       $(CCV))
+$(info I CXX:      $(CXXV))
 $(info )

 default: main
--- a/README.md
+++ b/README.md
@ -11,6 +11,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
 - AVX intrinsics support for x86 architectures
+- VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
 - Low memory usage (Flash Attention + Flash Forward)
 - Zero memory allocations at runtime
--- a/bindings/ios
+++ b/bindings/ios
@ -1 +1 @@
-Subproject commit 1502317fe050c1d5ae1a81082153b8f00d50d9cd
+Subproject commit 6707f1ea1caa7ee5e9b6908101fe98775e582cff
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/cmake/DefaultTargetOptions.cmake
+++ b/cmake/DefaultTargetOptions.cmake
@ -0,0 +1,17 @@
+# Set the default compile features and properties for a target.
+
+if (NOT TARGET)
+    message(FATAL_ERROR "TARGET not set before including DefaultTargetOptions")
+endif()
+
+target_compile_features(${TARGET}
+    PRIVATE
+        cxx_std_11
+    )
+
+set_target_properties(${TARGET}
+    PROPERTIES
+        EXPORT_COMPILE_COMMANDS ON
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+        INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
+)
--- a/examples/bench.wasm/CMakeLists.txt
+++ b/examples/bench.wasm/CMakeLists.txt
@ -8,6 +8,8 @@ add_executable(${TARGET}
    emscripten.cpp
    )

+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/examples/bench/CMakeLists.txt
+++ b/examples/bench/CMakeLists.txt
@ -1,3 +1,6 @@
 set(TARGET bench)
 add_executable(${TARGET} bench.cpp)
+
+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/command.wasm/CMakeLists.txt
+++ b/examples/command.wasm/CMakeLists.txt
@ -8,6 +8,8 @@ add_executable(${TARGET}
    emscripten.cpp
    )

+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -2,6 +2,9 @@ if (WHISPER_SUPPORT_SDL2)
    # command
    set(TARGET command)
    add_executable(${TARGET} command.cpp)
+
+    include(DefaultTargetOptions)
+
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@ -1,3 +1,6 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
+
+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -176,40 +176,27 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi

    const int n_segments = whisper_full_n_segments(ctx);

+    std::string speaker = "";
+
+    int64_t t0;
+    int64_t t1;
+
    // print the last n_new segments
    const int s0 = n_segments - n_new;
+
    if (s0 == 0) {
        printf("\n");
    }

    for (int i = s0; i < n_segments; i++) {
-        if (params.no_timestamps) {
-            if (params.print_colors) {
-                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special == false) {
-                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                        if (id >= whisper_token_eot(ctx)) {
-                            continue;
-                        }
+        if (!params.no_timestamps || params.diarize) {
+            t0 = whisper_full_get_segment_t0(ctx, i);
+            t1 = whisper_full_get_segment_t1(ctx, i);
        }

-                    const char * text = whisper_full_get_token_text(ctx, i, j);
-                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
-                }
-            } else {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                printf("%s", text);
+        if (!params.no_timestamps) {
+            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
        }
-            fflush(stdout);
-        } else {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-            std::string speaker;

        if (params.diarize && pcmf32s.size() == 2) {
            const int64_t n_samples = pcmf32s[0].size();
@ -237,7 +224,6 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
        }

        if (params.print_colors) {
-                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
                if (params.print_special == false) {
                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
@ -253,13 +239,18 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi

                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
            }
-                printf("\n");
        } else {
            const char * text = whisper_full_get_segment_text(ctx, i);

-                printf("[%s --> %s]  %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
+            printf("%s%s", speaker.c_str(), text);
        }
+
+        // with timestamps or speakers: each segment on new line
+        if (!params.no_timestamps || params.diarize) {
+            printf("\n");
        }
+
+        fflush(stdout);
    }
 }

@ -557,7 +548,7 @@ int main(int argc, char ** argv) {
            }

            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
-                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
+                fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
                return 8;
            }

--- a/examples/stream.wasm/CMakeLists.txt
+++ b/examples/stream.wasm/CMakeLists.txt
@ -8,6 +8,8 @@ add_executable(${TARGET}
    emscripten.cpp
    )

+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@ -2,6 +2,9 @@ if (WHISPER_SUPPORT_SDL2)
    # stream
    set(TARGET stream)
    add_executable(${TARGET} stream.cpp)
+
+    include(DefaultTargetOptions)
+
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/talk.wasm/CMakeLists.txt
+++ b/examples/talk.wasm/CMakeLists.txt
@ -9,6 +9,8 @@ add_executable(${TARGET}
    gpt-2.cpp
    )

+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@ -8,6 +8,9 @@ if (WHISPER_SUPPORT_SDL2)
    # TODO: this is temporary
    #       need to export ggml symbols for MSVC, but too lazy ..
    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
+
+    include(DefaultTargetOptions)
+
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -8,6 +8,8 @@ add_executable(${TARGET}
    emscripten.cpp
    )

+include(DefaultTargetOptions)
+
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/ggml.c
+++ b/ggml.c
@ -88,9 +88,6 @@ typedef void* thread_ret_t;
    #define GGML_MEM_ALIGN 16
 #endif

-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

@ -108,6 +105,11 @@ typedef void* thread_ret_t;
 #include <cblas.h>
 #endif

+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 // floating point type used to accumulate sums
 typedef double ggml_float;

@ -417,8 +419,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
-    #define GGML_F16_VEC_LOAD   GGML_F16x8_LOAD
-    #define GGML_F16_VEC_STORE  GGML_F16x8_STORE
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
@ -443,8 +445,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
-    #define GGML_F16_VEC_LOAD   GGML_F32Cx4_LOAD
-    #define GGML_F16_VEC_STORE  GGML_F32Cx4_STORE
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
@ -521,8 +523,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F16_VEC                GGML_F32Cx8
 #define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
 #define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD   GGML_F32Cx8_LOAD
-#define GGML_F16_VEC_STORE  GGML_F32Cx8_STORE
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
 #define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
 #define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
 #define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
@ -530,20 +532,18 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

 #elif defined(__POWER9_VECTOR__)

-// TODO: uncomment this when it works
-//#define GGML_SIMD
+#define GGML_SIMD

 // F32 POWER9

 #define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
+#define GGML_F32_EPR  4

-// TODO: not tested !!
-#define GGML_F32x4         __vector float
-#define GGML_F32x4_ZERO    (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
-#define GGML_F32x4_SET1(x) (__vector float){x, x, x, x}
-#define GGML_F32x4_LOAD    vec_vsx_ld
-#define GGML_F32x4_STORE   vec_vsx_st
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
 #define GGML_F32x4_ADD          vec_add
 #define GGML_F32x4_MUL          vec_mul
@ -575,8 +575,20 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

 // F16 POWER9
-// TODO: implement here
-// ...
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
+  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+  vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_F16_VEC_STORE(p, r, i)                                      \
+  if (i & 0x1)                                                           \
+    vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)

 #elif defined(__wasm_simd128__)

@ -677,8 +689,8 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC                GGML_F16x4
 #define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
 #define GGML_F16_VEC_SET1           GGML_F16x4_SET1
-#define GGML_F16_VEC_LOAD   GGML_F16x4_LOAD
-#define GGML_F16_VEC_STORE  GGML_F16x4_STORE
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
 #define GGML_F16_VEC_FMA            GGML_F16x4_FMA
 #define GGML_F16_VEC_ADD            GGML_F16x4_ADD
 #define GGML_F16_VEC_MUL            GGML_F16x4_MUL
@ -860,8 +872,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
        }
@ -874,76 +886,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
    for (int i = np; i < n; ++i) {
        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
    }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    //       being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-
-    vector float sum0 = vec_splats (0.0f);
-    vector float sum1 = vec_splats (0.0f);
-    vector float sum2 = vec_splats (0.0f);
-    vector float sum3 = vec_splats (0.0f);
-    vector float sum4 = vec_splats (0.0f);
-    vector float sum5 = vec_splats (0.0f);
-    vector float sum6 = vec_splats (0.0f);
-    vector float sum7 = vec_splats (0.0f);
-
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned.
-        vector unsigned short x0 = vec_xl(j +  0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_ld(j +  0, y);
-        vector unsigned short y1 = vec_ld(j + 16, y);
-        vector unsigned short y2 = vec_ld(j + 32, y);
-        vector unsigned short y3 = vec_ld(j + 48, y);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        sum0 = vec_madd(fx0l, fy0l, sum0);
-        sum1 = vec_madd(fx0h, fy0h, sum1);
-        sum2 = vec_madd(fx1l, fy1l, sum2);
-        sum3 = vec_madd(fx1h, fy1h, sum3);
-        sum4 = vec_madd(fx2l, fy2l, sum4);
-        sum5 = vec_madd(fx2h, fy2h, sum5);
-        sum6 = vec_madd(fx3l, fy3l, sum6);
-        sum7 = vec_madd(fx3h, fy3h, sum7);
-    }
-
-    sum0 = vec_add(sum0, sum1);
-    sum2 = vec_add(sum2, sum3);
-    sum4 = vec_add(sum4, sum5);
-    sum6 = vec_add(sum6, sum7);
-
-    sum0 = vec_add(sum0, sum2);
-    sum4 = vec_add(sum4, sum6);
-
-    sum0 = vec_add(sum0, sum4);
-
-    sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
-         + vec_extract(sum0, 2) + vec_extract(sum0, 3);
-
-    for (int i = n32; i < n; ++i) {
-        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
-    }
 #else
    for (int i = 0; i < n; ++i) {
        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
@ -995,11 +937,11 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay[j]);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

@ -1008,65 +950,6 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
        GGML_ASSERT(false);
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
    }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    //       being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned!
-        vector unsigned short x0 = vec_xl(j +  0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_xl(j +  0, y);
-        vector unsigned short y1 = vec_xl(j + 16, y);
-        vector unsigned short y2 = vec_xl(j + 32, y);
-        vector unsigned short y3 = vec_xl(j + 48, y);
-
-        vector float v4 = vec_splats(v);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        fy0l = vec_madd(fx0l, v4, fy0l);
-        fy0h = vec_madd(fx0h, v4, fy0h);
-        fy1l = vec_madd(fx1l, v4, fy1l);
-        fy1h = vec_madd(fx1h, v4, fy1h);
-        fy2l = vec_madd(fx2l, v4, fy2l);
-        fy2h = vec_madd(fx2h, v4, fy2h);
-        fy3l = vec_madd(fx3l, v4, fy3l);
-        fy3h = vec_madd(fx3h, v4, fy3h);
-
-        y0 = vec_pack_to_short_fp32(fy0h, fy0l);
-        y1 = vec_pack_to_short_fp32(fy1h, fy1l);
-        y2 = vec_pack_to_short_fp32(fy2h, fy2l);
-        y3 = vec_pack_to_short_fp32(fy3h, fy3l);
-
-        vec_xst(y0, j +  0, y);
-        vec_xst(y1, j + 16, y);
-        vec_xst(y2, j + 32, y);
-        vec_xst(y3, j + 48, y);
-    }
-
-    for (int i = n32; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
 #else
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
@ -1334,7 +1217,7 @@ static struct ggml_state g_state;
 static atomic_int g_state_barrier = 0;

 // barrier via spin lock
-inline static void ggml_critical_section_start() {
+inline static void ggml_critical_section_start(void) {
    int processing = atomic_fetch_add(&g_state_barrier, 1);

    while (processing > 0) {
@ -1347,7 +1230,7 @@ inline static void ggml_critical_section_start() {

 // TODO: make this somehow automatically executed
 //       some sort of "sentry" mechanism
-inline static void ggml_critical_section_end() {
+inline static void ggml_critical_section_end(void) {
    atomic_fetch_sub(&g_state_barrier, 1);
 }

@ -8456,4 +8339,12 @@ int ggml_cpu_has_sse3(void) {
 #endif
 }

+int ggml_cpu_has_vsx(void) {
+#if defined(__POWER9_VECTOR__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
--- a/ggml.h
+++ b/ggml.h
@ -732,6 +732,7 @@ int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
+int ggml_cpu_has_vsx(void);

 #ifdef  __cplusplus
 }
--- a/whisper.cpp
+++ b/whisper.cpp
@ -412,6 +412,8 @@ struct whisper_context {
    std::vector<uint8_t>   buf_compute;
    std::vector<uint8_t>   buf_compute_layer;

+    ggml_type wtype; // weight type (FP32 or FP16)
+
    whisper_model model;
    whisper_vocab vocab;

@ -435,8 +437,7 @@ struct whisper_context {
 };

 template<typename T>
-static void read_safe(std::ifstream& fin, T& dest)
-{
+static void read_safe(std::ifstream& fin, T& dest) {
    fin.read((char*)& dest, sizeof(T));
 }

@ -630,7 +631,9 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

    // for the big tensors, we have the option to store the data in 16-bit floats
    // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    const ggml_type wtype = wctx.wtype;

    size_t ctx_size = 0;

@ -651,7 +654,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

        // encoder
        {
-            // TODO: F16 .. maybe not?
            ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;

            ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype);         // e_conv_1_w
@ -666,7 +668,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

        // decoder
        {
-            // TODO: F16 .. maybe not?
            ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;

            ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
@ -983,8 +984,8 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            const int n_mem      = n_text_layer*n_text_ctx;
            const int n_elements = n_text_state*n_mem;

-            model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-            model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+            model.memory_k = ggml_new_tensor_1d(ctx, wtype, n_elements);
+            model.memory_v = ggml_new_tensor_1d(ctx, wtype, n_elements);
        }

        // key/value memory for the cross-attention layer
@ -994,8 +995,8 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            const int n_mem      = n_text_layer*n_audio_ctx;
            const int n_elements = n_text_state*n_mem;

-            model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-            model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+            model.memory_cross_k = ggml_new_tensor_1d(ctx, wtype, n_elements);
+            model.memory_cross_v = ggml_new_tensor_1d(ctx, wtype, n_elements);
        }

        const size_t memory_size =
@ -1241,14 +1242,14 @@ static bool whisper_encode(
                ggml_permute(ctxL,
                        ggml_cpy(ctxL,
                            Qcur,
-                            ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+                            ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                        0, 2, 1, 3);

            struct ggml_tensor * K =
                ggml_permute(ctxL,
                        ggml_cpy(ctxL,
                            Kcur,
-                            ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+                            ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                        0, 2, 1, 3);

            struct ggml_tensor * V =
@ -1258,7 +1259,7 @@ static bool whisper_encode(
                                Vcur,
                                n_state/n_head, n_head, n_ctx),
                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_ctx, n_state/n_head, n_head)
+                        ggml_new_tensor_3d(ctxL, wctx.wtype, n_ctx, n_state/n_head, n_head)
                        );

            struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
@ -1274,7 +1275,7 @@ static bool whisper_encode(
                ggml_permute(ctxL,
                        ggml_cpy(ctxL,
                            Kcur,
-                            ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+                            ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                        0, 2, 1, 3);

            // K * Q
@ -1292,7 +1293,7 @@ static bool whisper_encode(
            //    ggml_permute(ctxL,
            //            ggml_cpy(ctxL,
            //                Vcur,
-            //                ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+            //                ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
            //            1, 2, 0, 3);

            //struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
@ -1304,7 +1305,7 @@ static bool whisper_encode(
                                Vcur,
                                n_state/n_head, n_head, n_ctx),
                            0, 2, 1, 3),
-                        ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_ctx, n_head)
+                        ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_ctx, n_head)
                        );

            struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
@ -1349,7 +1350,7 @@ static bool whisper_encode(

 #ifdef USE_FLASH_FF
            cur = ggml_flash_ff(ctxL,
-                    ggml_cpy(ctxL, cur, ggml_new_tensor_2d(ctxL, GGML_TYPE_F16, n_state, N)),
+                    ggml_cpy(ctxL, cur, ggml_new_tensor_2d(ctxL, wctx.wtype, n_state, N)),
                    layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
 #else
            // fully connected
@ -2473,12 +2474,12 @@ int whisper_lang_auto_detect(
    }

    {
-        for (int i = 0; i < (int) probs_id.size(); i++) {
+        for (const auto & prob : probs_id) {
            if (lang_probs) {
-                lang_probs[probs_id[i].second] = probs_id[i].first;
+                lang_probs[prob.second] = prob.first;
            }

-            //printf("%s: lang %2d (%3s): %f\n", __func__, probs_id[i].second, whisper_lang_str(probs_id[i].second), probs_id[i].first);
+            //printf("%s: lang %2d (%3s): %f\n", __func__, prob.second, whisper_lang_str(prob.second), prob.first);
        }
    }

@ -2582,6 +2583,7 @@ const char * whisper_print_system_info(void) {
    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
+    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";

    return s.c_str();
 }
@ -3158,7 +3160,7 @@ int whisper_full_parallel(

        // separate key + value memory for each processor
        {
-            auto & ctx = model.ctx_mem;
+            auto & mctx = model.ctx_mem;

            const auto & hparams = model.hparams;

@ -3171,8 +3173,8 @@ int whisper_full_parallel(
                const int n_mem      = n_text_layer*n_text_ctx;
                const int n_elements = n_text_state*n_mem;

-                model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-                model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+                model.memory_k = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
+                model.memory_v = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
            }

            // key/value memory for the cross-attention layer
@ -3182,8 +3184,8 @@ int whisper_full_parallel(
                const int n_mem      = n_text_layer*n_audio_ctx;
                const int n_elements = n_text_state*n_mem;

-                model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-                model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+                model.memory_cross_k = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
+                model.memory_cross_v = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
            }
        }
    }
@ -3227,17 +3229,17 @@ int whisper_full_parallel(
    for (int i = 0; i < n_processors - 1; ++i) {
        auto & results_i = ctxs[i].result_all;

-        for (int j = 0; j < (int) results_i.size(); ++j) {
+        for (auto & result : results_i) {
            // correct the segment timestamp taking into account the offset
-            results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
-            results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+            result.t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+            result.t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;

            // make sure that segments are not overlapping
            if (!ctx->result_all.empty()) {
-                results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
+                result.t0 = std::max(result.t0, ctx->result_all.back().t1);
            }

-            ctx->result_all.push_back(std::move(results_i[j]));
+            ctx->result_all.push_back(std::move(result));

            // call the new_segment_callback for each segment
            if (params.new_segment_callback) {
@ -3332,18 +3334,18 @@ static int64_t sample_to_timestamp(int i_sample) {
 static float voice_length(const std::string & text) {
    float res = 0.0f;

-    for (size_t i = 0; i < text.size(); ++i) {
-        if (text[i] == ' ') {
+    for (char c : text) {
+        if (c == ' ') {
            res += 0.01f;
-        } else if (text[i] == ',') {
+        } else if (c == ',') {
            res += 2.00f;
-        } else if (text[i] == '.') {
+        } else if (c == '.') {
            res += 3.00f;
-        } else if (text[i] == '!') {
+        } else if (c == '!') {
            res += 3.00f;
-        } else if (text[i] == '?') {
+        } else if (c == '?') {
            res += 3.00f;
-        } else if (text[i] >= '0' && text[i] <= '9') {
+        } else if (c >= '0' && c <= '9') {
            res += 3.00f;
        } else {
            res += 1.00f;