examples : refactor in order to reuse code and reduce duplication (#482)

* examples : refactor common code into a library * examples : refactor common SDL code into a library * make : update Makefile to use common libs * common : fix MSVC M_PI .. * addon.node : link common lib
2 years ago · 09d7d2b68e
parent 0336161b7d
commit 09d7d2b68e
19 changed files with 580 additions and 1254 deletions
--- a/19
+++ b/19
@ -197,18 +197,21 @@ clean:
 CC_SDL=`sdl2-config --cflags --libs`
-main: examples/main/main.cpp ggml.o whisper.o
+SRC_COMMON = examples/common.cpp
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
+SRC_COMMON_SDL = examples/common-sdl.cpp
 main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h
-stream: examples/stream/stream.cpp ggml.o whisper.o
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
-command: examples/command/command.cpp ggml.o whisper.o
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
-talk: examples/talk/talk.cpp  examples/talk/gpt-2.cpp ggml.o whisper.o
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,6 +14,37 @@ if (WHISPER_SUPPORT_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()
 # common
 # Static helper library built from common.h / common.cpp.
 set(TARGET common)
 add_library(${TARGET} STATIC
    common.h
    common.cpp
    )
 # DefaultTargetOptions is defined elsewhere in the project; it is included
 # after set(TARGET ...) for every target here, so it presumably configures ${TARGET}.
 include(DefaultTargetOptions)
 # Export this directory as a usage requirement so that anything linking
 # `common` can find common.h without depending on a directory-scoped
 # include_directories() call.
 target_include_directories(${TARGET} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
 # Build position-independent objects (allows the archive to be linked into
 # shared objects as well as executables).
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if (WHISPER_SUPPORT_SDL2)
    # common-sdl: SDL2-based helpers (common-sdl.h / common-sdl.cpp), only
    # built when SDL2 support is enabled
    set(TARGET common-sdl)
    add_library(${TARGET} STATIC
        common-sdl.h
        common-sdl.cpp
        )
    include(DefaultTargetOptions)
    # PUBLIC: common-sdl.h includes <SDL.h>, so consumers need the SDL2 include
    # path too; the current directory is exported so they can find common-sdl.h
    # without a directory-scoped include_directories() call.
    target_include_directories(${TARGET} PUBLIC
        "${CMAKE_CURRENT_SOURCE_DIR}"
        ${SDL2_INCLUDE_DIRS}
        )
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 # examples
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
--- a/examples/addon.node/CMakeLists.txt
+++ b/examples/addon.node/CMakeLists.txt
@ -23,7 +23,7 @@ string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
 target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
 #==================================================================
-target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} ${CMAKE_JS_LIB} common whisper ${CMAKE_THREAD_LIBS_INIT})
 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
    # Generate node.lib
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -1,15 +1,13 @@
-#include <cstdint>
+#include "napi.h"
 #include "common.h"
 #include "whisper.h"
 #include <string>
 #include <thread>
 #include <vector>
 #include <cmath>
-
+#include <cstdint>
 #include "napi.h"
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
 #include "whisper.h"
 struct whisper_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
@ -44,7 +42,7 @@ struct whisper_params {
    std::string model    = "../../ggml-large.bin";
    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_outp = {};
+    std::vector<std::string> fname_out = {};
 };
 struct whisper_print_user_data {
@ -143,7 +141,6 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
 }
 int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        return 2;
@ -181,91 +178,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-        const auto fname_outp = f < (int)params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
+        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-        // WAV input
+        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-        {
+            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            drwav wav;
+            continue;
            std::vector<uint8_t> wav_data; // used for pipe input from stdin
            if (fname_inp == "-") {
                {
                    uint8_t buf[1024];
                    while (true)
                    {
                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
                        if (n == 0) {
                            break;
                        }
                        wav_data.insert(wav_data.end(), buf, buf + n);
                    }
                }
                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
                    return 4;
                }
                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
            }
            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
                return 5;
            }
           if (wav.channels != 1 && wav.channels != 2) {
               fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
               return 6;
           }
           if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
               fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
               return 6;
           }
           if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
               fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
               return 8;
           }
           if (wav.bitsPerSample != 16) {
               fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str());
               return 9;
           }
            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
            std::vector<int16_t> pcm16;
            pcm16.resize(n*wav.channels);
            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
            drwav_uninit(&wav);
            // convert to mono, float
            pcmf32.resize(n);
            if (wav.channels == 1) {
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[i])/32768.0f;
                }
            } else {
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
                }
            }
            if (params.diarize) {
                // convert to stereo, float
                pcmf32s.resize(2);
                pcmf32s[0].resize(n);
                pcmf32s[1].resize(n);
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
                }
            }
        }
        // print system information
--- a/examples/command.wasm/CMakeLists.txt
+++ b/examples/command.wasm/CMakeLists.txt
@ -11,6 +11,7 @@ add_executable(${TARGET}
 include(DefaultTargetOptions)
 target_link_libraries(${TARGET} PRIVATE
    common
    whisper
    )
--- a/examples/command.wasm/emscripten.cpp
+++ b/examples/command.wasm/emscripten.cpp
@ -1,4 +1,5 @@
 #include "ggml.h"
 #include "common.h"
 #include "whisper.h"
 #include <emscripten.h>
@ -27,24 +28,6 @@ std::string g_transcribed   = "";
 std::vector<float> g_pcmf32;
 static std::string trim(const std::string & s) {
    std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
 }
 static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);
    float y = data[0];
    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
 }
 // compute similarity between two strings using Levenshtein distance
 static float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
@ -75,44 +58,6 @@ void command_set_status(const std::string & status) {
    g_status = status;
 }
 bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_samples      = pcmf32.size();
    const int n_samples_last = (sample_rate * last_ms) / 1000;
    if (n_samples_last >= n_samples) {
        // not enough samples - assume no speech
        return false;
    }
    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }
    float energy_all  = 0.0f;
    float energy_last = 0.0f;
    for (size_t i = 0; i < n_samples; i++) {
        energy_all += fabsf(pcmf32[i]);
        if (i >= n_samples - n_samples_last) {
            energy_last += fabsf(pcmf32[i]);
        }
    }
    energy_all  /= n_samples;
    energy_last /= n_samples_last;
    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }
    if (energy_last > vad_thold*energy_all) {
        return false;
    }
    return true;
 }
 std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();
@ -155,7 +100,7 @@ void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
    const int64_t n_samples = (ms * sample_rate) / 1000;
    int64_t n_take = 0;
-    if (g_pcmf32.size() < n_samples) {
+    if (n_samples > (int) g_pcmf32.size()) {
        n_take = g_pcmf32.size();
    } else {
        n_take = n_samples;
@ -187,7 +132,6 @@ void command_main(size_t index) {
    printf("command: using %d threads\n", wparams.n_threads);
    bool is_running   = true;
    bool have_prompt  = false;
    bool ask_prompt   = true;
    bool print_energy = false;
@ -233,7 +177,7 @@ void command_main(size_t index) {
        {
            command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
-            if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                command_set_status("Speech detected! Processing ...");
--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -5,6 +5,5 @@ if (WHISPER_SUPPORT_SDL2)
    include(DefaultTargetOptions)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -6,11 +6,10 @@
 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "whisper.h"
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <sstream>
 #include <cassert>
 #include <cstdio>
@ -110,309 +109,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }
 //
 // SDL Audio capture
 //
 class audio_async {
 public:
    audio_async(int len_ms);
    ~audio_async();
    bool init(int capture_id, int sample_rate);
    // start capturing audio via the provided SDL callback
    // keep last len_ms seconds of audio in a circular buffer
    bool resume();
    bool pause();
    bool clear();
    // callback to be called by SDL
    void callback(uint8_t * stream, int len);
    // get audio data from the circular buffer
    void get(int ms, std::vector<float> & audio);
 private:
    SDL_AudioDeviceID m_dev_id_in = 0;
    int m_len_ms = 0;
    int m_sample_rate = 0;
    bool       m_running = false;
    std::mutex m_mutex;
    std::vector<float> m_audio;
    std::vector<float> m_audio_new;
    size_t             m_audio_pos = 0;
    size_t             m_audio_len = 0;
 };
 audio_async::audio_async(int len_ms) {
    m_len_ms = len_ms;
 }
 audio_async::~audio_async() {
    if (m_dev_id_in) {
        SDL_CloseAudioDevice(m_dev_id_in);
    }
 }
 bool audio_async::init(int capture_id, int sample_rate) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
        return false;
    }
    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
    {
        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
        for (int i = 0; i < nDevices; i++) {
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
        }
    }
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;
    SDL_zero(capture_spec_requested);
    SDL_zero(capture_spec_obtained);
    capture_spec_requested.freq     = sample_rate;
    capture_spec_requested.format   = AUDIO_F32;
    capture_spec_requested.channels = 1;
    capture_spec_requested.samples  = 1024;
    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
        audio_async * audio = (audio_async *) userdata;
        audio->callback(stream, len);
    };
    capture_spec_requested.userdata = this;
    if (capture_id >= 0) {
        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    } else {
        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    }
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
        m_dev_id_in = 0;
        return false;
    } else {
        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
                capture_spec_requested.format);
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
                capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
    }
    m_sample_rate = capture_spec_obtained.freq;
    m_audio.resize((m_sample_rate*m_len_ms)/1000);
    return true;
 }
 bool audio_async::resume() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }
    if (m_running) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 0);
    m_running = true;
    return true;
 }
 bool audio_async::pause() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 1);
    m_running = false;
    return true;
 }
 bool audio_async::clear() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return false;
    }
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        m_audio_pos = 0;
        m_audio_len = 0;
    }
    return true;
 }
 // callback to be called by SDL
 void audio_async::callback(uint8_t * stream, int len) {
    if (!m_running) {
        return;
    }
    const size_t n_samples = len / sizeof(float);
    m_audio_new.resize(n_samples);
    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (m_audio_pos + n_samples > m_audio.size()) {
            const size_t n0 = m_audio.size() - m_audio_pos;
            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = m_audio.size();
        } else {
            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
        }
    }
 }
 void audio_async::get(int ms, std::vector<float> & result) {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
        return;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return;
    }
    result.clear();
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (ms <= 0) {
            ms = m_len_ms;
        }
        size_t n_samples = (m_sample_rate * ms) / 1000;
        if (n_samples > m_audio_len) {
            n_samples = m_audio_len;
        }
        result.resize(n_samples);
        int s0 = m_audio_pos - n_samples;
        if (s0 < 0) {
            s0 += m_audio.size();
        }
        if (s0 + n_samples > m_audio.size()) {
            const size_t n0 = m_audio.size() - s0;
            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
        } else {
            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
        }
    }
 }
 ///////////////////////////
 std::string trim(const std::string & s) {
    std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
 }
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);
    float y = data[0];
    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
 }
 bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_samples      = pcmf32.size();
    const int n_samples_last = (sample_rate * last_ms) / 1000;
    if (n_samples_last >= n_samples) {
        // not enough samples - assume no speech
        return false;
    }
    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }
    float energy_all  = 0.0f;
    float energy_last = 0.0f;
    for (int i = 0; i < n_samples; i++) {
        energy_all += fabsf(pcmf32[i]);
        if (i >= n_samples - n_samples_last) {
            energy_last += fabsf(pcmf32[i]);
        }
    }
    energy_all  /= n_samples;
    energy_last /= n_samples_last;
    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }
    if (energy_last > vad_thold*energy_all) {
        return false;
    }
    return true;
 }
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();
@ -502,7 +198,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
    std::string line;
    while (std::getline(ifs, line)) {
-        line = trim(line);
+        line = ::trim(line);
        if (line.empty()) {
            continue;
        }
@ -526,23 +222,6 @@ std::vector<std::string> get_words(const std::string &txt) {
    return words;
 }
 // returns true if no exit event was received
 bool process_sdl_events() {
    SDL_Event event;
    while (SDL_PollEvent(&event)) {
        switch (event.type) {
            case SDL_QUIT:
                {
                    return false;
                } break;
            default:
                break;
        }
    }
    return true;
 }
 // command-list mode
 // guide the transcription to match the most likely command from a provided list
 int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
@ -634,14 +313,14 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = process_sdl_events();
+        is_running = sdl_poll_events();
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        audio.get(2000, pcmf32_cur);
-        if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+        if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
            fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
            const auto t_start = std::chrono::high_resolution_clock::now();
@ -775,7 +454,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = process_sdl_events();
+        is_running = sdl_poll_events();
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -791,7 +470,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
        {
            audio.get(2000, pcmf32_cur);
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                int64_t t_ms = 0;
@ -854,7 +533,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = process_sdl_events();
+        is_running = sdl_poll_events();
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -870,7 +549,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
        {
            audio.get(2000, pcmf32_cur);
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                int64_t t_ms = 0;
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -0,0 +1,226 @@
 #include "common-sdl.h"
 // Store the requested buffer length (in milliseconds) and start in the
 // "not running" state; no SDL call is made here.
 audio_async::audio_async(int len_ms)
    : m_len_ms(len_ms)
    , m_running(false) {
 }
 // Close the capture device if one was opened; an id of 0 means none was.
 audio_async::~audio_async() {
    if (m_dev_id_in == 0) {
        return;
    }
    SDL_CloseAudioDevice(m_dev_id_in);
 }
 // Initialize SDL audio, open a capture device and size the circular buffer.
 //
 //   capture_id  - index of the capture device to open; a negative value
 //                 selects the default capture device
 //   sample_rate - sample rate requested from SDL
 //
 // Returns false if SDL cannot be initialized or no device can be opened.
 // On success the buffer is sized for m_len_ms milliseconds at the sample
 // rate SDL reports in the obtained spec. Capture does not start until
 // resume() is called (callback() ignores data while m_running is false).
 bool audio_async::init(int capture_id, int sample_rate) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
        return false;
    }
    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
    // list the available capture devices so the user can pick a capture_id
    {
        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
        for (int i = 0; i < nDevices; i++) {
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
        }
    }
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;
    SDL_zero(capture_spec_requested);
    SDL_zero(capture_spec_obtained);
    // request mono 32-bit float samples; callback() relies on the float format
    capture_spec_requested.freq     = sample_rate;
    capture_spec_requested.format   = AUDIO_F32;
    capture_spec_requested.channels = 1;
    capture_spec_requested.samples  = 1024;
    // captureless lambda used as the C callback; it forwards to the member
    // function through the `this` pointer stored in userdata below
    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
        audio_async * audio = (audio_async *) userdata;
        audio->callback(stream, len);
    };
    capture_spec_requested.userdata = this;
    // NOTE(review): the trailing 0 is SDL's allowed_changes argument - per the
    // SDL documentation this asks SDL to convert to the requested spec; confirm
    if (capture_id >= 0) {
        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    } else {
        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    }
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
        m_dev_id_in = 0;
        return false;
    } else {
        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
                capture_spec_requested.format);
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
                capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
    }
    // size the ring buffer to hold m_len_ms milliseconds at the obtained rate
    m_sample_rate = capture_spec_obtained.freq;
    m_audio.resize((m_sample_rate*m_len_ms)/1000);
    return true;
 }
 // Start (or restart) capture: un-pauses the SDL device and marks the object
 // as running so callback() begins storing samples.
 // Returns false, after logging to stderr, when no device is open or capture
 // is already running.
 bool audio_async::resume() {
    const bool have_device = (m_dev_id_in != 0);
    if (!have_device) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }
    if (m_running.load()) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 0);
    m_running.store(true);
    return true;
 }
 bool audio_async::pause() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 1);
    m_running = false;
    return true;
 }
 // Discard everything buffered so far by resetting the ring's write position
 // and fill level (the sample storage itself is left as is).
 // Returns false, after logging to stderr, when no device is open or capture
 // is not running.
 bool audio_async::clear() {
    if (m_dev_id_in == 0) {
        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
        return false;
    }
    if (!m_running.load()) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return false;
    }
    // m_mutex guards the ring state shared with callback() and get()
    std::lock_guard<std::mutex> guard(m_mutex);
    m_audio_len = 0;
    m_audio_pos = 0;
    return true;
 }
 // callback to be called by SDL
 //
 // Receives `len` bytes of captured audio in `stream`, interprets them as
 // float samples and appends them to the circular buffer m_audio, overwriting
 // the oldest samples once the buffer is full. Data is dropped while the
 // object is not running.
 void audio_async::callback(uint8_t * stream, int len) {
    if (!m_running) {
        return;
    }
    // nothing to store, or the ring buffer has no capacity (would otherwise
    // lead to a modulo by zero below)
    if (stream == nullptr || len <= 0 || m_audio.empty()) {
        return;
    }
    size_t n_samples = len / sizeof(float);
    // a chunk larger than the whole ring would write past the end of m_audio;
    // keep only the newest samples that fit
    if (n_samples > m_audio.size()) {
        stream    += (n_samples - m_audio.size()) * sizeof(float);
        n_samples  = m_audio.size();
    }
    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (m_audio_pos + n_samples > m_audio.size()) {
            // the chunk wraps: fill up to the end of the ring, then continue at the front
            const size_t n0 = m_audio.size() - m_audio_pos;
            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
            // `stream` is a byte pointer, so sample n0 starts n0*sizeof(float) bytes in
            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = m_audio.size();
        } else {
            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
        }
    }
 }
 // Copy the most recent `ms` milliseconds of captured audio into `result`,
 // oldest sample first. ms <= 0 requests the whole m_len_ms window. Fewer
 // samples are returned when the buffer currently holds less than requested.
 // When no device is open or capture is not running, an error is logged to
 // stderr and `result` is left untouched.
 void audio_async::get(int ms, std::vector<float> & result) {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
        return;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return;
    }
    result.clear();
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (ms <= 0) {
            ms = m_len_ms;
        }
        // 64-bit arithmetic: sample_rate * ms can exceed the range of int
        const int64_t n_wanted = ((int64_t) m_sample_rate * ms) / 1000;
        // never hand out more than is buffered; a negative request is treated
        // like an oversized one (it ends up clamped to the buffered amount)
        size_t n_samples = m_audio_len;
        if (n_wanted >= 0 && (uint64_t) n_wanted < m_audio_len) {
            n_samples = (size_t) n_wanted;
        }
        if (n_samples == 0) {
            // nothing to copy; also avoids indexing into an empty ring below
            return;
        }
        result.resize(n_samples);
        const size_t n_ring = m_audio.size();
        // start of the requested window, wrapped into [0, n_ring) while staying
        // in size_t (n_samples <= m_audio_len <= n_ring, so this cannot underflow)
        const size_t s0 = (m_audio_pos + n_ring - n_samples) % n_ring;
        if (s0 + n_samples > n_ring) {
            // window wraps around the end of the ring: copy it in two pieces
            const size_t n0 = n_ring - s0;
            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
        } else {
            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
        }
    }
 }
 bool sdl_poll_events() {
    SDL_Event event;
    while (SDL_PollEvent(&event)) {
        switch (event.type) {
            case SDL_QUIT:
                {
                    return false;
                } break;
            default:
                break;
        }
    }
    return true;
 }
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -0,0 +1,50 @@
 #pragma once
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <atomic>
 #include <cstdint>
 #include <vector>
 #include <mutex>
 //
 // SDL Audio capture
 //
 // Captures audio from an SDL input device into a fixed-size circular buffer
 // of float samples. The buffer state (m_audio, m_audio_pos, m_audio_len) is
 // accessed under m_mutex by callback() and get().
 class audio_async {
 public:
    // len_ms: length of the circular buffer in milliseconds
    audio_async(int len_ms);
    ~audio_async();
    // open a capture device; capture_id selects the device and sample_rate is the requested rate
    bool init(int capture_id, int sample_rate);
    // start capturing audio via the provided SDL callback
    // keep last len_ms milliseconds of audio in a circular buffer
    bool resume();
    bool pause();
    bool clear();
    // callback to be called by SDL
    // stream/len: raw sample bytes delivered by SDL and their length in bytes
    void callback(uint8_t * stream, int len);
    // get audio data from the circular buffer
    // ms <= 0 requests the whole buffer; fewer samples are returned if less has been captured
    void get(int ms, std::vector<float> & audio);
 private:
    SDL_AudioDeviceID m_dev_id_in = 0; // 0 means no capture device is open
    int m_len_ms = 0;                  // circular buffer length in milliseconds
    int m_sample_rate = 0;             // used by get() to convert milliseconds to a sample count
    std::atomic_bool m_running;        // checked by get() before reading the buffer
    std::mutex       m_mutex;          // guards m_audio, m_audio_pos and m_audio_len
    std::vector<float> m_audio;        // circular sample buffer
    std::vector<float> m_audio_new;    // presumably a scratch copy of the latest callback chunk - confirm in common-sdl.cpp
    size_t             m_audio_pos = 0; // next write index into m_audio
    size_t             m_audio_len = 0; // number of valid samples currently in m_audio
 };
 // Return false if need to quit
 bool sdl_poll_events();
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -0,0 +1,162 @@
 #include "common.h"
 // third-party utilities
 // use your favorite implementations
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
 #include <cmath>
 #include <regex>
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
 #endif
 // Return a copy of `s` with leading and trailing whitespace removed.
 // Whitespace inside the string is preserved.
 std::string trim(const std::string & s) {
    // compiling a std::regex is expensive: build it once and reuse it
    // (function-local statics are initialized thread-safely since C++11)
    static const std::regex re_edges("^\\s+|\\s+$");
    return std::regex_replace(s, re_edges, "");
 }
 // Return a copy of `s` in which every occurrence of `from` is replaced by `to`.
 // The search resumes after the inserted text, so occurrences of `from` that
 // appear inside `to` are not replaced again.
 // An empty `from` matches nothing: `s` is returned unchanged.
 std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;
    if (from.empty()) {
        // find("") succeeds at every position, which would loop forever
        return result;
    }
    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length();
    }
    return result;
 }
 // Read a WAV file and convert it to float PCM.
 //
 //   fname   - path of the WAV file, or "-" to read the whole file from stdin
 //   pcmf32  - receives the mono signal (stereo input is averaged), scaled to [-1, 1)
 //   pcmf32s - when `stereo` is set, receives the two channels separately
 //   stereo  - require a 2-channel file and fill pcmf32s
 //
 // The file must be mono or stereo, 16-bit, sampled at COMMON_SAMPLE_RATE.
 // Returns false (after printing a message to stderr) on any failure.
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin
    if (fname == "-") {
        {
            uint8_t buf[1024];
            while (true)
            {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }
        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }
        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
    }
    // from here on `wav` is initialized: every early return has to release it
    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }
    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }
    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
        drwav_uninit(&wav);
        return false;
    }
    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }
    // remember the channel count so `wav` is not read after drwav_uninit()
    const auto n_channels = wav.channels;
    // number of PCM frames (one frame = one sample per channel)
    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
    std::vector<int16_t> pcm16;
    pcm16.resize(n*n_channels);
    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
    drwav_uninit(&wav);
    // convert to mono, float
    pcmf32.resize(n);
    if (n_channels == 1) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        // average left and right: (l + r) / 2 / 32768
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }
    if (stereo) {
        // convert to stereo, float
        pcmf32s.resize(2);
        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }
    return true;
 }
 // Filter `data` in place with a first-order recurrence derived from `cutoff`
 // and `sample_rate`. The first sample is left unchanged; empty input is a no-op.
 //
 // NOTE(review): data[i - 1] is read after it has already been overwritten with
 // the previous output, so each output reduces to alpha * data[i]. This is kept
 // as-is because callers' thresholds are tuned against it - confirm the intent.
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        // data[0] below would read past the end of an empty vector
        return;
    }
    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);
    float y = data[0];
    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
 }
 // Energy-based voice activity check.
 // Compares the mean absolute amplitude of the last `last_ms` milliseconds with
 // the mean over the whole buffer. Returns true when the tail is NOT louder than
 // vad_thold times the overall mean, and false otherwise or when the buffer is
 // not longer than the tail window.
 // When freq_thold > 0 the buffer is first high-pass filtered in place.
 bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_total = pcmf32.size();
    const int n_tail  = (sample_rate * last_ms) / 1000;
    // not enough samples - assume no speech
    if (n_tail >= n_total) {
        return false;
    }
    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }
    // first index that belongs to the tail window
    const int tail_begin = n_total - n_tail;
    float sum_all  = 0.0f;
    float sum_tail = 0.0f;
    for (int k = 0; k < n_total; ++k) {
        sum_all += fabsf(pcmf32[k]);
        if (k >= tail_begin) {
            sum_tail += fabsf(pcmf32[k]);
        }
    }
    const float mean_all  = sum_all  / n_total;
    const float mean_tail = sum_tail / n_tail;
    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, mean_all, mean_tail, vad_thold, freq_thold);
    }
    // written as a negated '>' so a NaN mean still yields true, as before
    return !(mean_tail > vad_thold*mean_all);
 }
--- a/examples/common.h
+++ b/examples/common.h
@ -0,0 +1,40 @@
 #pragma once
 // needs to match WHISPER_SAMPLE_RATE
 #define COMMON_SAMPLE_RATE 16000
 #include <vector>
 #include <string>
 // Return a copy of s with leading and trailing whitespace removed
 std::string trim(const std::string & s);
 // Return a copy of s with every occurrence of `from` replaced by `to`
 // Text inserted by a replacement is not scanned again
 std::string replace(
        const std::string & s,
        const std::string & from,
        const std::string & to);
 // Read WAV audio file and store the PCM data into pcmf32
 // Pass "-" as fname to read the WAV data from stdin
 // The file must be 16-bit, mono or stereo
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If the stereo flag is set the audio must have 2 channels; pcmf32s will then contain the 2 channel PCM
 // Returns false if the file cannot be opened or does not meet these requirements
 bool read_wav(
        const std::string & fname,
        std::vector<float> & pcmf32,
        std::vector<std::vector<float>> & pcmf32s,
        bool stereo);
 // Apply a high-pass frequency filter to PCM audio
 // Suppresses frequencies below cutoff Hz
 // The samples in data are modified in place
 void high_pass_filter(
        std::vector<float> & data,
        float cutoff,
        float sample_rate);
 // Basic voice activity detection (VAD) using audio energy adaptive threshold
 // Compares the mean energy of the last last_ms milliseconds with the mean energy of the whole buffer
 // Returns true when the last part is not louder than vad_thold times the overall mean
 // Returns false when pcmf32 is not longer than last_ms
 // When freq_thold > 0, pcmf32 is high-pass filtered in place first
 bool vad_simple(
        std::vector<float> & pcmf32,
        int   sample_rate,
        int   last_ms,
        float vad_thold,
        float freq_thold,
        bool  verbose);
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)
 include(DefaultTargetOptions)
-target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -1,9 +1,6 @@
-#include "whisper.h"
+#include "common.h"
-// third-party utilities
+#include "whisper.h"
 // use your favorite implementations
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
 #include <cmath>
 #include <fstream>
@ -86,7 +83,7 @@ struct whisper_params {
    std::string model    = "models/ggml-base.en.bin";
    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_outp = {};
+    std::vector<std::string> fname_out = {};
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -126,7 +123,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
-        else if (arg == "-of"   || arg == "--output-file")    { params.fname_outp.emplace_back(argv[++i]); }
+        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
        else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
@ -520,91 +517,14 @@ int main(int argc, char ** argv) {
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-		const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
+		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
-        std::vector<float> pcmf32; // mono-channel F32 PCM
+        std::vector<float> pcmf32;               // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-        // WAV input
+        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-        {
+            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            drwav wav;
+            continue;
            std::vector<uint8_t> wav_data; // used for pipe input from stdin
            if (fname_inp == "-") {
                {
                    uint8_t buf[1024];
                    while (true)
                    {
                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
                        if (n == 0) {
                            break;
                        }
                        wav_data.insert(wav_data.end(), buf, buf + n);
                    }
                }
                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
                    return 4;
                }
                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
            }
            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
                return 5;
            }
            if (wav.channels != 1 && wav.channels != 2) {
                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
                return 6;
            }
            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
                return 6;
            }
            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
                fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
                return 8;
            }
            if (wav.bitsPerSample != 16) {
                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
                return 9;
            }
            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
            std::vector<int16_t> pcm16;
            pcm16.resize(n*wav.channels);
            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
            drwav_uninit(&wav);
            // convert to mono, float
            pcmf32.resize(n);
            if (wav.channels == 1) {
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[i])/32768.0f;
                }
            } else {
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
                }
            }
            if (params.diarize) {
                // convert to stereo, float
                pcmf32s.resize(2);
                pcmf32s[0].resize(n);
                pcmf32s[1].resize(n);
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
                }
            }
        }
        // print system information
@ -701,34 +621,33 @@ int main(int argc, char ** argv) {
            // output to text file
            if (params.output_txt) {
-                const auto fname_txt = fname_outp + ".txt";
+                const auto fname_txt = fname_out + ".txt";
                output_txt(ctx, fname_txt.c_str());
            }
            // output to VTT file
            if (params.output_vtt) {
-                const auto fname_vtt = fname_outp + ".vtt";
+                const auto fname_vtt = fname_out + ".vtt";
                output_vtt(ctx, fname_vtt.c_str());
            }
            // output to SRT file
            if (params.output_srt) {
-                const auto fname_srt = fname_outp + ".srt";
+                const auto fname_srt = fname_out + ".srt";
                output_srt(ctx, fname_srt.c_str(), params);
            }
            // output to WTS file
            if (params.output_wts) {
-                const auto fname_wts = fname_outp + ".wts";
+                const auto fname_wts = fname_out + ".wts";
                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }
-	    // output to CSV file
+            // output to CSV file
            if (params.output_csv) {
-                const auto fname_csv = fname_outp + ".csv";
+                const auto fname_csv = fname_out + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }
        }
    }
--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@ -5,6 +5,5 @@ if (WHISPER_SUPPORT_SDL2)
    include(DefaultTargetOptions)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -3,19 +3,16 @@
 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "whisper.h"
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <atomic>
 #include <cassert>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
 #include <fstream>
 #include <mutex>
 //  500 -> 00:05.000
 // 6000 -> 01:00.000
@ -116,306 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }
 //
 // SDL Audio capture
 //
 // Captures audio from an SDL input device into a fixed-size circular buffer
 // of float samples. The buffer state (m_audio, m_audio_pos, m_audio_len) is
 // accessed under m_mutex by clear(), callback() and get().
 class audio_async {
 public:
    // len_ms: length of the circular buffer in milliseconds
    audio_async(int len_ms);
    ~audio_async();
    // open a capture device; capture_id < 0 selects the default device
    bool init(int capture_id, int sample_rate);
    // start capturing audio via the provided SDL callback
    // keep last len_ms milliseconds of audio in a circular buffer
    bool resume();
    bool pause();
    bool clear();
    // callback to be called by SDL
    // stream/len: raw sample bytes delivered by SDL and their length in bytes
    void callback(uint8_t * stream, int len);
    // get audio data from the circular buffer
    // ms <= 0 requests the whole buffer; fewer samples are returned if less has been captured
    void get(int ms, std::vector<float> & audio);
 private:
    SDL_AudioDeviceID m_dev_id_in = 0; // 0 means no capture device is open
    int m_len_ms = 0;                  // circular buffer length in milliseconds
    int m_sample_rate = 0;             // rate obtained from SDL in init()
    std::atomic_bool m_running;        // true between resume() and pause()
    std::mutex       m_mutex;          // guards m_audio, m_audio_pos and m_audio_len
    std::vector<float> m_audio;        // circular sample buffer
    std::vector<float> m_audio_new;    // copy of the most recent chunk passed to callback()
    size_t             m_audio_pos = 0; // next write index into m_audio
    size_t             m_audio_len = 0; // number of valid samples currently in m_audio
 };
 // Remember the requested buffer length (milliseconds) and start in the
 // not-running state; the device itself is opened later by init().
 audio_async::audio_async(int len_ms)
    : m_len_ms(len_ms),
      m_running(false) {
 }
 // Close the capture device if one was opened.
 audio_async::~audio_async() {
    if (m_dev_id_in == 0) {
        return;
    }
    SDL_CloseAudioDevice(m_dev_id_in);
 }
 // Initialize SDL audio and open a capture device.
 //
 //   capture_id  - index of the capture device to open (as listed on stderr);
 //                 a negative value selects the default capture device
 //   sample_rate - requested sample rate; the rate actually obtained is stored in m_sample_rate
 //
 // Returns false if SDL cannot be initialized or the device cannot be opened.
 // On success the circular buffer is sized to hold m_len_ms milliseconds of audio.
 bool audio_async::init(int capture_id, int sample_rate) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
        return false;
    }
    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
    // list the available capture devices on stderr
    {
        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
        for (int i = 0; i < nDevices; i++) {
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
        }
    }
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;
    SDL_zero(capture_spec_requested);
    SDL_zero(capture_spec_obtained);
    // request a single channel of float samples (callback() divides len by sizeof(float))
    capture_spec_requested.freq     = sample_rate;
    capture_spec_requested.format   = AUDIO_F32;
    capture_spec_requested.channels = 1;
    capture_spec_requested.samples  = 1024;
    // forward SDL's C-style callback to the member function; `this` travels via userdata
    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
        audio_async * audio = (audio_async *) userdata;
        audio->callback(stream, len);
    };
    capture_spec_requested.userdata = this;
    if (capture_id >= 0) {
        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    } else {
        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    }
    // a device id of 0 means the open failed
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
        m_dev_id_in = 0;
        return false;
    } else {
        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
                capture_spec_requested.format);
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
                capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
    }
    // size the circular buffer for m_len_ms milliseconds at the obtained rate
    m_sample_rate = capture_spec_obtained.freq;
    m_audio.resize((m_sample_rate*m_len_ms)/1000);
    return true;
 }
 bool audio_async::resume() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }
    if (m_running) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 0);
    m_running = true;
    return true;
 }
 bool audio_async::pause() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 1);
    m_running = false;
    return true;
 }
 bool audio_async::clear() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return false;
    }
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        m_audio_pos = 0;
        m_audio_len = 0;
    }
    return true;
 }
 // callback to be called by SDL
 //
 //   stream - raw bytes of the newly captured float samples
 //   len    - length of `stream` in bytes
 //
 // Appends the new samples to the circular buffer, overwriting the oldest
 // ones. Does nothing while capture is not running.
 void audio_async::callback(uint8_t * stream, int len) {
    if (!m_running) {
        return;
    }
    if (len <= 0) {
        return;
    }
    size_t n_samples = len / sizeof(float);
    m_audio_new.resize(n_samples);
    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        const size_t n_buf = m_audio.size();
        if (n_buf == 0 || n_samples == 0) {
            // nothing to store, and the modulo below must not divide by zero
            return;
        }
        // read from the float copy so that offsets are counted in samples,
        // not in bytes as they would be when indexing the uint8_t stream
        const float * src = m_audio_new.data();
        if (n_samples > n_buf) {
            // chunk larger than the whole ring: keep only the newest n_buf samples
            src      += n_samples - n_buf;
            n_samples = n_buf;
        }
        if (m_audio_pos + n_samples > n_buf) {
            // the chunk wraps around the end of the ring: fill the tail, then the head
            const size_t n0 = n_buf - m_audio_pos;
            memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
            memcpy(&m_audio[0], src + n0, (n_samples - n0) * sizeof(float));
            m_audio_len = n_buf;
        } else {
            memcpy(&m_audio[m_audio_pos], src, n_samples * sizeof(float));
            m_audio_len = std::min(m_audio_len + n_samples, n_buf);
        }
        m_audio_pos = (m_audio_pos + n_samples) % n_buf;
    }
 }
 // Copy the most recent `ms` milliseconds of captured audio into `result`.
 //
 //   ms     - window length in milliseconds; ms <= 0 requests the whole buffer (m_len_ms)
 //   result - receives the samples, oldest first; it may hold fewer samples than
 //            requested when less audio has been captured so far
 //
 // If no device is open or capture is not running, a message is printed to
 // stderr and `result` is left untouched.
 void audio_async::get(int ms, std::vector<float> & result) {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
        return;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return;
    }
    result.clear();
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (ms <= 0) {
            ms = m_len_ms;
        }
        // multiply in 64 bits: sample_rate * ms does not fit in an int for long windows
        const long long n_requested = ((long long) m_sample_rate * ms) / 1000;
        const size_t n_buf = m_audio.size();
        size_t n_samples = n_requested > 0 ? (size_t) n_requested : 0;
        if (n_samples > m_audio_len) {
            n_samples = m_audio_len;
        }
        if (n_samples > n_buf) {
            // defensive: never read more than the ring actually holds
            n_samples = n_buf;
        }
        if (n_samples == 0) {
            // nothing captured yet (or empty ring): do not index m_audio / result
            return;
        }
        result.resize(n_samples);
        // index of the oldest requested sample; adding n_buf first keeps the
        // subtraction in unsigned arithmetic without wrapping below zero
        const size_t s0 = (m_audio_pos + n_buf - n_samples) % n_buf;
        if (s0 + n_samples > n_buf) {
            // the window wraps around the end of the ring: copy tail, then head
            const size_t n0 = n_buf - s0;
            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
        } else {
            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
        }
    }
 }
 ///////////////////////////
 // Filter `data` in place with a first-order recurrence derived from `cutoff`
 // and `sample_rate`. The first sample is left unchanged; empty input is a no-op.
 //
 // NOTE(review): data[i - 1] is read after it has already been overwritten with
 // the previous output, so each output reduces to alpha * data[i]. This is kept
 // as-is because callers' thresholds are tuned against it - confirm the intent.
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        // data[0] below would read past the end of an empty vector
        return;
    }
    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);
    float y = data[0];
    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
 }
 // Energy-based voice activity check.
 // Compares the mean absolute amplitude of the last `last_ms` milliseconds with
 // the mean over the whole buffer. Returns true when the tail is NOT louder than
 // vad_thold times the overall mean, and false otherwise or when the buffer is
 // not longer than the tail window.
 // When freq_thold > 0 the buffer is first high-pass filtered in place.
 bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_total = pcmf32.size();
    const int n_tail  = (sample_rate * last_ms) / 1000;
    // not enough samples - assume no speech
    if (n_tail >= n_total) {
        return false;
    }
    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }
    // first index that belongs to the tail window
    const int tail_begin = n_total - n_tail;
    float sum_all  = 0.0f;
    float sum_tail = 0.0f;
    for (int k = 0; k < n_total; ++k) {
        sum_all += fabsf(pcmf32[k]);
        if (k >= tail_begin) {
            sum_tail += fabsf(pcmf32[k]);
        }
    }
    const float mean_all  = sum_all  / n_total;
    const float mean_tail = sum_tail / n_tail;
    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, mean_all, mean_tail, vad_thold, freq_thold);
    }
    // written as a negated '>' so a NaN mean still yields true, as before
    return !(mean_tail > vad_thold*mean_all);
 }
 int main(int argc, char ** argv) {
    whisper_params params;
@ -426,10 +123,10 @@ int main(int argc, char ** argv) {
    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);
-    const int n_samples_step = (params.step_ms  *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_len  = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
-    const int n_samples_keep = (params.keep_ms  *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_30s  = (30000           *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
@ -517,23 +214,7 @@ int main(int argc, char ** argv) {
    // main audio loop
    while (is_running) {
        // handle Ctrl + C
-        {
+        is_running = sdl_poll_events();
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
                switch (event.type) {
                    case SDL_QUIT:
                        {
                            is_running = false;
                        } break;
                    default:
                        break;
                }
            }
            if (!is_running) {
                break;
            }
        }
        if (!is_running) {
            break;
@ -556,7 +237,7 @@ int main(int argc, char ** argv) {
                    break;
                }
-                SDL_Delay(1);
+                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
            const int n_samples_new = pcmf32_new.size();
@ -587,7 +268,7 @@ int main(int argc, char ** argv) {
            audio.get(2000, pcmf32_new);
-            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
+            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                audio.get(params.length_ms, pcmf32);
            } else {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@ -7,7 +7,7 @@ if (WHISPER_SUPPORT_SDL2)
    # TODO: this is temporary
    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
+    add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
    include(DefaultTargetOptions)
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -1,16 +1,14 @@
 // Talk with AI
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "whisper.h"
 #include "gpt-2.h"
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <cassert>
 #include <cstdio>
 #include <fstream>
 #include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@ -105,320 +103,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }
 //
 // SDL Audio capture
 //
 // Captures audio from an SDL input device into a fixed-size circular buffer
 // of float samples. The buffer state (m_audio, m_audio_pos, m_audio_len) is
 // accessed under m_mutex by clear() and callback().
 class audio_async {
 public:
    // len_ms: length of the circular buffer in milliseconds
    audio_async(int len_ms);
    ~audio_async();
    // open a capture device; capture_id < 0 selects the default device
    bool init(int capture_id, int sample_rate);
    // start capturing audio via the provided SDL callback
    // keep last len_ms milliseconds of audio in a circular buffer
    bool resume();
    bool pause();
    bool clear();
    // callback to be called by SDL
    // stream/len: raw sample bytes delivered by SDL and their length in bytes
    void callback(uint8_t * stream, int len);
    // get audio data from the circular buffer
    void get(int ms, std::vector<float> & audio);
 private:
    SDL_AudioDeviceID m_dev_id_in = 0; // 0 means no capture device is open
    int m_len_ms = 0;                  // circular buffer length in milliseconds
    int m_sample_rate = 0;             // rate obtained from SDL in init()
    // true between resume() and pause()
    // NOTE(review): plain bool that callback() also reads; presumably SDL invokes the callback on its own thread - confirm whether this should be atomic
    bool       m_running = false;
    std::mutex m_mutex;                // guards m_audio, m_audio_pos and m_audio_len
    std::vector<float> m_audio;        // circular sample buffer
    std::vector<float> m_audio_new;    // copy of the most recent chunk passed to callback()
    size_t             m_audio_pos = 0; // next write index into m_audio
    size_t             m_audio_len = 0; // number of valid samples currently in m_audio
 };
 // Remember the requested buffer length (milliseconds); the device itself is
 // opened later by init().
 audio_async::audio_async(int len_ms)
    : m_len_ms(len_ms) {
 }
 // Close the capture device if one was opened.
 audio_async::~audio_async() {
    if (m_dev_id_in == 0) {
        return;
    }
    SDL_CloseAudioDevice(m_dev_id_in);
 }
 // Initialize SDL audio and open a capture device.
 //
 //   capture_id  - index of the capture device to open (as listed on stderr);
 //                 a negative value selects the default capture device
 //   sample_rate - requested sample rate; the rate actually obtained is stored in m_sample_rate
 //
 // Returns false if SDL cannot be initialized or the device cannot be opened.
 // On success the circular buffer is sized to hold m_len_ms milliseconds of audio.
 bool audio_async::init(int capture_id, int sample_rate) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
        return false;
    }
    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
    // list the available capture devices on stderr
    {
        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
        for (int i = 0; i < nDevices; i++) {
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
        }
    }
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;
    SDL_zero(capture_spec_requested);
    SDL_zero(capture_spec_obtained);
    // request a single channel of float samples (callback() divides len by sizeof(float))
    capture_spec_requested.freq     = sample_rate;
    capture_spec_requested.format   = AUDIO_F32;
    capture_spec_requested.channels = 1;
    capture_spec_requested.samples  = 1024;
    // forward SDL's C-style callback to the member function; `this` travels via userdata
    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
        audio_async * audio = (audio_async *) userdata;
        audio->callback(stream, len);
    };
    capture_spec_requested.userdata = this;
    if (capture_id >= 0) {
        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    } else {
        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    }
    // a device id of 0 means the open failed
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
        m_dev_id_in = 0;
        return false;
    } else {
        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
                capture_spec_requested.format);
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
                capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
        fprintf(stderr, "\n");
    }
    // size the circular buffer for m_len_ms milliseconds at the obtained rate
    m_sample_rate = capture_spec_obtained.freq;
    m_audio.resize((m_sample_rate*m_len_ms)/1000);
    return true;
 }
 bool audio_async::resume() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }
    if (m_running) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 0);
    m_running = true;
    return true;
 }
 bool audio_async::pause() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 1);
    m_running = false;
    return true;
 }
 // Discard all buffered audio by resetting the ring-buffer write position and
 // fill level (the sample storage itself is left untouched).
 // Returns false (after logging to stderr) when no device is open or when
 // capture is not running; returns true otherwise.
 bool audio_async::clear() {
    bool cleared = false;

    if (m_dev_id_in == 0) {
        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
    } else if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
    } else {
        // the ring-buffer bookkeeping is shared with callback()/get(), so reset it under the mutex
        std::lock_guard<std::mutex> guard(m_mutex);

        m_audio_len = 0;
        m_audio_pos = 0;

        cleared = true;
    }

    return cleared;
 }
 // callback to be called by SDL
 //
 // Appends the `len` bytes of captured audio in `stream` (interpreted as
 // 32-bit float samples) to the ring buffer m_audio, overwriting the oldest
 // samples once the buffer is full. Does nothing while capture is not running.
 //
 //   stream - raw capture bytes; note this is a BYTE pointer, so every offset
 //            into it must be scaled by sizeof(float)
 //   len    - number of valid bytes in `stream`
 void audio_async::callback(uint8_t * stream, int len) {
    if (!m_running) {
        return;
    }

    const size_t n_samples = len > 0 ? len / sizeof(float) : 0;

    m_audio_new.resize(n_samples);
    if (n_samples == 0) {
        return;
    }
    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));

    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);

    {
        std::lock_guard<std::mutex> lock(m_mutex);

        const size_t capacity = m_audio.size();
        if (capacity == 0) {
            // no ring buffer allocated - nothing to store into (also avoids % 0 below)
            return;
        }

        // if the chunk is larger than the whole ring buffer, keep only its most recent samples
        const uint8_t * src    = stream;
        size_t          n_copy = n_samples;
        if (n_copy > capacity) {
            src   += (n_copy - capacity) * sizeof(float);
            n_copy = capacity;
        }

        if (m_audio_pos + n_copy > capacity) {
            // the chunk wraps: fill up to the end of the buffer, then continue from the start
            const size_t n0 = capacity - m_audio_pos;

            memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
            // advance the byte pointer by n0 FLOATS (n0 * sizeof(float) bytes), not by n0 bytes
            memcpy(&m_audio[0], src + n0 * sizeof(float), (n_copy - n0) * sizeof(float));

            m_audio_pos = (m_audio_pos + n_copy) % capacity;
            m_audio_len = capacity;
        } else {
            memcpy(&m_audio[m_audio_pos], src, n_copy * sizeof(float));

            m_audio_pos = (m_audio_pos + n_copy) % capacity;
            m_audio_len = std::min(m_audio_len + n_copy, capacity);
        }
    }
 }
 // Copy the most recent `ms` milliseconds of captured audio into `result`.
 //
 //   ms     - how much audio to fetch; values <= 0 mean "the whole buffer" (m_len_ms)
 //   result - output; cleared first, then filled with at most the number of
 //            samples currently buffered, oldest sample first
 //
 // Logs to stderr and returns without touching `result` when no device is
 // open or capture is not running.
 void audio_async::get(int ms, std::vector<float> & result) {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
        return;
    }

    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return;
    }

    result.clear();

    {
        std::lock_guard<std::mutex> lock(m_mutex);

        if (ms <= 0) {
            ms = m_len_ms;
        }

        // widen before multiplying: sample_rate * ms overflows a 32-bit int for long requests
        size_t n_samples = static_cast<size_t>((static_cast<long long>(m_sample_rate) * ms) / 1000);
        if (n_samples > m_audio_len) {
            n_samples = m_audio_len;
        }

        if (n_samples == 0) {
            // nothing buffered yet - leave result empty without indexing into m_audio
            return;
        }

        result.resize(n_samples);

        // index of the oldest requested sample; step back from the write position, wrapping
        // around the ring buffer. n_samples <= m_audio_len, which callback() keeps <= m_audio.size(),
        // so neither branch can underflow.
        const size_t capacity = m_audio.size();
        const size_t s0 = m_audio_pos >= n_samples
            ? m_audio_pos - n_samples
            : m_audio_pos + capacity - n_samples;

        if (s0 + n_samples > capacity) {
            // requested range wraps past the end of the ring buffer: copy in two pieces
            const size_t n0 = capacity - s0;

            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
        } else {
            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
        }
    }
 }
 ///////////////////////////
 // Return a copy of `s` with leading and trailing whitespace (as matched by the
 // regex class \s) removed. Interior whitespace is preserved.
 std::string trim(const std::string & s) {
    // compiled once and reused: constructing a std::regex is far more expensive than applying it
    static const std::regex edge_ws("^\\s+|\\s+$");
    return std::regex_replace(s, edge_ws, "");
 }
 // Return a copy of `s` in which every non-overlapping occurrence of `from`
 // is replaced by `to`, scanning left to right. Text inserted by a replacement
 // is never rescanned, so `to` may itself contain `from`.
 // An empty `from` matches nothing and `s` is returned unchanged.
 std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;

    if (from.empty()) {
        // find("") succeeds at every position, which would make the loop below spin forever
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        // skip past the inserted text so it is not matched again
        pos += to.length();
    }
    return result;
 }
 // Apply a first-order (RC) high-pass filter to `data` in place.
 //
 //   data        - samples to filter; data[0] is kept as-is and seeds the filter state
 //   cutoff      - cutoff frequency in Hz
 //   sample_rate - sampling rate of `data` in Hz
 //
 // Recurrence: y[i] = alpha * (y[i-1] + x[i] - x[i-1]), with alpha = RC / (RC + dt).
 // Empty input, or a non-positive cutoff / sample rate, leaves `data` untouched.
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty() || cutoff <= 0.0f || sample_rate <= 0.0f) {
        return;
    }

    const float rc    = 1.0f / (2.0f * M_PI * cutoff);
    const float dt    = 1.0f / sample_rate;
    // high-pass coefficient; dt / (rc + dt) would be the low-pass one
    const float alpha = rc / (rc + dt);

    // x_prev must hold the previous RAW input sample: data[i - 1] cannot be used
    // for that because it has already been overwritten with the filtered output
    float x_prev = data[0];
    float y      = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        const float x = data[i];

        y       = alpha * (y + x - x_prev);
        x_prev  = x;
        data[i] = y;
    }
 }
 // Simple energy-based check on a chunk of audio.
 //
 // Compares the mean absolute amplitude of the last `last_ms` milliseconds of
 // `pcmf32` against the mean absolute amplitude of the whole chunk.
 //
 //   pcmf32      - samples to analyse; filtered IN PLACE when freq_thold > 0
 //   sample_rate - sampling rate of pcmf32 in Hz
 //   last_ms     - length of the trailing window in milliseconds
 //   vad_thold   - the tail counts as "loud" when its energy exceeds vad_thold * overall energy
 //   freq_thold  - cutoff passed to high_pass_filter(); values <= 0 skip the filter
 //   verbose     - print the computed energies to stderr
 //
 // Returns false when the chunk is not longer than the trailing window or when
 // the tail is loud; returns true otherwise.
 bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_total = pcmf32.size();
    const int n_tail  = (sample_rate * last_ms) / 1000;

    if (n_tail >= n_total) {
        // not enough samples - assume no speech
        return false;
    }

    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }

    // first index that belongs to the trailing window
    const int tail_begin = n_total - n_tail;

    float energy_all  = 0.0f;
    float energy_last = 0.0f;

    for (int i = 0; i < n_total; ++i) {
        const float magnitude = fabsf(pcmf32[i]);

        energy_all += magnitude;
        if (i >= tail_begin) {
            energy_last += magnitude;
        }
    }

    energy_all  /= n_total;
    energy_last /= n_tail;

    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }

    // written as a negated '>' so that a NaN energy still yields true, as before
    return !(energy_last > vad_thold*energy_all);
 }
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();
@ -557,22 +241,10 @@ int main(int argc, char ** argv) {
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        {
+        is_running = sdl_poll_events();
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
                switch (event.type) {
                    case SDL_QUIT:
                        {
                            is_running = false;
                        } break;
                    default:
                        break;
                }
            }
-            if (!is_running) {
+        if (!is_running) {
-                break;
+            break;
            }
        }
        // delay
@ -583,7 +255,7 @@ int main(int argc, char ** argv) {
        {
            audio.get(2000, pcmf32_cur);
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                audio.get(params.voice_ms, pcmf32_cur);
`@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)`

	`include(DefaultTargetOptions)`	`include(DefaultTargetOptions)`

	`target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})`	`target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})`