Merge branch 'ggerganov:master' into master

3 years ago · c1808cd641
parent fc0e984846 3c390ffe38
commit c1808cd641
31 changed files with 2153 additions and 832 deletions
--- a/.gitignore
+++ b/.gitignore
@ -13,10 +13,12 @@ build-sanitize-thread/
 main
 stream
 command
 bench
 sync.sh
 compile_commands.json
 examples/arm_neon.h
 examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
 examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
--- a/7
+++ b/7
@ -134,7 +134,7 @@ libwhisper.so: ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
 clean:
-	rm -f *.o main stream bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command bench libwhisper.a libwhisper.so
 #
 # Examples
@ -149,6 +149,9 @@ main: examples/main/main.cpp ggml.o whisper.o
 stream: examples/stream/stream.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
 command: examples/command/command.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
@ -198,7 +201,7 @@ tiny.en tiny base.en base small.en small medium.en medium large: main
 	@echo ""
 	@for f in samples/*.wav; do \
 		echo "----------------------------------------------" ; \
-		echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
+		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
 	    echo "----------------------------------------------" ; \
 		echo "" ; \
 		./main -m models/ggml-$@.bin -f $$f ; \
--- a/README.md
+++ b/README.md
@ -36,9 +36,11 @@ As an example, here is a video of running the model on an iPhone 13 device - ful
 https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
-Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
+You can also easily make your own offline voice assistant application:
 https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
-https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4
+Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 ## Implementation details
@ -100,27 +102,27 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
 usage: ./main [options] file0.wav file1.wav ...
 options:
-  -h,       --help           show this help message and exit
+  -h,       --help          [default] show this help message and exit
-  -s SEED,  --seed SEED      RNG seed (default: -1)
+  -t N,     --threads N     [4      ] number of threads to use during computation
-  -t N,     --threads N      number of threads to use during computation (default: 4)
+  -p N,     --processors N  [1      ] number of processors to use during computation
-  -p N,     --processors N   number of processors to use during computation (default: 1)
+  -ot N,    --offset-t N    [0      ] time offset in milliseconds
-  -ot N,    --offset-t N     time offset in milliseconds (default: 0)
+  -on N,    --offset-n N    [0      ] segment index offset
-  -on N,    --offset-n N     segment index offset (default: 0)
+  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
-  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)
+  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
-  -ml N,    --max-len N      maximum segment length in characters (default: 0)
+  -ml N,    --max-len N     [0      ] maximum segment length in characters
-  -wt N,    --word-thold N   word timestamp probability threshold (default: 0.010000)
+  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
-  -v,       --verbose        verbose output
+  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
-            --translate      translate from source language to english
+  -tr,      --translate     [false  ] translate from source language to english
-  -otxt,    --output-txt     output result in a text file
+  -otxt,    --output-txt    [false  ] output result in a text file
-  -ovtt,    --output-vtt     output result in a vtt file
+  -ovtt,    --output-vtt    [false  ] output result in a vtt file
-  -osrt,    --output-srt     output result in a srt file
+  -osrt,    --output-srt    [false  ] output result in a srt file
-  -owts,    --output-words   output script for generating karaoke video
+  -owts,    --output-words  [false  ] output script for generating karaoke video
-  -ps,      --print_special  print special tokens
+  -ps,      --print-special [false  ] print special tokens
-  -pc,      --print_colors   print colors
+  -pc,      --print-colors  [false  ] print colors
-  -nt,      --no_timestamps  do not print timestamps
+  -nt,      --no-timestamps [true   ] do not print timestamps
-  -l LANG,  --language LANG  spoken language (default: en)
+  -l LANG,  --language LANG [en     ] spoken language
-  -m FNAME, --model FNAME    model path (default: models/ggml-base.en.bin)
+  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
-  -f FNAME, --file FNAME     input WAV file path
+  -f FNAME, --file FNAME    [       ] input WAV file path
 bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
@ -152,13 +154,13 @@ whisper_model_load: n_text_layer  = 6
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 2
 whisper_model_load: mem_required  = 670.00 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: ggml ctx size = 140.60 MB
+whisper_model_load: mem_required  =  506.00 MB
-whisper_model_load: memory size =    22.83 MB
+whisper_model_load: ggml ctx size =  140.60 MB
-whisper_model_load: model size  =   140.54 MB
+whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
 main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
--- a/bindings/javascript/emscripten.cpp
+++ b/bindings/javascript/emscripten.cpp
@ -6,10 +6,16 @@
 #include <vector>
 #include <thread>
 std::thread g_worker;
 std::vector<struct whisper_context *> g_contexts(4, nullptr);
 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_worker.joinable()) {
            g_worker.join();
        }
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
@ -25,6 +31,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
    }));
    emscripten::function("free", emscripten::optional_override([](size_t index) {
        if (g_worker.joinable()) {
            g_worker.join();
        }
        --index;
        if (index < g_contexts.size()) {
@ -34,6 +44,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
    }));
    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
        if (g_worker.joinable()) {
            g_worker.join();
        }
        --index;
        if (index >= g_contexts.size()) {
@ -46,14 +60,14 @@ EMSCRIPTEN_BINDINGS(whisper) {
        struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
-        params.print_realtime       = true;
+        params.print_realtime   = true;
-        params.print_progress       = false;
+        params.print_progress   = false;
-        params.print_timestamps     = true;
+        params.print_timestamps = true;
-        params.print_special_tokens = false;
+        params.print_special    = false;
-        params.translate            = translate;
+        params.translate        = translate;
-        params.language             = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
+        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
-        params.n_threads            = std::min(8, (int) std::thread::hardware_concurrency());
+        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
-        params.offset_ms            = 0;
+        params.offset_ms        = 0;
        std::vector<float> pcmf32;
        const int n = audio["length"].as<int>();
@ -80,10 +94,15 @@ EMSCRIPTEN_BINDINGS(whisper) {
            printf("\n");
        }
-        int ret = whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
+        // run the worker
-
+        {
-        whisper_print_timings(g_contexts[index]);
+            g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
                whisper_reset_timings(g_contexts[index]);
                whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
                whisper_print_timings(g_contexts[index]);
            });
        }
-        return ret;
+        return 0;
    }));
 }
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -20,9 +20,11 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
    add_subdirectory(whisper.wasm)
    add_subdirectory(stream.wasm)
    add_subdirectory(talk.wasm)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
    add_subdirectory(command)
    add_subdirectory(bench)
 endif()
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -6,9 +6,9 @@
 // command-line parameters
 struct whisper_params {
-    int32_t n_threads   = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    std::string model     = "models/ggml-base.en.bin";
+    std::string model = "models/ggml-base.en.bin";
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -17,14 +17,13 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
-        if (arg == "-t" || arg == "--threads") {
+        if (arg == "-h" || arg == "--help") {
            params.n_threads = std::stoi(argv[++i]);
        } else if (arg == "-m" || arg == "--model") {
            params.model = argv[++i];
        } else if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
-        } else {
+        }
        else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
        else if (arg == "-m" || arg == "--model")   { params.model     = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
@ -39,9 +38,9 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
+    fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n",                                  params.model.c_str());
    fprintf(stderr, "\n");
 }
--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -0,0 +1,7 @@
 # Build the `command` example only when SDL2 support is enabled.
 # NOTE(review): WHISPER_SUPPORT_SDL2 / SDL2_* are presumably set by the top-level CMakeLists - verify.
 if (WHISPER_SUPPORT_SDL2)
    # command
    set(TARGET command)
    add_executable(${TARGET} command.cpp)
    # command.cpp includes the SDL headers
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    # link against the whisper library, SDL2 and the thread library
    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/command/README.md
+++ b/examples/command/README.md
@ -0,0 +1,28 @@
 # command
 This is a basic Voice Assistant example that accepts voice commands from the microphone.
 More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
 ```bash
 # Run with default arguments and small model
 ./command -m ./models/ggml-small.en.bin -t 8
 # On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
 ./command -m ./models/ggml-tiny.en.bin -ac 768 -t 4 -c 0
 ```
 https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
 ## Building
 The `command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
 ```bash
 # Install SDL2 on Linux
 sudo apt-get install libsdl2-dev
 # Install SDL2 on Mac OS
 brew install sdl2
 make command
 ```
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -0,0 +1,655 @@
 // Voice assistant example
 //
 // Speak short text commands to the microphone.
 // This program will detect your voice command and convert them to text.
 //
 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //
 #include "whisper.h"
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <cassert>
 #include <cstdio>
 #include <fstream>
 #include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
 #include <vector>
 // command-line parameters
 // Command-line parameters of the `command` example together with their defaults.
 // The defaults are what whisper_print_usage() shows in the [...] column.
 struct whisper_params {
    // number of threads to use during computation (at most 4 by default)
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
    // duration of the audio window used for the activation prompt, in milliseconds
    int32_t prompt_ms  = 5000;
    // duration of the audio window used for a voice command, in milliseconds
    int32_t command_ms = 4000;
    // capture device ID; a negative value selects the default capture device
    int32_t capture_id = -1;
    // maximum number of tokens per audio chunk
    int32_t max_tokens = 32;
    // audio context size (0 - all)
    int32_t audio_ctx  = 0;
    // voice activity detection threshold
    float vad_thold    = 0.6f;
    // high-pass frequency cutoff (the filter is skipped by vad_simple() when <= 0)
    float freq_thold   = 100.0f;
    // speed up audio by x2 (reduced accuracy)
    bool speed_up      = false;
    // translate from source language to english
    bool translate     = false;
    // not settable from the command line and not read by transcribe(), which hard-codes no_context = true
    bool no_context    = true;
    // print special tokens
    bool print_special = false;
    // print sound energy (for debugging)
    bool print_energy  = false;
    // not settable from the command line; transcribe() passes its negation as print_timestamps
    bool no_timestamps = true;
    // spoken language
    std::string language  = "en";
    // model path
    std::string model     = "models/ggml-base.en.bin";
    // text output file name
    std::string fname_out = "";
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
        else if (arg == "-pms" || arg == "--prompt-ms")     { params.prompt_ms     = std::stoi(argv[++i]); }
        else if (arg == "-cms" || arg == "--command-ms")    { params.command_ms    = std::stoi(argv[++i]); }
        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
        else if (arg == "-m"   || arg == "--model")         { params.model         = argv[++i]; }
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
    }
    return true;
 }
 // Print the usage text to stderr. The bracketed column shows the current
 // value of each parameter in `params`. `argc` is not used.
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
    // render a boolean flag the way the help text expects it
    const auto flag = [](bool v) { return v ? "true" : "false"; };

    const char * prog = argv[0];

    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options]\n", prog);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");

    // integer options
    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -pms N,   --prompt-ms N   [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
    fprintf(stderr, "  -cms N,   --command-ms N  [%-7d] command duration in milliseconds\n", params.command_ms);
    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n", params.capture_id);
    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n", params.audio_ctx);

    // floating-point options
    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n", params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);

    // boolean flags
    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n", flag(params.speed_up));
    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n", flag(params.translate));
    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n", flag(params.print_special));
    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n", flag(params.print_energy));

    // string options
    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n", params.language.c_str());
    fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n", params.model.c_str());
    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
 }
 //
 // SDL Audio capture
 //
 // Asynchronous microphone capture on top of SDL.
 // Samples delivered through the SDL callback are stored in a circular buffer
 // (m_audio) and the most recent ones can be read back with get().
 class audio_async {
 public:
    // len_ms: length of the circular buffer in milliseconds
    audio_async(int len_ms);
    ~audio_async();
    // open the capture device (capture_id < 0 selects the default device) and size the circular buffer
    bool init(int capture_id, int sample_rate);
    // start capturing audio via the provided SDL callback
    // keep the last len_ms milliseconds of audio in a circular buffer
    bool resume();
    bool pause();
    // discard the audio stored so far
    bool clear();
    // callback to be called by SDL
    void callback(uint8_t * stream, int len);
    // get the most recent `ms` milliseconds of audio from the circular buffer (ms <= 0 means the whole buffer)
    void get(int ms, std::vector<float> & audio);
 private:
    // SDL id of the opened capture device; 0 means no device is open
    SDL_AudioDeviceID m_dev_id_in = 0;
    int m_len_ms = 0;
    int m_sample_rate = 0;
    // true between resume() and pause()
    bool       m_running = false;
    // protects m_audio, m_audio_pos and m_audio_len
    std::mutex m_mutex;
    // circular buffer storage
    std::vector<float> m_audio;
    // scratch copy of the most recent callback chunk
    std::vector<float> m_audio_new;
    // next write position in m_audio
    size_t             m_audio_pos = 0;
    // number of valid samples in m_audio
    size_t             m_audio_len = 0;
 };
 // Remember the requested circular buffer length (in milliseconds).
 // The buffer itself is allocated later, in init().
 audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
 }
 // Close the capture device, if one was opened by init().
 audio_async::~audio_async() {
    if (m_dev_id_in == 0) {
        return;
    }
    SDL_CloseAudioDevice(m_dev_id_in);
 }
 // Initialize SDL audio, open a capture device and allocate the circular buffer.
 //
 // capture_id  - index of the capture device to open; a negative value opens the default device
 // sample_rate - requested sample rate
 //
 // Returns false if SDL cannot be initialized or the device cannot be opened.
 bool audio_async::init(int capture_id, int sample_rate) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
        return false;
    }
    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
    // list the available capture devices (informational only)
    {
        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
        for (int i = 0; i < nDevices; i++) {
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
        }
    }
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;
    SDL_zero(capture_spec_requested);
    SDL_zero(capture_spec_obtained);
    // request mono 32-bit float samples at the given rate
    capture_spec_requested.freq     = sample_rate;
    capture_spec_requested.format   = AUDIO_F32;
    capture_spec_requested.channels = 1;
    capture_spec_requested.samples  = 1024;
    // forward the C-style SDL callback to the member function; `this` travels through userdata
    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
        audio_async * audio = (audio_async *) userdata;
        audio->callback(stream, len);
    };
    capture_spec_requested.userdata = this;
    if (capture_id >= 0) {
        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    } else {
        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    }
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
        m_dev_id_in = 0;
        return false;
    } else {
        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
                capture_spec_requested.format);
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
                capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
    }
    // size the circular buffer from the obtained rate: m_len_ms milliseconds of samples
    m_sample_rate = capture_spec_obtained.freq;
    m_audio.resize((m_sample_rate*m_len_ms)/1000);
    return true;
 }
 // Start (un-pause) the capture device.
 // Returns false when no device is open or capturing is already running.
 bool audio_async::resume() {
    const bool have_device = m_dev_id_in != 0;
    if (have_device == false) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }

    if (m_running == true) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }

    // pause_on = 0 -> the device starts delivering audio to the callback
    m_running = (SDL_PauseAudioDevice(m_dev_id_in, 0), true);
    return true;
 }
 // Pause the capture device.
 // Returns false when no device is open or capturing is already paused.
 bool audio_async::pause() {
    const bool have_device = m_dev_id_in != 0;
    if (have_device == false) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }

    if (m_running == false) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }

    // pause_on = 1 -> the device stops delivering audio to the callback
    m_running = (SDL_PauseAudioDevice(m_dev_id_in, 1), false);
    return true;
 }
 // Discard everything stored in the circular buffer.
 // Returns false when no device is open or capturing is not running.
 bool audio_async::clear() {
    if (m_dev_id_in == 0) {
        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
        return false;
    }

    if (m_running == false) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return false;
    }

    // reset the write position and the fill level under the buffer lock
    std::lock_guard<std::mutex> guard(m_mutex);
    m_audio_len = 0;
    m_audio_pos = 0;
    return true;
 }
 // callback to be called by SDL
 // Callback invoked by SDL with a chunk of captured audio.
 //
 // stream - raw bytes of the captured samples (32-bit floats, as requested in init())
 // len    - size of `stream` in bytes
 //
 // The samples are appended to the circular buffer m_audio, wrapping around
 // at the end. Nothing is stored while capturing is not running.
 void audio_async::callback(uint8_t * stream, int len) {
    if (!m_running) {
        return;
    }

    size_t n_samples = len / sizeof(float);

    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);

    {
        std::lock_guard<std::mutex> lock(m_mutex);

        const size_t capacity = m_audio.size();
        if (capacity == 0) {
            // nothing to store into (also avoids the modulo by zero below)
            return;
        }

        // a chunk larger than the whole buffer: keep only its most recent samples
        if (n_samples > capacity) {
            stream   += (n_samples - capacity) * sizeof(float);
            n_samples = capacity;
        }

        if (m_audio_pos + n_samples > capacity) {
            // the chunk wraps around: fill the tail of the buffer, then continue at the start
            const size_t n0 = capacity - m_audio_pos;

            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
            // `stream` is a byte pointer, so the offset of sample n0 is n0 * sizeof(float) bytes
            // (the previous &stream[n0] skipped n0 bytes only and copied misaligned data)
            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));

            m_audio_pos = (m_audio_pos + n_samples) % capacity;
            m_audio_len = capacity;
        } else {
            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));

            m_audio_pos = (m_audio_pos + n_samples) % capacity;
            m_audio_len = std::min(m_audio_len + n_samples, capacity);
        }
    }
 }
 // Copy the most recent `ms` milliseconds of captured audio into `result`.
 //
 // ms     - amount of audio to return, in milliseconds; ms <= 0 selects the full buffer length (m_len_ms)
 // result - receives the samples, oldest first; at most m_audio_len samples are returned
 //
 // When no device is open or capturing is not running, an error is printed
 // and `result` is left untouched.
 void audio_async::get(int ms, std::vector<float> & result) {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
        return;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return;
    }
    result.clear();
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (ms <= 0) {
            ms = m_len_ms;
        }
        // convert the requested duration to samples and clamp to what has been captured
        size_t n_samples = (m_sample_rate * ms) / 1000;
        if (n_samples > m_audio_len) {
            n_samples = m_audio_len;
        }
        result.resize(n_samples);
        // index of the oldest requested sample: n_samples before the write position, wrapped into the buffer
        int s0 = m_audio_pos - n_samples;
        if (s0 < 0) {
            s0 += m_audio.size();
        }
        if (s0 + n_samples > m_audio.size()) {
            // the requested range wraps around: copy the tail of the buffer, then the head
            const size_t n0 = m_audio.size() - s0;
            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
        } else {
            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
        }
    }
 }
 ///////////////////////////
 // Return a copy of `s` without leading and trailing whitespace.
 std::string trim(const std::string & s) {
    // either a whitespace run anchored at the start or one anchored at the end
    const std::regex edge_ws("^\\s+|\\s+$");

    std::string stripped = std::regex_replace(s, edge_ws, "");
    return stripped;
 }
 // Apply a simple single-pole filter to `data` in place.
 //
 // data        - samples to filter; data[0] is left unchanged
 // cutoff      - cutoff frequency (the caller only invokes this with cutoff > 0)
 // sample_rate - sample rate of `data`
 //
 // An empty vector is returned untouched (previously data[0] was read
 // unconditionally, which is undefined behavior for an empty vector).
 //
 // NOTE(review): data[i - 1] has already been overwritten with the filtered
 // value when it is read - confirm this is intended before changing it,
 // since the VAD thresholds are tuned against the current output.
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        return;
    }

    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
 }
 // Simple energy-based voice activity check.
 //
 // Compares the mean absolute amplitude of the last `last_ms` milliseconds of
 // `pcmf32` against the mean over the whole buffer. Returns true when the
 // energy of the last part is NOT above vad_thold times the overall energy,
 // and false otherwise or when the buffer is not longer than `last_ms`.
 //
 // When freq_thold > 0 the buffer is high-pass filtered in place first.
 // With `verbose` set, the computed energies are printed to stderr.
 bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_total = pcmf32.size();
    const int n_tail  = (sample_rate * last_ms) / 1000;

    // not enough samples - assume no speech
    if (n_tail >= n_total) {
        return false;
    }

    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }

    // mean absolute amplitude over the whole buffer
    float energy_all = 0.0f;
    for (int k = 0; k < n_total; k++) {
        energy_all += fabsf(pcmf32[k]);
    }
    energy_all /= n_total;

    // mean absolute amplitude over the trailing window
    float energy_last = 0.0f;
    for (int k = n_total - n_tail; k < n_total; k++) {
        energy_last += fabsf(pcmf32[k]);
    }
    energy_last /= n_tail;

    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }

    // written as a negated '>' so that NaN energies compare exactly like before
    return !(energy_last > vad_thold*energy_all);
 }
 // Run whisper on `pcmf32` and return the concatenated text of all segments.
 //
 // prob - receives the mean token probability over all segments (0 if there are no tokens)
 // t_ms - receives the elapsed processing time in milliseconds
 //
 // On failure of whisper_full() an empty string is returned with prob = 0 and t_ms = 0.
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    using hr_clock = std::chrono::high_resolution_clock;

    const auto t_begin = hr_clock::now();

    // reset the outputs up front so a failed run reports zeros
    prob = 0.0f;
    t_ms = 0;

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // fixed settings for short voice commands
    wparams.print_progress   = false;
    wparams.print_realtime   = false;
    wparams.no_context       = true;
    wparams.single_segment   = true;

    // settings taken from the command line
    wparams.print_special    = params.print_special;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language.c_str();
    wparams.n_threads        = params.n_threads;
    wparams.audio_ctx        = params.audio_ctx;
    wparams.speed_up         = params.speed_up;

    const int status = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
    if (status != 0) {
        return "";
    }

    std::string text_all;
    int n_probs = 0;

    const int n_seg = whisper_full_n_segments(ctx);
    for (int iseg = 0; iseg < n_seg; ++iseg) {
        text_all += whisper_full_get_segment_text(ctx, iseg);

        // accumulate the token probabilities of this segment
        const int n_tok = whisper_full_n_tokens(ctx, iseg);
        for (int itok = 0; itok < n_tok; ++itok) {
            const auto tok = whisper_full_get_token_data(ctx, iseg, itok);
            prob += tok.p;
            ++n_probs;
        }
    }

    if (n_probs > 0) {
        prob /= n_probs;
    }

    const auto t_finish = hr_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_finish - t_begin).count();

    return text_all;
 }
 // compute similarity between two strings using Levenshtein distance
 // Compute the similarity between two strings using the Levenshtein distance.
 //
 // Returns 1 - dist / max(|s0|, |s1|): 1.0 for identical strings and 0.0 for
 // strings that share nothing. Two empty strings are considered identical.
 float similarity(const std::string & s0, const std::string & s1) {
    // identical by definition; also avoids the 0/0 division at the end
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    // row 0: distance from the empty prefix of s0 to each prefix of s1
    for (size_t j = 0; j < len1; j++) {
        prevCol[j] = j;
    }

    // rows 1..|s0| - row 0 is the initialization above
    // (starting at 0 used to read s0[i - 1] out of bounds)
    for (size_t i = 1; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
 }
 // Voice assistant main loop.
 //
 // 1. Parse the arguments, load the model and open the capture device.
 // 2. Ask the user to say the activation phrase (k_prompt) and keep its audio.
 // 3. For every following utterance, transcribe "prompt audio + command audio",
 //    strip the best-matching prompt prefix from the text and print the rest
 //    as the command.
 //
 // Returns 0 on normal termination, 1 when argument parsing or audio
 // initialization fails and 2 when the model cannot be loaded.
 int main(int argc, char ** argv) {
    whisper_params params;
    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }
    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }
    // whisper init
    struct whisper_context * ctx = whisper_init(params.model.c_str());
    // NOTE(review): assumes whisper_init() signals failure with a null pointer -
    // without this check a null context would reach every call below
    if (ctx == nullptr) {
        fprintf(stderr, "%s: failed to initialize whisper context from '%s'\n", __func__, params.model.c_str());
        return 2;
    }
    // print some info about the processing
    {
        fprintf(stderr, "\n");
        if (!whisper_is_multilingual(ctx)) {
            if (params.language != "en" || params.translate) {
                params.language = "en";
                params.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);
        fprintf(stderr, "\n");
    }
    // init audio (30 seconds circular buffer)
    audio_async audio(30*1000);
    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return 1;
    }
    audio.resume();
    bool is_running  = true;
    bool have_prompt = false;
    bool ask_prompt = true;
    float prob0 = 0.0f;
    float prob  = 0.0f;
    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;
    const std::string k_prompt = "Ok Whisper, start listening for commands.";
    // main loop
    while (is_running) {
        // handle Ctrl + C
        {
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
                switch (event.type) {
                    case SDL_QUIT:
                        {
                            is_running = false;
                        } break;
                    default:
                        break;
                }
            }
            if (!is_running) {
                break;
            }
        }
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        if (ask_prompt) {
            fprintf(stdout, "\n");
            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
            fprintf(stdout, "\n");
            ask_prompt = false;
        }
        int64_t t_ms = 0;
        {
            // look at the last 2 seconds and check whether the last second went quiet
            audio.get(2000, pcmf32_cur);
            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                if (!have_prompt) {
                    audio.get(params.prompt_ms, pcmf32_cur);
                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
                    const float sim = similarity(txt, k_prompt);
                    // accept only a text of similar length and content as the expected phrase
                    if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
                        fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
                        ask_prompt = true;
                    } else {
                        fprintf(stdout, "\n");
                        fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
                        fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
                        fprintf(stdout, "\n");
                        // save the audio for the prompt
                        pcmf32_prompt = pcmf32_cur;
                        have_prompt = true;
                    }
                } else {
                    audio.get(params.command_ms, pcmf32_cur);
                    // prepend the prompt audio
                    pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
                    prob = 100.0f*(prob - prob0);
                    //fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
                    // find the prompt in the text
                    float best_sim = 0.0f;
                    size_t best_len = 0;
                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
                        const auto prompt = txt.substr(0, n);
                        const float sim = similarity(prompt, k_prompt);
                        //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
                        if (sim > best_sim) {
                            best_sim = sim;
                            best_len = n;
                        }
                    }
                    // best_len can exceed the text length when the transcription is shorter than
                    // the prompt; substr() throws std::out_of_range in that case, so clamp it
                    const std::string command = ::trim(txt.substr(std::min(best_len, txt.size())));
                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
                    fprintf(stdout, "\n");
                }
                // start collecting the next utterance from scratch
                audio.clear();
            }
        }
    }
    audio.pause();
    whisper_print_timings(ctx);
    whisper_free(ctx);
    return 0;
 }
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -0,0 +1,182 @@
 // Common Javascript functions used by the examples
 // Reinterpret the bytes of the typed array `src` as a typed array of class `type`.
 // The bytes are first copied into a fresh ArrayBuffer, so the result does not share
 // memory with `src`.
 function convertTypedArray(src, type) {
    var bytes = new ArrayBuffer(src.byteLength);
    // view the new buffer with the source's own element type and copy the elements over
    var sameTypeView = new src.constructor(bytes);
    sameTypeView.set(src);
    return new type(bytes);
 }
 // Print helper shared by the examples.
 // Logs `text` to the console and, when the page has an element with id 'output',
 // appends it there followed by a newline and scrolls that element to the bottom.
 // When called with more than one argument, the arguments are joined with a space.
 var printTextarea = (function() {
    var element = document.getElementById('output');
    // reset the element content on load (must be `value`, the property appended to below)
    if (element) element.value = ''; // clear browser cache
    return function(text) {
        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
        console.log(text);
        if (element) {
            element.value += text + "\n";
            element.scrollTop = element.scrollHeight; // focus on bottom
        }
    };
 })();
 // Ask the user for confirmation and, if it is given, delete the IndexedDB database `dbName`
 async function clearCache() {
    var confirmed = confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.');
    if (!confirmed) {
        return;
    }
    indexedDB.deleteDatabase(dbName);
 }
 // fetch a remote file from remote URL using the Fetch API
 // - cbProgress is called with the downloaded fraction (received bytes / content-length)
 // - cbPrint is called with status messages
 // resolves to a Uint8Array holding the whole body, or to undefined when the response is not OK
 async function fetchRemote(url, cbProgress, cbPrint) {
    cbPrint('fetchRemote: downloading with fetch()...');
    const requestOptions = {
        method: 'GET',
        headers: {
            'Content-Type': 'application/octet-stream',
        },
    };
    const response = await fetch(url, requestOptions);
    if (!response.ok) {
        cbPrint('fetchRemote: failed to fetch ' + url);
        return;
    }
    // progress is reported only when the response carries a content-length header
    const contentLength = response.headers.get('content-length');
    const total = parseInt(contentLength, 10);
    const reader = response.body.getReader();
    var parts = [];
    var receivedLength = 0;
    var lastStep = -1;
    for (;;) {
        const result = await reader.read();
        if (result.done) {
            break;
        }
        parts.push(result.value);
        receivedLength += result.value.length;
        if (!contentLength) {
            continue;
        }
        cbProgress(receivedLength/total);
        // print a message only when the progress reaches another 10% step
        var step = Math.round((receivedLength / total) * 10);
        if (step != lastStep) {
            cbPrint('fetchRemote: fetching ' + 10*step + '% ...');
            lastStep = step;
        }
    }
    // concatenate the received chunks into one contiguous array
    var merged = new Uint8Array(receivedLength);
    var offset = 0;
    parts.forEach(function (part) {
        merged.set(part, offset);
        offset += part.length;
    });
    return merged;
 }
 // load remote data
 // - check if the data is already in the IndexedDB
 // - if not, fetch it from the remote URL and store it in the IndexedDB
 //
 // parameters:
 // - url:        remote URL of the data, also used as the key in the 'models' object store
 // - dst:        passed through unchanged as the first argument of cbReady
 // - size_mb:    download size shown in the confirmation dialog
 // - cbProgress: forwarded to fetchRemote
 // - cbReady:    called as cbReady(dst, data) once the data is available
 // - cbCancel:   called when the user declines the download or when a step fails
 //               (the abort handler of the first open request only prints)
 // - cbPrint:    called with status messages
 function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
    // query the storage quota and print it
    navigator.storage.estimate().then(function (estimate) {
        cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
        cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
    });
    // check if the data is already in the IndexedDB
    var rq = indexedDB.open(dbName, dbVersion);
    rq.onupgradeneeded = function (event) {
        var db = event.target.result;
        if (db.version == 1) {
            var os = db.createObjectStore('models', { autoIncrement: false });
            cbPrint('loadRemote: created IndexedDB ' + db.name + ' version ' + db.version);
        } else {
            // clear the database
            var os = event.currentTarget.transaction.objectStore('models');
            os.clear();
            cbPrint('loadRemote: cleared IndexedDB ' + db.name + ' version ' + db.version);
        }
    };
    rq.onsuccess = function (event) {
        var db = event.target.result;
        var tx = db.transaction(['models'], 'readonly');
        var os = tx.objectStore('models');
        var rq = os.get(url);
        rq.onsuccess = function (event) {
            if (rq.result) {
                cbPrint('loadRemote: "' + url + '" is already in the IndexedDB');
                cbReady(dst, rq.result);
            } else {
                // data is not in the IndexedDB
                cbPrint('loadRemote: "' + url + '" is not in the IndexedDB');
                // alert and ask the user to confirm
                if (!confirm(
                    'You are about to download ' + size_mb + ' MB of data.\n' +
                    'The model data will be cached in the browser for future use.\n\n' +
                    'Press OK to continue.')) {
                    cbCancel();
                    return;
                }
                fetchRemote(url, cbProgress, cbPrint).then(function (data) {
                    if (!data) {
                        // fetchRemote has already printed the failure - notify the caller,
                        // otherwise neither cbReady nor cbCancel would ever be called
                        cbCancel();
                        return;
                    }
                    // store the data in the IndexedDB
                    var rqOpen = indexedDB.open(dbName, dbVersion);
                    rqOpen.onsuccess = function (event) {
                        var db = event.target.result;
                        var tx = db.transaction(['models'], 'readwrite');
                        var os = tx.objectStore('models');
                        var rqPut = os.put(data, url);
                        rqPut.onsuccess = function (event) {
                            cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
                            cbReady(dst, data);
                        };
                        rqPut.onerror = function (event) {
                            cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB');
                            cbCancel();
                        };
                    };
                    rqOpen.onerror = function (event) {
                        cbPrint('loadRemote: failed to open IndexedDB');
                        cbCancel();
                    };
                }, function (err) {
                    // the fetchRemote promise was rejected (e.g. fetch() itself failed)
                    cbPrint('loadRemote: failed to fetch "' + url + '": ' + err);
                    cbCancel();
                });
            }
        };
        rq.onerror = function (event) {
            cbPrint('loadRemote: failed to get data from the IndexedDB');
            cbCancel();
        };
    };
    rq.onerror = function (event) {
        cbPrint('loadRemote: failed to open IndexedDB');
        cbCancel();
    };
    rq.onblocked = function (event) {
        cbPrint('loadRemote: failed to open IndexedDB: blocked');
        cbCancel();
    };
    rq.onabort = function (event) {
        cbPrint('loadRemote: failed to open IndexedDB: abort');
    };
 }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -6,29 +6,28 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
 ```
 ./main -h
-usage: ./bin/main [options] file0.wav file1.wav ...
+usage: ./main [options] file0.wav file1.wav ...
  -h,       --help           show this help message and exit
  -s SEED,  --seed SEED      RNG seed (default: -1)
  -t N,     --threads N      number of threads to use during computation (default: 4)
  -p N,     --processors N   number of processors to use during computation (default: 1)
  -ot N,    --offset-t N     time offset in milliseconds (default: 0)
  -on N,    --offset-n N     segment index offset (default: 0)
  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)
  -ml N,    --max-len N      maximum segment length in characters (default: 0)
  -wt N,    --word-thold N   word timestamp probability threshold (default: 0.010000)
  -v,       --verbose        verbose output
            --translate      translate from source language to english
  -otxt,    --output-txt     output result in a text file
  -ovtt,    --output-vtt     output result in a vtt file
  -osrt,    --output-srt     output result in a srt file
  -owts,    --output-words   output script for generating karaoke video
  -ps,      --print_special  print special tokens
  -pc,      --print_colors   print colors
  -nt,      --no_timestamps  do not print timestamps
  -l LANG,  --language LANG  spoken language (default: en)
  -m FNAME, --model FNAME    model path (default: models/ggml-base.en.bin)
  -f FNAME, --file FNAME     input WAV file path
  -h,       --help           show this help message and exit
 options:
  -h,       --help          [default] show this help message and exit
  -t N,     --threads N     [4      ] number of threads to use during computation
  -p N,     --processors N  [1      ] number of processors to use during computation
  -ot N,    --offset-t N    [0      ] time offset in milliseconds
  -on N,    --offset-n N    [0      ] segment index offset
  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
  -ml N,    --max-len N     [0      ] maximum segment length in characters
  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
  -tr,      --translate     [false  ] translate from source language to english
  -otxt,    --output-txt    [false  ] output result in a text file
  -ovtt,    --output-vtt    [false  ] output result in a vtt file
  -osrt,    --output-srt    [false  ] output result in a srt file
  -owts,    --output-words  [false  ] output script for generating karaoke video
  -ps,      --print-special [false  ] print special tokens
  -pc,      --print-colors  [false  ] print colors
  -nt,      --no-timestamps [true   ] do not print timestamps
  -l LANG,  --language LANG [en     ] spoken language
  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
  -f FNAME, --file FNAME    [       ] input WAV file path
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -36,6 +36,10 @@ std::string to_timestamp(int64_t t, bool comma = false) {
    return std::string(buf);
 }
 // map timestamp t to a sample index: (t*WHISPER_SAMPLE_RATE)/100, limited from above by
 // n_samples - 1 and from below by 0
 // NOTE(review): the division by 100 suggests t is in units of 1/100 s - confirm against the whisper API
 int timestamp_to_sample(int64_t t, int n_samples) {
    const int idx_last = (int) n_samples - 1;
    const int idx      = (int) ((t*WHISPER_SAMPLE_RATE)/100);
    return std::max(0, std::min(idx_last, idx));
 }
 // helper function to replace substrings
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; ; pos += replace.length()) {
@ -48,7 +52,6 @@ void replace_all(std::string & s, const std::string & search, const std::string
 // command-line parameters
 struct whisper_params {
    int32_t seed         = -1; // RNG seed, not used currently
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_processors = 1;
    int32_t offset_t_ms  = 0;
@ -59,16 +62,16 @@ struct whisper_params {
    float word_thold = 0.01f;
-    bool speed_up             = false;
+    bool speed_up      = false;
-    bool verbose              = false;
+    bool translate     = false;
-    bool translate            = false;
+    bool diarize       = false;
-    bool output_txt           = false;
+    bool output_txt    = false;
-    bool output_vtt           = false;
+    bool output_vtt    = false;
-    bool output_srt           = false;
+    bool output_srt    = false;
-    bool output_wts           = false;
+    bool output_wts    = false;
-    bool print_special_tokens = false;
+    bool print_special = false;
-    bool print_colors         = false;
+    bool print_colors  = false;
-    bool no_timestamps        = false;
+    bool no_timestamps = false;
    std::string language  = "en";
    std::string model     = "models/ggml-base.en.bin";
@ -87,59 +90,32 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
            continue;
        }
-        if (arg == "-s" || arg == "--seed") {
+        if (arg == "-h" || arg == "--help") {
            params.seed = std::stoi(argv[++i]);
        } else if (arg == "-t" || arg == "--threads") {
            params.n_threads = std::stoi(argv[++i]);
        } else if (arg == "-p" || arg == "--processors") {
            params.n_processors = std::stoi(argv[++i]);
        } else if (arg == "-ot" || arg == "--offset-t") {
            params.offset_t_ms = std::stoi(argv[++i]);
        } else if (arg == "-on" || arg == "--offset-n") {
            params.offset_n = std::stoi(argv[++i]);
        } else if (arg == "-d" || arg == "--duration") {
            params.duration_ms = std::stoi(argv[++i]);
        } else if (arg == "-mc" || arg == "--max-context") {
            params.max_context = std::stoi(argv[++i]);
        } else if (arg == "-ml" || arg == "--max-len") {
            params.max_len = std::stoi(argv[++i]);
        } else if (arg == "-wt" || arg == "--word-thold") {
            params.word_thold = std::stof(argv[++i]);
        } else if (arg == "-su" || arg == "--speed-up") {
            params.speed_up = true;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "--translate") {
            params.translate = true;
        } else if (arg == "-l" || arg == "--language") {
            params.language = argv[++i];
            if (whisper_lang_id(params.language.c_str()) == -1) {
                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
                whisper_print_usage(argc, argv, params);
                exit(0);
            }
        } else if (arg == "-otxt" || arg == "--output-txt") {
            params.output_txt = true;
        } else if (arg == "-ovtt" || arg == "--output-vtt") {
            params.output_vtt = true;
        } else if (arg == "-osrt" || arg == "--output-srt") {
            params.output_srt = true;
        } else if (arg == "-owts" || arg == "--output-words") {
            params.output_wts = true;
        } else if (arg == "-ps" || arg == "--print_special") {
            params.print_special_tokens = true;
        } else if (arg == "-pc" || arg == "--print_colors") {
            params.print_colors = true;
        } else if (arg == "-nt" || arg == "--no_timestamps") {
            params.no_timestamps = true;
        } else if (arg == "-m" || arg == "--model") {
            params.model = argv[++i];
        } else if (arg == "-f" || arg == "--file") {
            params.fname_inp.push_back(argv[++i]);
        } else if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
-        } else {
+        }
        else if (arg == "-t"    || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
        else if (arg == "-p"    || arg == "--processors")    { params.n_processors  = std::stoi(argv[++i]); }
        else if (arg == "-ot"   || arg == "--offset-t")      { params.offset_t_ms   = std::stoi(argv[++i]); }
        else if (arg == "-on"   || arg == "--offset-n")      { params.offset_n      = std::stoi(argv[++i]); }
        else if (arg == "-d"    || arg == "--duration")      { params.duration_ms   = std::stoi(argv[++i]); }
        else if (arg == "-mc"   || arg == "--max-context")   { params.max_context   = std::stoi(argv[++i]); }
        else if (arg == "-ml"   || arg == "--max-len")       { params.max_len       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")    { params.word_thold    = std::stof(argv[++i]); }
        else if (arg == "-su"   || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-di"   || arg == "--diarize")       { params.diarize       = true; }
        else if (arg == "-otxt" || arg == "--output-txt")    { params.output_txt    = true; }
        else if (arg == "-ovtt" || arg == "--output-vtt")    { params.output_vtt    = true; }
        else if (arg == "-osrt" || arg == "--output-srt")    { params.output_srt    = true; }
        else if (arg == "-owts" || arg == "--output-words")  { params.output_wts    = true; }
        else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pc"   || arg == "--print-colors")  { params.print_colors  = true; }
        else if (arg == "-nt"   || arg == "--no-timestamps") { params.no_timestamps = true; }
        else if (arg == "-l"    || arg == "--language")      { params.language      = argv[++i]; }
        else if (arg == "-m"    || arg == "--model")         { params.model         = argv[++i]; }
        else if (arg == "-f"    || arg == "--file")          { params.fname_inp.push_back(argv[++i]); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
@ -154,34 +130,40 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
+    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -s SEED,  --seed SEED      RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -p N,     --processors N  [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -p N,     --processors N   number of processors to use during computation (default: %d)\n", params.n_processors);
+    fprintf(stderr, "  -ot N,    --offset-t N    [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n", params.offset_t_ms);
+    fprintf(stderr, "  -on N,    --offset-n N    [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -on N,    --offset-n N     segment index offset (default: %d)\n", params.offset_n);
+    fprintf(stderr, "  -d  N,    --duration N    [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -d  N,    --duration N     duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
+    fprintf(stderr, "  -mc N,    --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n");
+    fprintf(stderr, "  -ml N,    --max-len N     [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n", params.max_len);
+    fprintf(stderr, "  -wt N,    --word-thold N  [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n", params.word_thold);
+    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -su,      --speed-up       speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -v,       --verbose        verbose output\n");
+    fprintf(stderr, "  -di,      --diarize       [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "            --translate      translate from source language to english\n");
+    fprintf(stderr, "  -otxt,    --output-txt    [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -otxt,    --output-txt     output result in a text file\n");
+    fprintf(stderr, "  -ovtt,    --output-vtt    [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -ovtt,    --output-vtt     output result in a vtt file\n");
+    fprintf(stderr, "  -osrt,    --output-srt    [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -osrt,    --output-srt     output result in a srt file\n");
+    fprintf(stderr, "  -owts,    --output-words  [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -owts,    --output-words   output script for generating karaoke video\n");
+    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
+    fprintf(stderr, "  -pc,      --print-colors  [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pc,      --print_colors   print colors\n");
+    fprintf(stderr, "  -nt,      --no-timestamps [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
-    fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
+    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                                params.language.c_str());
-    fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] input WAV file path\n",                            "");
    fprintf(stderr, "  -f FNAME, --file FNAME     input WAV file path\n");
    fprintf(stderr, "\n");
 }
 // user data handed to whisper_print_segment_callback through its void * user_data argument
 // holds non-owning pointers - the pointed-to objects are defined by the caller
 struct whisper_print_user_data {
    // command-line parameters read by the callback
    const whisper_params * params;
    // per-channel float PCM data, read by the callback when diarization is enabled
    const std::vector<std::vector<float>> * pcmf32s;
 };
 void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
-    const whisper_params & params = *(whisper_params *) user_data;
+    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
    const int n_segments = whisper_full_n_segments(ctx);
@ -195,7 +177,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
        if (params.no_timestamps) {
            if (params.print_colors) {
                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special_tokens == false) {
+                    if (params.print_special == false) {
                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
                        if (id >= whisper_token_eot(ctx)) {
                            continue;
@ -218,10 +200,37 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
            std::string speaker = "";
            if (params.diarize && pcmf32s.size() == 2) {
                const int64_t n_samples = pcmf32s[0].size();
                const int64_t is0 = timestamp_to_sample(t0, n_samples);
                const int64_t is1 = timestamp_to_sample(t1, n_samples);
                double energy0 = 0.0f;
                double energy1 = 0.0f;
                for (int64_t j = is0; j < is1; j++) {
                    energy0 += fabs(pcmf32s[0][j]);
                    energy1 += fabs(pcmf32s[1][j]);
                }
                if (energy0 > 1.1*energy1) {
                    speaker = "(speaker 0)";
                } else if (energy1 > 1.1*energy0) {
                    speaker = "(speaker 1)";
                } else {
                    speaker = "(speaker ?)";
                }
                //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
            }
            if (params.print_colors) {
                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special_tokens == false) {
+                    if (params.print_special == false) {
                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
                        if (id >= whisper_token_eot(ctx)) {
                            continue;
@ -233,13 +242,13 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
-                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+                    printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
                }
                printf("\n");
            } else {
                const char * text = whisper_full_get_segment_text(ctx, i);
-                printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                printf("[%s --> %s]  %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
            }
        }
    }
@ -267,7 +276,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return 9;
+        return false;
    }
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -432,16 +441,18 @@ int main(int argc, char ** argv) {
        return 1;
    }
    if (params.seed < 0) {
        params.seed = time(NULL);
    }
    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        whisper_print_usage(argc, argv, params);
        return 2;
    }
    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }
    // whisper init
    struct whisper_context * ctx = whisper_init(params.model.c_str());
@ -454,53 +465,60 @@ int main(int argc, char ** argv) {
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
        // WAV input
        std::vector<float> pcmf32;
        {
            drwav wav;
            std::vector<uint8_t> wav_data; // used for pipe input from stdin
            if (fname_inp == "-") {
                std::vector<uint8_t> wav_data;
                {
                    uint8_t buf[1024];
                    while (true)
                    {
                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                        if (n == 0)
+                        if (n == 0) {
                        {
                            break;
                        }
                        wav_data.insert(wav_data.end(), buf, buf + n);
                    }
                }
-                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false)
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false) {
                {
                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
                    return 4;
                }
                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
            }
            else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
-                return 4;
+                return 5;
            }
            if (wav.channels != 1 && wav.channels != 2) {
                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
-                return 5;
+                return 6;
            }
            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
                return 6;
            }
            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
-                return 6;
+                return 8;
            }
            if (wav.bitsPerSample != 16) {
                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
-                return 7;
+                return 9;
            }
-            int n = wav.totalPCMFrameCount;
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
            std::vector<int16_t> pcm16;
            pcm16.resize(n*wav.channels);
@ -518,6 +536,18 @@ int main(int argc, char ** argv) {
                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
                }
            }
            if (params.diarize) {
                // convert to stereo, float
                pcmf32s.resize(2);
                pcmf32s[0].resize(n);
                pcmf32s[1].resize(n);
                for (int i = 0; i < n; i++) {
                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
                }
            }
        }
        // print system information
@ -552,32 +582,34 @@ int main(int argc, char ** argv) {
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-            wparams.print_realtime       = false;
+            wparams.print_realtime   = false;
-            wparams.print_progress       = false;
+            wparams.print_progress   = false;
-            wparams.print_timestamps     = !params.no_timestamps;
+            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special_tokens = params.print_special_tokens;
+            wparams.print_special    = params.print_special;
-            wparams.translate            = params.translate;
+            wparams.translate        = params.translate;
-            wparams.language             = params.language.c_str();
+            wparams.language         = params.language.c_str();
-            wparams.n_threads            = params.n_threads;
+            wparams.n_threads        = params.n_threads;
-            wparams.n_max_text_ctx       = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
+            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-            wparams.offset_ms            = params.offset_t_ms;
+            wparams.offset_ms        = params.offset_t_ms;
-            wparams.duration_ms          = params.duration_ms;
+            wparams.duration_ms      = params.duration_ms;
-            wparams.token_timestamps     = params.output_wts || params.max_len > 0;
+            wparams.token_timestamps = params.output_wts || params.max_len > 0;
-            wparams.thold_pt             = params.word_thold;
+            wparams.thold_pt         = params.word_thold;
-            wparams.max_len              = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
+            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-            wparams.speed_up             = params.speed_up;
+            wparams.speed_up         = params.speed_up;
            whisper_print_user_data user_data = { &params, &pcmf32s };
            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &params;
+                wparams.new_segment_callback_user_data = &user_data;
            }
            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                return 8;
+                return 10;
            }
        }
--- a/examples/stream.wasm/CMakeLists.txt
+++ b/examples/stream.wasm/CMakeLists.txt
@ -0,0 +1,47 @@
 #
 # libstream
 #
 # Build the executable target `libstream` from emscripten.cpp and link it
 # against the `whisper` library target.
 set(TARGET libstream)
 add_executable(${TARGET}
    emscripten.cpp
    )
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
 # EXTRA_FLAGS is appended to the link flags below; it stays unset unless
 # WHISPER_WASM_SINGLE_FILE is enabled.
 unset(EXTRA_FLAGS)
 if (WHISPER_WASM_SINGLE_FILE)
    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
    message(STATUS "Embedding WASM inside stream.js")
    # After each build of the target, copy the generated libstream.js into the
    # page directory under the name stream.js (the name index-tmpl.html loads).
    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libstream.js
        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
        )
 endif()
 # Link flags for the target. PTHREAD_POOL_SIZE=8 matches N_THREAD in
 # emscripten.cpp. The exported runtime methods list is a quoted JS-style array,
 # hence the escaped double quotes.
 # NOTE(review): INITIAL_MEMORY and TOTAL_MEMORY are both set to the same value;
 # TOTAL_MEMORY looks like the legacy name of the same setting - confirm whether
 # it is still needed.
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s INITIAL_MEMORY=1024MB \
    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")
 #
 # stream.wasm
 #
 # Stage the web page: copy the HTML template and the shared helpers.js into
 # ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm. With @ONLY, only @VAR@
 # references (e.g. @GIT_SHA1@ in index-tmpl.html) are substituted.
 set(TARGET stream.wasm)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
--- a/examples/stream.wasm/README.md
+++ b/examples/stream.wasm/README.md
@ -0,0 +1,20 @@
 # stream.wasm
 Real-time transcription in the browser using WebAssembly
 Online demo: https://whisper.ggerganov.com/stream/
 ## Build instructions
 ```bash
 # build using Emscripten (v3.1.2)
 git clone https://github.com/ggerganov/whisper.cpp
 cd whisper.cpp
 mkdir build-em && cd build-em
 emcmake cmake ..
 make -j
 # copy the produced page to your HTTP path
 cp bin/stream.wasm/*       /path/to/html/
 cp bin/libstream.worker.js /path/to/html/
 ```
--- a/examples/stream.wasm/emscripten.cpp
+++ b/examples/stream.wasm/emscripten.cpp
@ -0,0 +1,213 @@
 #include "ggml.h"
 #include "whisper.h"
 #include <emscripten.h>
 #include <emscripten/bind.h>
 #include <atomic>
 #include <cmath>
 #include <mutex>
 #include <string>
 #include <thread>
 #include <vector>
 // upper bound on the number of threads handed to whisper (see stream_main)
 constexpr int N_THREAD = 8;
 // context slots; a slot holds nullptr while it is unused
 std::vector<struct whisper_context *> g_contexts(4, nullptr);
 // taken around accesses to g_status, g_status_forced, g_transcribed and g_pcmf32
 std::mutex g_mutex;
 // background thread that runs stream_main()
 std::thread g_worker;
 // the worker loop keeps iterating while this is true
 std::atomic<bool> g_running(false);
 // status text written by the worker
 std::string g_status        = "";
 // when non-empty, reported by get_status() instead of g_status
 std::string g_status_forced = "";
 // latest transcription, handed out through get_transcribed()
 std::string g_transcribed   = "";
 // audio samples supplied through set_audio() and consumed by the worker
 std::vector<float> g_pcmf32;
 // Store a new worker status text in g_status; g_mutex is held for the write.
 void stream_set_status(const std::string & status) {
    const std::lock_guard<std::mutex> guard(g_mutex);
    g_status.assign(status);
 }
 // Worker loop for the whisper context stored in g_contexts[index].
 //
 // While g_running is set, each iteration:
 //   1. waits until at least 1024 samples are available in g_pcmf32,
 //   2. copies the most recent window (at most 5 seconds) and clears the shared buffer,
 //   3. runs whisper_full() on that window,
 //   4. publishes the text of the last segment through g_transcribed.
 //
 // The loop also ends when whisper_full() fails. On exit the context is freed
 // and its slot in g_contexts is reset to nullptr.
 void stream_main(size_t index) {
    stream_set_status("loading data ...");
    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
    // hardware_concurrency() is allowed to return 0 when the value is unknown
    if (wparams.n_threads < 1) {
        wparams.n_threads = 1;
    }
    wparams.offset_ms        = 0;
    wparams.translate        = false;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.print_realtime   = false;
    wparams.print_progress   = false;
    wparams.print_timestamps = true;
    wparams.print_special    = false;
    wparams.max_tokens       = 32;
    wparams.audio_ctx        = 768; // partial encoder context for better performance
    wparams.language         = "en";
    // report the thread count that is actually used, not the N_THREAD upper bound
    printf("stream: using %d threads\n", wparams.n_threads);
    std::vector<float> pcmf32;
    // whisper context
    auto & ctx = g_contexts[index];
    // 5 seconds interval
    const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
    while (g_running) {
        stream_set_status("waiting for audio ...");
        {
            std::unique_lock<std::mutex> lock(g_mutex);
            if (g_pcmf32.size() < 1024) {
                // not enough audio yet - release the lock before sleeping so set_audio() is not blocked
                lock.unlock();
                std::this_thread::sleep_for(std::chrono::milliseconds(10));
                continue;
            }
            // keep only the newest window_samples samples (or everything if there are fewer)
            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
            g_pcmf32.clear();
        }
        {
            const auto t_start = std::chrono::high_resolution_clock::now();
            stream_set_status("running whisper ...");
            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
            if (ret != 0) {
                printf("whisper_full() failed: %d\n", ret);
                break;
            }
            const auto t_end = std::chrono::high_resolution_clock::now();
            printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
        }
        {
            std::string text_heard;
            {
                const int n_segments = whisper_full_n_segments(ctx);
                // only the last segment is reported; when there are no segments there is
                // nothing to read (index n_segments - 1 would be -1, i.e. out of range)
                if (n_segments > 0) {
                    const char * text = whisper_full_get_segment_text(ctx, n_segments - 1);
                    printf("transcribed: %s\n", text);
                    text_heard += text;
                }
            }
            {
                std::lock_guard<std::mutex> lock(g_mutex);
                g_transcribed = text_heard;
            }
        }
    }
    if (index < g_contexts.size()) {
        whisper_free(g_contexts[index]);
        g_contexts[index] = nullptr;
    }
 }
 // Functions exposed to JavaScript on the Emscripten Module object:
 //   init(path_model)        -> 1-based instance id, or 0 on failure
 //   free(index)             -> asks the worker loop to stop
 //   set_audio(index, audio) -> 0 on success, -1 for an invalid id, -2 if the instance is not initialized
 //   get_transcribed()       -> latest transcription; each result is handed out once
 //   get_status()            -> forced status if one is set, otherwise the worker status
 //   set_status(status)      -> sets the forced status ("" removes the override)
 EMSCRIPTEN_BINDINGS(stream) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        // use the first free context slot
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    // there is a single worker thread: stop and join the previous one *before*
                    // raising g_running again, otherwise a worker that is still looping never
                    // observes the stop request and join() blocks forever
                    if (g_worker.joinable()) {
                        g_running = false;
                        g_worker.join();
                    }
                    g_running = true;
                    g_worker = std::thread([i]() {
                        stream_main(i);
                    });
                    // ids are 1-based so that 0 can signal failure
                    return i + 1;
                } else {
                    return (size_t) 0;
                }
            }
        }
        return (size_t) 0;
    }));
    emscripten::function("free", emscripten::optional_override([](size_t index) {
        // the id is not used: there is one worker, and it frees its own context when its loop ends
        (void) index;
        g_running = false;
    }));
    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
        // convert the 1-based id to a slot index; id 0 wraps around and is rejected by the range check
        --index;
        if (index >= g_contexts.size()) {
            return -1;
        }
        if (g_contexts[index] == nullptr) {
            return -2;
        }
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            const int n = audio["length"].as<int>();
            emscripten::val heap = emscripten::val::module_property("HEAPU8");
            emscripten::val memory = heap["buffer"];
            g_pcmf32.resize(n);
            // create a typed-array view of the same type as `audio` over g_pcmf32's storage
            // and let JavaScript copy the samples into it
            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
            memoryView.call<void>("set", audio);
        }
        return 0;
    }));
    emscripten::function("get_transcribed", emscripten::optional_override([]() {
        std::string transcribed;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            transcribed = std::move(g_transcribed);
            // a moved-from string is valid but unspecified - clear it explicitly so the
            // same text is never returned twice
            g_transcribed.clear();
        }
        return transcribed;
    }));
    emscripten::function("get_status", emscripten::optional_override([]() {
        std::string status;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            status = g_status_forced.empty() ? g_status : g_status_forced;
        }
        return status;
    }));
    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            g_status_forced = status;
        }
    }));
 }
--- a/examples/stream.wasm/index-tmpl.html
+++ b/examples/stream.wasm/index-tmpl.html
@ -0,0 +1,385 @@
 <!doctype html>
 <html lang="en-us">
    <head>
        <title>stream : Real-time Whisper transcription in WebAssembly</title>
        <style>
            #output {
                width: 100%;
                height: 100%;
                margin: 0 auto;
                margin-top: 10px;
                border-left: 0px;
                border-right: 0px;
                padding-left: 0px;
                padding-right: 0px;
                display: block;
                background-color: black;
                color: white;
                font-size: 10px;
                font-family: 'Lucida Console', Monaco, monospace;
                outline: none;
                white-space: pre;
                overflow-wrap: normal;
                overflow-x: scroll;
            }
        </style>
    </head>
    <body>
        <div id="main-container">
            <b>stream : Real-time Whisper transcription in WebAssembly</b>
            <br><br>
            You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
            <br><br>
            <hr>
            Select the model you would like to use, click the "Start" button and start speaking
            <br><br>
            <div id="model-whisper">
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <span id="fetch-whisper-progress"></span>
                <!--
                    <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
                -->
            </div>
            <br>
            <div id="input">
                <button id="start"  onclick="onStart()" disabled>Start</button>
                <button id="stop"   onclick="onStop()" disabled>Stop</button>
                <button id="clear"  onclick="clearCache()">Clear Cache</button>
            </div>
            <br>
            <div id="state">
                Status: <b><span id="state-status">not started</span></b>
                <pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
            </div>
            <hr>
            Debug output:
            <textarea id="output" rows="20"></textarea>
            <br>
            <b>Troubleshooting</b>
            <br><br>
            The page does some heavy computations, so make sure:
            <ul>
                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
            </ul>
            <div class="cell-version">
                <span>
                    |
                    Build time: <span class="nav-link">@GIT_DATE@</span> |
                    Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
                    Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
                    <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
                </span>
            </div>
        </div>
        <script type="text/javascript" src="helpers.js"></script>
        <script type='text/javascript'>
            // once the current recording holds more than this many seconds of audio,
            // the recorder is stopped and started again (see startRecording)
            const kRestartRecording_s = 15;
            // samples per second, used to convert kRestartRecording_s into a sample count
            const kSampleRate = 16000;
            // fall back to the webkit-prefixed constructors where the standard names are missing
            window.AudioContext = window.AudioContext || window.webkitAudioContext;
            window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
            // web audio context
            var context = null;
            // audio data
            // audio  - samples decoded from the current recording
            // audio0 - samples kept from the previous recording when the recorder is restarted
            var audio = null;
            var audio0 = null;
            // the stream instance
            // (value returned by Module.init; falsy until initialization succeeds)
            var instance = null;
            // model name
            var model_whisper = null;
            // Emscripten Module configuration: all output is routed to printTextarea
            // NOTE(review): printTextarea is not defined in this page - presumably it comes from helpers.js
            var Module = {
                print: printTextarea,
                printErr: printTextarea,
                setStatus: function(text) {
                    printTextarea('js: ' + text);
                },
                monitorRunDependencies: function(left) {
                },
                preRun: function() {
                    printTextarea('js: Preparing ...');
                },
                postRun: function() {
                    printTextarea('js: Initialized successfully!');
                }
            };
            //
            // fetch models
            //
            // IndexedDB settings.
            // NOTE(review): none of these is referenced by the script in this page;
            // presumably helpers.js reads them for the model cache - confirm
            let dbVersion = 1
            let dbName    = 'whisper.ggerganov.com';
            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
            // Write `buf` into the WASM filesystem as "/<fname>", replacing any existing
            // file of that name, then report the loaded model and enable the Start button.
            function storeFS(fname, buf) {
                // remove a previous copy first; a failing unlink is deliberately ignored
                try {
                    Module.FS_unlink(fname);
                } catch (ignored) {
                }

                Module.FS_createDataFile("/", fname, buf, true, true);
                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);

                var statusEl = document.getElementById('model-whisper-status');
                statusEl.innerHTML = 'loaded "' + model_whisper + '"!';

                // the buttons are only touched once a model has been selected
                if (model_whisper == null) {
                    return;
                }
                var btnStart = document.getElementById('start');
                var btnStop  = document.getElementById('stop' );
                btnStart.disabled = false;
                btnStop.disabled  = true;
            }
            // Start loading the selected Whisper model ('tiny.en' or 'base.en') through
            // loadRemote() and store it in the WASM filesystem as 'whisper.bin'.
            // While loading, the model buttons are hidden and progress is shown as a percentage;
            // if loading is cancelled the buttons are shown again and the status is cleared.
            function loadWhisper(model) {
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
                };
                // model sizes in MB, passed on to loadRemote()
                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
                };
                let url     = urls[model];
                let dst     = 'whisper.bin';
                let size_mb = sizes[model];
                model_whisper = model;
                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
                // the callbacks are declared locally; previously they were assigned without a
                // declaration, which created implicit globals
                const cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
                    el.innerHTML = Math.round(100*p) + '%';
                };
                const cbCancel = function() {
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };
                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }
            //
            // microphone
            //
            // MediaRecorder of the active capture; set back to null when capture is shut down
            var mediaRecorder = null;
            // true while capture should keep running; cleared by stopRecording()
            var doRecording = false;
            // Date.now() taken at the most recent startRecording() call
            var startTime = 0;
            // Request the end of capture: force the displayed status to "paused", clear the
            // doRecording flag and drop the buffered audio together with the audio context.
            function stopRecording() {
                Module.set_status("paused");
                doRecording = false;
                audio0 = audio = context = null;
            }
            // Start capturing microphone audio and keep feeding it to the stream instance.
            //
            // - The MediaRecorder delivers data every 5 seconds. Each time, everything recorded
            //   so far is decoded, rendered through an OfflineAudioContext, appended to the
            //   samples kept from the previous recording (audio0) and passed to Module.set_audio().
            // - A 250 ms timer watches the state: when doRecording has been cleared it shuts the
            //   capture down and resets the buttons; when the current recording exceeds
            //   kRestartRecording_s seconds it stops the recorder, whose onstop handler then
            //   calls startRecording() again.
            function startRecording() {
                if (!context) {
                    // NOTE(review): apart from sampleRate these look like microphone constraints
                    // rather than AudioContext options - confirm they have any effect here
                    context = new AudioContext({
                        sampleRate: 16000,
                        channelCount: 1,
                        echoCancellation: false,
                        autoGainControl:  true,
                        noiseSuppression: true,
                    });
                }
                Module.set_status("");
                document.getElementById('start').disabled = true;
                document.getElementById('stop').disabled = false;
                doRecording = true;
                startTime = Date.now();
                var chunks = [];
                var stream = null;
                // Stop the recorder and release the microphone. Safe to call when getUserMedia()
                // has not resolved yet or has failed (recorder/stream still null) and when the
                // recorder is already inactive (stop() would throw in that state).
                function releaseCapture() {
                    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
                        mediaRecorder.stop();
                    }
                    if (stream) {
                        stream.getTracks().forEach(function(track) {
                            track.stop();
                        });
                    }
                }
                navigator.mediaDevices.getUserMedia({audio: true, video: false})
                    .then(function(s) {
                        stream = s;
                        // Stop was pressed before microphone access was granted: the polling
                        // timer may already have shut down, so release the microphone here
                        if (!doRecording) {
                            releaseCapture();
                            return;
                        }
                        mediaRecorder = new MediaRecorder(stream);
                        mediaRecorder.ondataavailable = function(e) {
                            chunks.push(e.data);
                            // decode everything recorded so far, not just the newest chunk
                            var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
                            var reader = new FileReader();
                            reader.onload = function(event) {
                                var buf = new Uint8Array(reader.result);
                                // the context is dropped by stopRecording()
                                if (!context) {
                                    return;
                                }
                                context.decodeAudioData(buf.buffer, function(audioBuffer) {
                                    var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
                                    var source = offlineContext.createBufferSource();
                                    source.buffer = audioBuffer;
                                    source.connect(offlineContext.destination);
                                    source.start(0);
                                    offlineContext.startRendering().then(function(renderedBuffer) {
                                        audio = renderedBuffer.getChannelData(0);
                                        //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
                                        // previous recording (if any) followed by the current one
                                        var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
                                        if (audio0 != null) {
                                            audioAll.set(audio0, 0);
                                        }
                                        audioAll.set(audio, audio0 == null ? 0 : audio0.length);
                                        if (instance) {
                                            Module.set_audio(instance, audioAll);
                                        }
                                    });
                                }, function(e) {
                                    audio = null;
                                });
                            }
                            reader.readAsArrayBuffer(blob);
                        };
                        mediaRecorder.onstop = function(e) {
                            // a stop while recording is still wanted means "restart"
                            if (doRecording) {
                                setTimeout(function() {
                                    startRecording();
                                });
                            }
                        };
                        mediaRecorder.start(5000);
                    })
                    .catch(function(err) {
                        printTextarea('js: error getting audio stream: ' + err);
                    });
                var interval = setInterval(function() {
                    if (!doRecording) {
                        clearInterval(interval);
                        releaseCapture();
                        document.getElementById('start').disabled = false;
                        document.getElementById('stop').disabled  = true;
                        mediaRecorder = null;
                        return;
                    }
                    // if audio length is more than kRestartRecording_s seconds, restart recording
                    if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
                        //printTextarea('js: restarting recording');
                        clearInterval(interval);
                        audio0 = audio;
                        audio = null;
                        releaseCapture();
                    }
                }, 250);
            }
            //
            // main
            //
            // number of lines currently kept in transcribedAll
            var nLines = 0;
            // handle of the UI polling timer created in onStart()
            var intervalUpdate = null;
            // transcribed lines, each followed by '<br>'; rendered into #state-transcribed
            var transcribedAll = '';
            // "Start" button handler: initialize the stream instance on first use, start
            // recording and poll the module every 100 ms for new text and the current status.
            // Only the most recent lines of transcription are kept on screen.
            function onStart() {
                if (!instance) {
                    instance = Module.init('whisper.bin');
                    if (instance) {
                        printTextarea("js: whisper initialized, instance: " + instance);
                    }
                }
                if (!instance) {
                    printTextarea("js: failed to initialize whisper");
                    return;
                }
                startRecording();
                // Start can be pressed again after Stop: drop the previous poller first so
                // that timers do not pile up with every Stop/Start cycle
                if (intervalUpdate != null) {
                    clearInterval(intervalUpdate);
                }
                intervalUpdate = setInterval(function() {
                    var transcribed = Module.get_transcribed();
                    if (transcribed != null && transcribed.length > 1) {
                        transcribedAll += transcribed + '<br>';
                        nLines++;
                        // if more than 10 lines, remove the first line
                        if (nLines > 10) {
                            var i = transcribedAll.indexOf('<br>');
                            if (i > 0) {
                                // skip past the 4 characters of '<br>'
                                transcribedAll = transcribedAll.substring(i + 4);
                                nLines--;
                            }
                        }
                    }
                    document.getElementById('state-status').innerHTML = Module.get_status();
                    document.getElementById('state-transcribed').innerHTML = transcribedAll;
                }, 100);
            }
            // "Stop" button handler: delegates to stopRecording()
            function onStop() {
                stopRecording();
            }
        </script>
        <script type="text/javascript" src="stream.js"></script>
    </body>
 </html>
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@ -21,3 +21,7 @@ brew install sdl2
 make stream
 ```
 ## Web version
 This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -4,11 +4,6 @@
 #include "whisper.h"
 // third-party utilities
 // use your favorite implementations
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
 #include <SDL.h>
 #include <SDL_audio.h>
@ -35,7 +30,6 @@ std::string to_timestamp(int64_t t) {
 // command-line parameters
 struct whisper_params {
    int32_t seed       = -1; // RNG seed, not used currently
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t step_ms    = 3000;
    int32_t length_ms  = 10000;
@ -43,12 +37,11 @@ struct whisper_params {
    int32_t max_tokens = 32;
    int32_t audio_ctx  = 0;
-    bool speed_up             = false;
+    bool speed_up      = false;
-    bool verbose              = false;
+    bool translate     = false;
-    bool translate            = false;
+    bool no_context    = true;
-    bool no_context           = true;
+    bool print_special = false;
-    bool print_special_tokens = false;
+    bool no_timestamps = true;
    bool no_timestamps        = true;
    std::string language  = "en";
    std::string model     = "models/ggml-base.en.bin";
@ -61,47 +54,24 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
-        if (arg == "-s" || arg == "--seed") {
+        if (arg == "-h" || arg == "--help") {
            params.seed = std::stoi(argv[++i]);
        } else if (arg == "-t" || arg == "--threads") {
            params.n_threads = std::stoi(argv[++i]);
        } else if (arg == "--step") {
            params.step_ms = std::stoi(argv[++i]);
        } else if (arg == "--length") {
            params.length_ms = std::stoi(argv[++i]);
        } else if (arg == "-c" || arg == "--capture") {
            params.capture_id = std::stoi(argv[++i]);
        } else if (arg == "-mt" || arg == "--max_tokens") {
            params.max_tokens = std::stoi(argv[++i]);
        } else if (arg == "-ac" || arg == "--audio_ctx") {
            params.audio_ctx = std::stoi(argv[++i]);
        } else if (arg == "-su" || arg == "--speed-up") {
            params.speed_up = true;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "--translate") {
            params.translate = true;
        } else if (arg == "-kc" || arg == "--keep-context") {
            params.no_context = false;
        } else if (arg == "-l" || arg == "--language") {
            params.language = argv[++i];
            if (whisper_lang_id(params.language.c_str()) == -1) {
                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
                whisper_print_usage(argc, argv, params);
                exit(0);
            }
        } else if (arg == "-ps" || arg == "--print_special") {
            params.print_special_tokens = true;
        } else if (arg == "-nt" || arg == "--no_timestamps") {
            params.no_timestamps = true;
        } else if (arg == "-m" || arg == "--model") {
            params.model = argv[++i];
        } else if (arg == "-f" || arg == "--file") {
            params.fname_out = argv[++i];
        } else if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
-        } else {
+        }
        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
        else if (                 arg == "--step")          { params.step_ms       = std::stoi(argv[++i]); }
        else if (                 arg == "--length")        { params.length_ms     = std::stoi(argv[++i]); }
        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-kc"  || arg == "--keep-context")  { params.no_context    = false; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
        else if (arg == "-m"   || arg == "--model")         { params.model         = argv[++i]; }
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
@ -116,23 +86,20 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
+    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -s SEED,  --seed SEED      RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "            --step N        [%-7d] audio step size in milliseconds\n",             params.step_ms);
-    fprintf(stderr, "            --step N         audio step size in milliseconds (default: %d)\n", params.step_ms);
+    fprintf(stderr, "            --length N      [%-7d] audio length in milliseconds\n",                params.length_ms);
-    fprintf(stderr, "            --length N       audio length in milliseconds (default: %d)\n", params.length_ms);
+    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -c ID,    --capture ID     capture device ID (default: -1)\n");
+    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -mt N,    --max_tokens N   maximum number of tokens per audio chunk (default: %d)\n", params.max_tokens);
+    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -ac N,    --audio_ctx N    audio context size (default: %d, 0 - all)\n", params.audio_ctx);
+    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -su,      --speed-up       speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -v,       --verbose        verbose output\n");
+    fprintf(stderr, "  -kc,      --keep-context  [%-7s] keep context between audio chunks\n",           params.no_context ? "false" : "true");
-    fprintf(stderr, "            --translate      translate from source language to english\n");
+    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -kc,      --keep-context   keep text context from earlier audio (default: false)\n");
+    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                  params.model.c_str());
-    fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "  -f FNAME, --file FNAME     text output file name (default: no output to file)\n");
    fprintf(stderr, "\n");
 }
@ -148,56 +115,51 @@ bool audio_sdl_init(const int capture_id) {
        return false;
    }
-    if (g_dev_id_in == 0) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
        SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-        if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-            return (1);
+        return (1);
-        }
+    }
-        SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-        {
+    {
-            int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-            fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-            for (int i = 0; i < nDevices; i++) {
+        for (int i = 0; i < nDevices; i++) {
-                fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
            }
        }
    }
-    if (g_dev_id_in == 0) {
+    SDL_AudioSpec capture_spec_requested;
-        SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
        SDL_AudioSpec capture_spec_obtained;
-        SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_requested);
-        SDL_zero(capture_spec_obtained);
+    SDL_zero(capture_spec_obtained);
-        capture_spec_requested.freq     = WHISPER_SAMPLE_RATE;
+    capture_spec_requested.freq     = WHISPER_SAMPLE_RATE;
-        capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.format   = AUDIO_F32;
-        capture_spec_requested.channels = 1;
+    capture_spec_requested.channels = 1;
-        capture_spec_requested.samples  = 1024;
+    capture_spec_requested.samples  = 1024;
-        if (capture_id >= 0) {
+    if (capture_id >= 0) {
-            fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
-            g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+        g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-        } else {
+    } else {
-            fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-            g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+        g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-        }
+    }
-        if (!g_dev_id_in) {
+    if (!g_dev_id_in) {
-            fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-            g_dev_id_in = 0;
+        g_dev_id_in = 0;
-        } else {
+    } else {
-            fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
+        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
-            fprintf(stderr, "%s:     - sample rate:       %d\n", __func__, capture_spec_obtained.freq);
+        fprintf(stderr, "%s:     - sample rate:       %d\n", __func__, capture_spec_obtained.freq);
-            fprintf(stderr, "%s:     - format:            %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
+        fprintf(stderr, "%s:     - format:            %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
-            fprintf(stderr, "%s:     - channels:          %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
+        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
-            fprintf(stderr, "%s:     - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
+        fprintf(stderr, "%s:     - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
        }
    }
    return true;
 }
@ -211,10 +173,6 @@ int main(int argc, char ** argv) {
        return 1;
    }
    if (params.seed < 0) {
        params.seed = time(NULL);
    }
    // init audio
    if (!audio_sdl_init(params.capture_id)) {
@ -222,6 +180,12 @@ int main(int argc, char ** argv) {
        return 1;
    }
    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }
    // whisper init
    struct whisper_context * ctx = whisper_init(params.model.c_str());
@ -280,16 +244,22 @@ int main(int argc, char ** argv) {
    // main audio loop
    while (is_running) {
-        // process SDL events:
+        // handle Ctrl + C
-        SDL_Event event;
+        {
-        while (SDL_PollEvent(&event)) {
+            SDL_Event event;
-            switch (event.type) {
+            while (SDL_PollEvent(&event)) {
-                case SDL_QUIT:
+                switch (event.type) {
-                    {
+                    case SDL_QUIT:
-                        is_running = false;
+                        {
-                    } break;
+                            is_running = false;
-                default:
+                        } break;
-                    break;
+                    default:
                        break;
                }
            }
            if (!is_running) {
                break;
            }
        }
@ -331,22 +301,22 @@ int main(int argc, char ** argv) {
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-            wparams.print_progress       = false;
+            wparams.print_progress   = false;
-            wparams.print_special_tokens = params.print_special_tokens;
+            wparams.print_special    = params.print_special;
-            wparams.print_realtime       = false;
+            wparams.print_realtime   = false;
-            wparams.print_timestamps     = !params.no_timestamps;
+            wparams.print_timestamps = !params.no_timestamps;
-            wparams.translate            = params.translate;
+            wparams.translate        = params.translate;
-            wparams.no_context           = true;
+            wparams.no_context       = true;
-            wparams.single_segment       = true;
+            wparams.single_segment   = true;
-            wparams.max_tokens           = params.max_tokens;
+            wparams.max_tokens       = params.max_tokens;
-            wparams.language             = params.language.c_str();
+            wparams.language         = params.language.c_str();
-            wparams.n_threads            = params.n_threads;
+            wparams.n_threads        = params.n_threads;
-            wparams.audio_ctx            = params.audio_ctx;
+            wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up             = params.speed_up;
+            wparams.speed_up         = params.speed_up;
-            wparams.prompt_tokens        = params.no_context ? nullptr : prompt_tokens.data();
+            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens      = params.no_context ? 0       : prompt_tokens.size();
+            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
@ -414,6 +384,10 @@ int main(int argc, char ** argv) {
        }
    }
    if (g_dev_id_in >= 0) {
        SDL_CloseAudioDevice(g_dev_id_in);
    }
    whisper_print_timings(ctx);
    whisper_free(ctx);
--- a/examples/talk.wasm/CMakeLists.txt
+++ b/examples/talk.wasm/CMakeLists.txt
@ -45,3 +45,4 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
 set(TARGET talk.wasm)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
--- a/examples/talk.wasm/README.md
+++ b/examples/talk.wasm/README.md
@ -2,9 +2,9 @@
 Talk with an Artificial Intelligence in your browser:
-https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4
+[https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)
-Online demo: https://talk.ggerganov.com
+Online demo: https://whisper.ggerganov.com/talk/
 ## How it works?
@ -50,6 +50,21 @@ on a phone or a tablet. Hopefully, in the near future this will become supported
 - Better UI (contributions are welcome)
 - Better GPT-2 prompting
 ## Build instructions
 ```bash
 # build using Emscripten (v3.1.2)
 git clone https://github.com/ggerganov/whisper.cpp
 cd whisper.cpp
 mkdir build-em && cd build-em
 emcmake cmake ..
 make -j
 # copy the produced page to your HTTP path
 cp bin/talk.wasm/*       /path/to/html/
 cp bin/libtalk.worker.js /path/to/html/
 ```
 ## Feedback
 If you have any comments or ideas for improvement, please drop a comment in the following discussion:
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@ -51,20 +51,20 @@ void talk_main(size_t index) {
    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
-    wparams.n_threads            = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
+    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-    wparams.offset_ms            = 0;
+    wparams.offset_ms        = 0;
-    wparams.translate            = false;
+    wparams.translate        = false;
-    wparams.no_context           = true;
+    wparams.no_context       = true;
-    wparams.single_segment       = true;
+    wparams.single_segment   = true;
-    wparams.print_realtime       = false;
+    wparams.print_realtime   = false;
-    wparams.print_progress       = false;
+    wparams.print_progress   = false;
-    wparams.print_timestamps     = true;
+    wparams.print_timestamps = true;
-    wparams.print_special_tokens = false;
+    wparams.print_special    = false;
-    wparams.max_tokens           = 32;
+    wparams.max_tokens       = 32;
-    wparams.audio_ctx            = 768;
+    wparams.audio_ctx        = 768; // partial encoder context for better performance
-    wparams.language             = "en";
+    wparams.language         = "en";
    g_gpt2 = gpt2_init("gpt-2.bin");
@ -75,9 +75,9 @@ void talk_main(size_t index) {
    // whisper context
    auto & ctx = g_contexts[index];
-    const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
+    const int64_t step_samples   = 2*WHISPER_SAMPLE_RATE;
    const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
    const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
    const int64_t step_ms        = (step_samples*1000)/WHISPER_SAMPLE_RATE;
    auto t_last = std::chrono::high_resolution_clock::now();
@ -111,7 +111,7 @@ void talk_main(size_t index) {
            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
        }
-        // if energy in during last second is above threshold, then skip
+        // VAD: if energy in during last second is above threshold, then skip
        {
            float energy_all = 0.0f;
            float energy_1s  = 0.0f;
@ -133,13 +133,11 @@ void talk_main(size_t index) {
            }
        }
-        talk_set_status("processing ...");
+        talk_set_status("processing audio (whisper)...");
        g_force_speak = false;
        t_last = t_now;
-        {
+        if (!g_force_speak) {
            const auto t_start = std::chrono::high_resolution_clock::now();
            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
@ -156,18 +154,22 @@ void talk_main(size_t index) {
        {
            std::string text_heard;
-            const int n_segments = whisper_full_n_segments(ctx);
+            if (!g_force_speak) {
-            for (int i = n_segments - 1; i < n_segments; ++i) {
+                const int n_segments = whisper_full_n_segments(ctx);
-                const char * text = whisper_full_get_segment_text(ctx, i);
+                for (int i = n_segments - 1; i < n_segments; ++i) {
                    const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                    const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-                printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                    printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
-                text_heard += text;
+                    text_heard += text;
                }
            }
            g_force_speak = false;
            // remove text between brackets using regex
            {
                std::regex re("\\[.*?\\]");
@ -190,7 +192,7 @@ void talk_main(size_t index) {
            text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
            text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-            talk_set_status("'" + text_heard + "' - thinking how to respond ...");
+            talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");
            const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -812,9 +812,9 @@ I'm fine, thanks. How are you?
 Thanks, I'm fine too. What are you doing?
 I'm just sitting here.
 It's a lovely day, isn't it?
-Yes, it is.
+Yes, it is. I love the weather this time of year.
-Did you know that I'm a robot?
+I wish it would rain a little bit.
-I wasn't aware of that.
+Me too.
 )";
    std::mt19937 rng;
--- a/examples/talk.wasm/index-tmpl.html
+++ b/examples/talk.wasm/index-tmpl.html
@ -35,7 +35,7 @@
            <ul>
                <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
-                <li><a href="https://github.com/ggerganov/ggml/tree/master/examples/gpt-2">OpenAI's GPT-2</a> to generate text responses</li>
+                <li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
                <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
            </ul>
@ -51,7 +51,7 @@
            <br><br>
            <div id="model-whisper">
-                <span id="model-whisper-status">Whisper model:</span>
+                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <span id="fetch-whisper-progress"></span>
@ -64,7 +64,7 @@
            <br>
            <div id="model-gpt-2">
-                <span id="model-gpt-2-status">GPT-2 model:</span>
+                GPT-2 model: <span id="model-gpt-2-status"></span>
                <button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
                <!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
                <span id="fetch-gpt-2-progress"></span>
@ -143,7 +143,7 @@
            <br><br>
-            Here is a short video of the demo in action: <a href="https://youtu.be/2om-7tFMaNs">https://youtu.be/2om-7tFMaNs</a>
+            Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>
            <br><br>
@ -158,20 +158,8 @@
            </div>
        </div>
        <script type="text/javascript" src="helpers.js"></script>
        <script type='text/javascript'>
            // Logger used by the page: every message is written to the browser
            // console and, when an element with id 'output' exists, appended to
            // that element's value as a new line.
            //
            // The element is looked up once, when this IIFE runs, and its value
            // is reset to the empty string at that moment.
            //
            // text: the message to print; when more than one argument is passed,
            //       all arguments are joined with single spaces.
            var printTextarea = (function() {
                    var element = document.getElementById('output');
                    // was `element.alue`, which only created a stray property and
                    // left the previous contents of the element in place
                    if (element) element.value = ''; // clear browser cache
                    return function(text) {
                        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
                        console.log(text);
                        if (element) {
                            element.value += text + "\n";
                            element.scrollTop = element.scrollHeight; // focus on bottom
                        }
                    };
                })();
            const kRestartRecording_s = 15;
            const kSampleRate = 16000;
@ -218,6 +206,7 @@
                    if (voices.length == 0) {
                        el.innerHTML = '<option value="0">No voices available</option>';
                    } else {
                        // populate voice list
                        var n = 0;
                        voices.forEach(function(voice, i) {
                            if (!voice.lang.startsWith('en')) return;
@ -245,17 +234,14 @@
                }
            };
            // Reinterpret the bytes of one typed array as another typed array type.
            //
            // src:  source typed array; its elements are copied, not shared
            // type: typed-array constructor for the result (e.g. Uint8Array)
            //
            // Returns a new `type` view over a freshly allocated ArrayBuffer of
            // src.byteLength bytes holding a copy of src's contents. The typed-array
            // constructor throws a RangeError when src.byteLength is not a multiple
            // of the element size of `type`.
            function convertTypedArray(src, type) {
                var buffer = new ArrayBuffer(src.byteLength);
                // copy src into the new buffer through a view of src's own type;
                // set() returns undefined, so there is nothing worth keeping
                new src.constructor(buffer).set(src);
                return new type(buffer);
            }
            //
            // fetch models
            //
            let dbVersion = 1
            let dbName    = 'whisper.ggerganov.com';
            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
            function storeFS(fname, buf) {
                // write to WASM file using FS_createDataFile
                // if the file exists, delete it
@ -267,180 +253,25 @@
                Module.FS_createDataFile("/", fname, buf, true, true);
-                printTextarea('js: stored model: ' + fname + ' size: ' + buf.length);
+                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
                if (fname == 'whisper.bin') {
-                    document.getElementById('model-whisper').innerHTML = 'Whisper model: loaded "' + model_whisper + '"!';
+                    document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
                } else if (fname == 'gpt-2.bin') {
-                    document.getElementById('model-gpt-2').innerHTML = 'GPT-2 model: loaded "' + model_gpt_2 + '"!';
+                    document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
                }
                if (model_whisper != null && model_gpt_2 != null) {
                    document.getElementById('start').disabled = false;
-                    document.getElementById('stop').disabled  = false;
+                    document.getElementById('stop' ).disabled = false;
                    document.getElementById('voice').disabled = false;
                }
            }
            let dbVersion = 1
            let dbName    = 'talk.ggerganov.com';
            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
            // Download `url` with the Fetch API, streaming the body so progress can
            // be reported while the transfer is running.
            //
            // url:        address of the file to download
            // elProgress: element whose innerHTML receives the percentage; only
            //             touched when the response has a content-length header
            //
            // Resolves to a Uint8Array with the whole body, or to undefined when
            // the response status is not ok.
            async function fetchRemote(url, elProgress) {
                printTextarea('js: downloading with fetch()...');

                const requestOptions = {
                    method: 'GET',
                    headers: {
                        'Content-Type': 'application/octet-stream',
                    },
                };
                const response = await fetch(url, requestOptions);
                if (!response.ok) {
                    printTextarea('js: failed to fetch ' + url);
                    return;
                }

                const lengthHeader = response.headers.get('content-length');
                const totalBytes = parseInt(lengthHeader, 10);
                const stream = response.body.getReader();

                const parts = [];
                let bytesSeen = 0;
                let lastDecile = -1;

                // pull chunks until the stream reports completion
                for (let step = await stream.read(); !step.done; step = await stream.read()) {
                    parts.push(step.value);
                    bytesSeen += step.value.length;

                    // without a content-length there is no total to compare against
                    if (!lengthHeader) {
                        continue;
                    }

                    elProgress.innerHTML = Math.round((bytesSeen / totalBytes) * 100) + '%';

                    // log only when the rounded tenth changes, not on every chunk
                    const decile = Math.round((bytesSeen / totalBytes) * 10);
                    if (decile != lastDecile) {
                        printTextarea('js: fetching ' + 10*decile + '% ...');
                        lastDecile = decile;
                    }
                }

                // stitch the chunks into one contiguous array
                const merged = new Uint8Array(bytesSeen);
                let offset = 0;
                parts.forEach(function (part) {
                    merged.set(part, offset);
                    offset += part.length;
                });
                return merged;
            }
            // Load a model file, using the browser's IndexedDB as a cache.
            // - check if the data is already in the IndexedDB
            // - if not, fetch it from the remote URL and store it in the IndexedDB
            // - hand the bytes to storeFS() under the name `dst`
            //
            // url:        remote location of the file; also used as the key of the
            //             entry in the 'models' object store
            // dst:        file name passed to storeFS()
            // elProgress: element forwarded to fetchRemote() for progress display
            // size_mb:    size shown in the download confirmation dialog
            //
            // Everything after the indexedDB.open() call happens in callbacks;
            // the function itself returns nothing.
            function loadRemote(url, dst, elProgress, size_mb) {
                // query the storage quota and print it
                navigator.storage.estimate().then(function (estimate) {
                    printTextarea('js: storage quota: ' + estimate.quota + ' bytes');
                    printTextarea('js: storage usage: ' + estimate.usage + ' bytes');
                });
                // check if the data is already in the IndexedDB
                var request = indexedDB.open(dbName, dbVersion);
                // create the 'models' store for version 1, otherwise empty the existing store
                request.onupgradeneeded = function (event) {
                    var db = event.target.result;
                    if (db.version == 1) {
                        var objectStore = db.createObjectStore('models', { autoIncrement: false });
                        printTextarea('js: created IndexedDB ' + db.name + ' version ' + db.version);
                    } else {
                        // clear the database
                        var objectStore = event.currentTarget.transaction.objectStore('models');
                        objectStore.clear();
                        printTextarea('js: cleared IndexedDB ' + db.name + ' version ' + db.version);
                    }
                };
                request.onsuccess = function (event) {
                    var db = event.target.result;
                    var transaction = db.transaction(['models'], 'readonly');
                    var objectStore = transaction.objectStore('models');
                    // NOTE: this `request` shadows the open-request above; inside this
                    // callback the name refers to the get() request
                    var request = objectStore.get(url);
                    request.onsuccess = function (event) {
                        if (request.result) {
                            // cache hit: use the stored bytes directly
                            printTextarea('js: "' + url + '" is already in the IndexedDB');
                            storeFS(dst, request.result);
                        } else {
                            // data is not in the IndexedDB
                            printTextarea('js: "' + url + '" is not in the IndexedDB');
                            // alert and ask the user to confirm
                            if (!confirm('You are about to download ' + size_mb + ' MB of data.\nThe model data will be cached in the browser for future use.\n\nPress OK to continue.')) {
                                // user declined: make the download buttons visible again and stop
                                var el;
                                el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                                el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
                                el = document.getElementById('fetch-gpt-2-small') ;    if (el) el.style.display = 'inline-block';
                                return;
                            }
                            fetchRemote(url, elProgress).then(function (data) {
                                // data is undefined when fetchRemote() got a non-ok response
                                if (data) {
                                    // store the data in the IndexedDB
                                    var request = indexedDB.open(dbName, dbVersion);
                                    request.onsuccess = function (event) {
                                        var db = event.target.result;
                                        var transaction = db.transaction(['models'], 'readwrite');
                                        var objectStore = transaction.objectStore('models');
                                        var request = objectStore.put(data, url);
                                        // storeFS() is called only after the put succeeded
                                        request.onsuccess = function (event) {
                                            printTextarea('js: "' + url + '" stored in the IndexedDB');
                                            storeFS(dst, data);
                                        };
                                        // on failure the downloaded data is only reported, not passed to storeFS()
                                        request.onerror = function (event) {
                                            printTextarea('js: failed to store "' + url + '" in the IndexedDB');
                                        };
                                    };
                                }
                            });
                        }
                    };
                    request.onerror = function (event) {
                        printTextarea('js: failed to get data from the IndexedDB');
                    };
                };
                // failures of the initial open request are logged and nothing else happens
                request.onerror = function (event) {
                    printTextarea('js: failed to open IndexedDB');
                };
                request.onblocked = function (event) {
                    printTextarea('js: failed to open IndexedDB: blocked');
                };
                request.onabort = function (event) {
                    printTextarea('js: failed to open IndexedDB: abort');
                };
            }
            function loadWhisper(model) {
                let urls = {
-                    'tiny.en': 'https://talk.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
-                    'base.en': 'https://talk.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
                };
                let sizes = {
@ -450,22 +281,33 @@
                let url     = urls[model];
                let dst     = 'whisper.bin';
                let el      = document.getElementById('fetch-whisper-progress');
                let size_mb = sizes[model];
                model_whisper = model;
                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('model-whisper-status').innerHTML = 'Whisper model: loading "' + model + '" ... ';
+                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
                cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
                    el.innerHTML = Math.round(100*p) + '%';
                };
                cbCancel = function() {
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };
-                loadRemote(url, dst, el, size_mb);
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }
            function loadGPT2(model) {
                let urls = {
-                    'small':  'https://talk.ggerganov.com/ggml-model-gpt-2-117M.bin',
+                    'small':  'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
-                    'medium': 'https://talk.ggerganov.com/ggml-model-gpt-2-345M.bin',
+                    'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
                };
                let sizes = {
@ -475,15 +317,25 @@
                let url     = urls[model];
                let dst     = 'gpt-2.bin';
                let el      = document.getElementById('fetch-gpt-2-progress');
                let size_mb = sizes[model];
                model_gpt_2 = model;
                document.getElementById('fetch-gpt-2-small').style.display = 'none';
-                document.getElementById('model-gpt-2-status').innerHTML = 'GPT-2 model: loading "' + model + '" ... ';
+                document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
                cbProgress = function(p) {
                    let el = document.getElementById('fetch-gpt-2-progress');
                    el.innerHTML = Math.round(100*p) + '%';
                };
                cbCancel = function() {
                    var el;
                    el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
                };
-                loadRemote(url, dst, el, size_mb);
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }
            //
@ -507,7 +359,7 @@
                    context = new AudioContext({
                        sampleRate: 16000,
                        channelCount: 1,
-                        echoCancellation: true,
+                        echoCancellation: false,
                        autoGainControl:  true,
                        noiseSuppression: true,
                    });
@ -652,12 +504,6 @@
                Module.force_speak(instance);
            }
            // Ask the user for confirmation and, if granted, delete the IndexedDB database named by dbName
            async function clearCache() {
                const confirmed = confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.');
                if (!confirmed) {
                    return;
                }
                indexedDB.deleteDatabase(dbName);
            }
            //
            // main
            //
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -161,14 +161,14 @@ void AudioInputCallback(void * inUserData,
    // run the model
    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-    params.print_realtime       = true;
+    params.print_realtime   = true;
-    params.print_progress       = false;
+    params.print_progress   = false;
-    params.print_timestamps     = true;
+    params.print_timestamps = true;
-    params.print_special_tokens = false;
+    params.print_special    = false;
-    params.translate            = false;
+    params.translate        = false;
-    params.language             = "en";
+    params.language         = "en";
-    params.n_threads            = 4;
+    params.n_threads        = 4;
-    params.offset_ms            = 0;
+    params.offset_ms        = 0;
    CFTimeInterval startTime = CACurrentMediaTime();
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -1,4 +1,5 @@
 set(TARGET whisper.wasm)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
 configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js  COPYONLY)
--- a/examples/whisper.wasm/README.md
+++ b/examples/whisper.wasm/README.md
@ -26,10 +26,9 @@ Link: https://whisper.ggerganov.com
 ![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png)
 ## Build instructions
-```bash
+```bash (v3.1.2)
 # build using Emscripten
 git clone https://github.com/ggerganov/whisper.cpp
 cd whisper.cpp
@ -38,6 +37,6 @@ emcmake cmake ..
 make -j
 # copy the produced page to your HTTP path
-cp bin/whisper.wasm/index.html /path/to/html/
+cp bin/whisper.wasm/*       /path/to/html/
-cp bin/whisper.wasm/whisper.js /path/to/html/
+cp bin/libwhisper.worker.js /path/to/html/
-cp bin/libwhisper.worker.js    /path/to/html/
+```
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -45,13 +45,14 @@
            <br><br><hr>
            <div id="model">
-                Model:
+                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-tiny"    onclick="loadWhisper('tiny')">tiny (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <button id="fetch-whisper-base"    onclick="loadWhisper('base')">base (142 MB)</button>
                <span id="fetch-whisper-progress"></span>
-                <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
            </div>
            <br>
@ -185,6 +186,7 @@
            </div>
        </div>
        <script type="text/javascript" src="helpers.js"></script>
        <script type='text/javascript'>
            // TODO: convert audio buffer to WAV
            function setAudio(audio) {
@ -204,28 +206,15 @@
            function changeInput(input) {
                if (input == 'file') {
                    document.getElementById('input_file').style.display = 'block';
-                    document.getElementById('input_mic').style.display = 'none';
+                    document.getElementById('input_mic' ).style.display = 'none';
-                    document.getElementById('progress').style.display = 'none';
+                    document.getElementById('progress'  ).style.display = 'none';
                } else {
                    document.getElementById('input_file').style.display = 'none';
-                    document.getElementById('input_mic').style.display = 'block';
+                    document.getElementById('input_mic' ).style.display = 'block';
-                    document.getElementById('progress').style.display = 'block';
+                    document.getElementById('progress'  ).style.display = 'block';
                }
            }
            // Build the logging function used by the page.
            // The enclosing function looks up the 'output' element once and clears it, then returns a
            // function that joins multiple arguments with spaces, logs the text to the console and,
            // when the element exists, appends the text as a new line and scrolls to the bottom.
            var printTextarea = (function() {
                    var element = document.getElementById('output');
                    // fixed typo: this used to assign to `element.alue`, which only created a stray
                    // property and left any previously restored text in the element
                    if (element) element.value = ''; // clear browser cache
                    return function(text) {
                        // called with several arguments -> print them as one space-separated line
                        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
                        console.log(text);
                        if (element) {
                            element.value += text + "\n";
                            element.scrollTop = element.scrollHeight; // focus on bottom
                        }
                    };
                })();
            var Module = {
                print: printTextarea,
                printErr: printTextarea,
@ -250,7 +239,7 @@
            // the whisper instance
            var instance = null;
-            var model_fname = '';
+            var model_whisper = '';
            // helper function
            function convertTypedArray(src, type) {
@ -278,8 +267,11 @@
                Module.FS_createDataFile("/", fname, buf, true, true);
-                model_fname = fname;
+                model_whisper = fname;
-                printTextarea('js: stored model: ' + fname + ' size: ' + buf.length);
+
                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
            }
            function loadFile(event, fname) {
@ -288,8 +280,8 @@
                    return;
                }
-                printTextarea("js: loading model: " + file.name + ", size: " + file.size + " bytes");
+                printTextarea("loadFile: loading model: " + file.name + ", size: " + file.size + " bytes");
-                printTextarea('js: please wait ...');
+                printTextarea('loadFile: please wait ...');
                var reader = new FileReader();
                reader.onload = function(event) {
@ -300,160 +292,10 @@
                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('fetch-whisper-tiny').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
-                document.getElementById('fetch-whisper-base').style.display = 'none';
+                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
-            }
+                document.getElementById('whisper-file'         ).style.display = 'none';
-
+                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
            // Download `url` with the Fetch API, reading the response body chunk by chunk.
            // When the response carries a content-length header, the percentage is written to
            // `elProgress` and a log line is printed each time the rounded 10% step changes.
            // Resolves to a Uint8Array holding all received bytes, or to undefined when the
            // response status is not OK.
            async function fetchRemote(url, elProgress) {
                printTextarea('js: downloading with fetch()...');
                const response = await fetch(url, {
                    method: 'GET',
                    headers: {
                        'Content-Type': 'application/octet-stream',
                    },
                });
                if (!response.ok) {
                    printTextarea('js: failed to fetch ' + url);
                    return;
                }
                const contentLength = response.headers.get('content-length');
                const total = parseInt(contentLength, 10);
                const reader = response.body.getReader();
                const parts = [];
                let nReceived = 0;
                let lastStep = -1;
                for (;;) {
                    const { done, value } = await reader.read();
                    if (done) {
                        break;
                    }
                    parts.push(value);
                    nReceived += value.length;
                    // without a content-length header there is nothing to compute a percentage against
                    if (!contentLength) {
                        continue;
                    }
                    const fraction = nReceived / total;
                    // update progress bar element with the new percentage
                    elProgress.innerHTML = Math.round(fraction * 100) + '%';
                    const step = Math.round(fraction * 10);
                    if (step != lastStep) {
                        printTextarea('js: fetching ' + 10*step + '% ...');
                        lastStep = step;
                    }
                }
                // concatenate the received chunks into one contiguous buffer
                const result = new Uint8Array(nReceived);
                let offset = 0;
                for (const part of parts) {
                    result.set(part, offset);
                    offset += part.length;
                }
                return result;
            }
            // load remote data
            // - check if the data is already in the IndexedDB
            // - if not, fetch it from the remote URL and store it in the IndexedDB
            // - store it in WASM memory
            //
            // url        - remote location of the data; also used as the key in the 'models' object store
            // dst        - name passed to storeFS() together with the data
            // elProgress - element handed to fetchRemote() for progress reporting
            // size_mb    - size shown to the user in the download confirmation dialog
            function loadRemote(url, dst, elProgress, size_mb) {
                // query the storage quota and print it
                navigator.storage.estimate().then(function (estimate) {
                    printTextarea('js: storage quota: ' + estimate.quota + ' bytes');
                    printTextarea('js: storage usage: ' + estimate.usage + ' bytes');
                });
                // check if the data is already in the IndexedDB
                var request = indexedDB.open(dbName, dbVersion);
                // fired by IndexedDB when the database is created or opened with a higher version
                request.onupgradeneeded = function (event) {
                    var db = event.target.result;
                    if (db.version == 1) {
                        // version 1: create the 'models' store (keys are supplied explicitly on put)
                        var objectStore = db.createObjectStore('models', { autoIncrement: false });
                        printTextarea('js: created IndexedDB ' + db.name + ' version ' + db.version);
                    } else {
                        // clear the database
                        var objectStore = event.currentTarget.transaction.objectStore('models');
                        objectStore.clear();
                        printTextarea('js: cleared IndexedDB ' + db.name + ' version ' + db.version);
                    }
                };
                request.onsuccess = function (event) {
                    var db = event.target.result;
                    var transaction = db.transaction(['models'], 'readonly');
                    var objectStore = transaction.objectStore('models');
                    // note: this `request` shadows the outer open request inside this callback
                    var request = objectStore.get(url);
                    request.onsuccess = function (event) {
                        if (request.result) {
                            // cache hit: hand the stored data straight to storeFS()
                            printTextarea('js: "' + url + '" is already in the IndexedDB');
                            storeFS(dst, request.result);
                        } else {
                            // data is not in the IndexedDB
                            printTextarea('js: "' + url + '" is not in the IndexedDB');
                            // alert and ask the user to confirm
                            if (!confirm('You are about to download ' + size_mb + ' MB of data.\nThe model data will be cached in the browser for future use.\n\nPress OK to continue.')) {
                                // user declined: make the model download buttons visible again
                                // NOTE(review): the element ids are hard-coded here - confirm they match the hosting page
                                var el;
                                el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                                el = document.getElementById('fetch-whisper-tiny'); if (el) el.style.display = 'inline-block';
                                el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
                                el = document.getElementById('fetch-whisper-base'); if (el) el.style.display = 'inline-block';
                                return;
                            }
                            fetchRemote(url, elProgress).then(function (data) {
                                // nothing is stored when fetchRemote() resolved without data
                                if (data) {
                                    // store the data in the IndexedDB
                                    // NOTE(review): no onerror handler is attached to this second open request
                                    var request = indexedDB.open(dbName, dbVersion);
                                    request.onsuccess = function (event) {
                                        var db = event.target.result;
                                        var transaction = db.transaction(['models'], 'readwrite');
                                        var objectStore = transaction.objectStore('models');
                                        var request = objectStore.put(data, url);
                                        request.onsuccess = function (event) {
                                            // storeFS() is only called after the put succeeded
                                            printTextarea('js: "' + url + '" stored in the IndexedDB');
                                            storeFS(dst, data);
                                        };
                                        request.onerror = function (event) {
                                            printTextarea('js: failed to store "' + url + '" in the IndexedDB');
                                        };
                                    };
                                }
                            });
                        }
                    };
                    request.onerror = function (event) {
                        printTextarea('js: failed to get data from the IndexedDB');
                    };
                };
                // failure modes of the initial open request - each one is only logged
                request.onerror = function (event) {
                    printTextarea('js: failed to open IndexedDB');
                };
                request.onblocked = function (event) {
                    printTextarea('js: failed to open IndexedDB: blocked');
                };
                request.onabort = function (event) {
                    printTextarea('js: failed to open IndexedDB: abort');
                };
            }
            function loadWhisper(model) {
@ -473,17 +315,33 @@
                let url     = urls[model];
                let dst     = 'whisper.bin';
                let el      = document.getElementById('fetch-whisper-progress');
                let size_mb = sizes[model];
                model_whisper = model;
                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('fetch-whisper-tiny').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
-                document.getElementById('fetch-whisper-base').style.display = 'none';
+                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
                document.getElementById('whisper-file'         ).style.display = 'none';
                document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
                cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
                    el.innerHTML = Math.round(100*p) + '%';
                };
                cbCancel = function() {
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-tiny'   ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base'   ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('whisper-file'         ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
                };
-                loadRemote(url, dst, el, size_mb);
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }
            //
@ -651,7 +509,7 @@
                    if (instance) {
                        printTextarea("js: whisper initialized, instance: " + instance);
-                        document.getElementById('model').innerHTML = 'Model loaded: ' + model_fname;
+                        document.getElementById('model').innerHTML = 'Model loaded: ' + model_whisper;
                    }
                }
@ -668,7 +526,6 @@
                if (instance) {
                    printTextarea('');
                    printTextarea('js: processing - this might take a while ...');
                    printTextarea('js: the page will be unresponsive until the processing is completed');
                    printTextarea('');
                    setTimeout(function() {
--- a/models/README.md
+++ b/models/README.md
@ -41,5 +41,24 @@ https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
 ## Model files for testing purposes
-The model files pefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for testing purposes.
+The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for
-They are directly included in this repository for convenience and the Github Actions CI uses them to run various sanitizer tests.
+testing purposes. They are directly included in this repository for convenience and the GitHub Actions CI uses them to
 run various sanitizer tests.
 ## Fine-tuned models
 There are community efforts for creating fine-tuned Whisper models using extra training data. For example, this
 [blog post](https://huggingface.co/blog/fine-tune-whisper) describes a method for fine-tuning using the Hugging Face (HF)
 Transformers implementation of Whisper. The produced models are in a slightly different format compared to the original
 OpenAI format. To read the HF models you can use the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script like this:
 ```bash
 git clone https://github.com/openai/whisper
 git clone https://github.com/ggerganov/whisper.cpp
 # clone HF fine-tuned model (this is just an example)
 git clone https://huggingface.co/openai/whisper-base.en
 # convert the model to ggml
 python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-base.en/ ./whisper .
 ```
--- a/whisper.cpp
+++ b/whisper.cpp
@ -518,15 +518,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
        wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
        wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
        wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
        // this is the total memory required to run the inference
        const size_t mem_required =
                   wctx.buf_model->size() +
                   wctx.buf_memory.size() +
                   wctx.buf_compute.size() +
                   wctx.buf_compute_layer.size();
        fprintf(stderr, "%s: mem_required  = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
    }
    // load mel filters
@ -599,11 +590,21 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
        }
    }
    {
        // this is the total memory required to run the inference
        const size_t mem_required =
                   wctx.buf_model->size() +
                   wctx.buf_memory.size() +
                   wctx.buf_compute.size() +
                   wctx.buf_compute_layer.size();
        fprintf(stderr, "%s: mem_required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
    }
    // for the big tensors, we have the option to store the data in 16-bit floats
    // in order to save memory and also to speed up the computation
    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
    size_t ctx_size = 0;
    size_t ctx_mem_size = 0;
@ -722,7 +723,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }
    // create the ggml context
@ -983,7 +984,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            ggml_nbytes(model.memory_k)       + ggml_nbytes(model.memory_v) +
            ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
-        fprintf(stderr, "%s: memory size = %8.2f MB\n", __func__, memory_size/1024.0/1024.0);
+        fprintf(stderr, "%s: memory size   = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
    }
    // load weights
@ -1047,7 +1048,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            model.n_loaded++;
        }
-        fprintf(stderr, "%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+        fprintf(stderr, "%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
        if (model.n_loaded == 0) {
            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
@ -2379,6 +2380,12 @@ void whisper_print_timings(struct whisper_context * ctx) {
    fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 // Set the t_sample_us, t_encode_us and t_decode_us timing fields of the given context to zero.
 // NOTE(review): other timing fields the context may hold are not touched here - confirm this is intended.
 void whisper_reset_timings(struct whisper_context * ctx) {
    ctx->t_sample_us = 0;
    ctx->t_encode_us = 0;
    ctx->t_decode_us = 0;
 }
 ////////////////////////////////////////////////////////////////////////////
 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2388,92 +2395,92 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        case WHISPER_SAMPLING_GREEDY:
            {
                result = {
-                    /*.strategy             =*/ WHISPER_SAMPLING_GREEDY,
+                    /*.strategy         =*/ WHISPER_SAMPLING_GREEDY,
-                    /*.n_threads            =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
+                    /*.n_threads        =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
-                    /*.n_max_text_ctx       =*/ 16384,
+                    /*.n_max_text_ctx   =*/ 16384,
-                    /*.offset_ms            =*/ 0,
+                    /*.offset_ms        =*/ 0,
-                    /*.duration_ms          =*/ 0,
+                    /*.duration_ms      =*/ 0,
-                    /*.translate            =*/ false,
+                    /*.translate        =*/ false,
-                    /*.no_context           =*/ false,
+                    /*.no_context       =*/ false,
-                    /*.single_segment       =*/ false,
+                    /*.single_segment   =*/ false,
-                    /*.print_special_tokens =*/ false,
+                    /*.print_special    =*/ false,
-                    /*.print_progress       =*/ true,
+                    /*.print_progress   =*/ true,
-                    /*.print_realtime       =*/ false,
+                    /*.print_realtime   =*/ false,
-                    /*.print_timestamps     =*/ true,
+                    /*.print_timestamps =*/ true,
-                    /*.token_timestamps     =*/ false,
+                    /*.token_timestamps =*/ false,
-                    /*.thold_pt             =*/ 0.01f,
+                    /*.thold_pt         =*/ 0.01f,
-                    /*.thold_ptsum          =*/ 0.01f,
+                    /*.thold_ptsum      =*/ 0.01f,
-                    /*.max_len              =*/ 0,
+                    /*.max_len          =*/ 0,
-                    /*.max_tokens           =*/ 0,
+                    /*.max_tokens       =*/ 0,
-                    /*.speed_up             =*/ false,
+                    /*.speed_up         =*/ false,
-                    /*.audio_ctx            =*/ 0,
+                    /*.audio_ctx        =*/ 0,
-                    /*.prompt_tokens        =*/ nullptr,
+                    /*.prompt_tokens    =*/ nullptr,
-                    /*.prompt_n_tokens      =*/ 0,
+                    /*.prompt_n_tokens  =*/ 0,
-                    /*.language             =*/ "en",
+                    /*.language         =*/ "en",
-                    /*.greedy               =*/ {
+                    /*.greedy           =*/ {
                        /*.n_past =*/ 0,
                    },
-                    /*.beam_search          =*/ {
+                    /*.beam_search      =*/ {
                        /*.n_past     =*/ -1,
                        /*.beam_width =*/ -1,
                        /*.n_best     =*/ -1,
                    },
-                    /*.new_segment_callback =*/ nullptr,
+                    /*.new_segment_callback           =*/ nullptr,
                    /*.new_segment_callback_user_data =*/ nullptr,
                };
            } break;
        case WHISPER_SAMPLING_BEAM_SEARCH:
            {
                result = {
-                    /*.strategy             =*/ WHISPER_SAMPLING_BEAM_SEARCH,
+                    /*.strategy         =*/ WHISPER_SAMPLING_BEAM_SEARCH,
-                    /*.n_threads            =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
+                    /*.n_threads        =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
-                    /*.n_max_text_ctx       =*/ 16384,
+                    /*.n_max_text_ctx   =*/ 16384,
-                    /*.offset_ms            =*/ 0,
+                    /*.offset_ms        =*/ 0,
-                    /*.duration_ms          =*/ 0,
+                    /*.duration_ms      =*/ 0,
-                    /*.translate            =*/ false,
+                    /*.translate        =*/ false,
-                    /*.no_context           =*/ false,
+                    /*.no_context       =*/ false,
-                    /*.single_segment       =*/ false,
+                    /*.single_segment   =*/ false,
-                    /*.print_special_tokens =*/ false,
+                    /*.print_special    =*/ false,
-                    /*.print_progress       =*/ true,
+                    /*.print_progress   =*/ true,
-                    /*.print_realtime       =*/ false,
+                    /*.print_realtime   =*/ false,
-                    /*.print_timestamps     =*/ true,
+                    /*.print_timestamps =*/ true,
-                    /*.token_timestamps     =*/ false,
+                    /*.token_timestamps =*/ false,
-                    /*.thold_pt             =*/ 0.01f,
+                    /*.thold_pt         =*/ 0.01f,
-                    /*.thold_ptsum          =*/ 0.01f,
+                    /*.thold_ptsum      =*/ 0.01f,
-                    /*.max_len              =*/ 0,
+                    /*.max_len          =*/ 0,
-                    /*.max_tokens           =*/ 0,
+                    /*.max_tokens       =*/ 0,
-                    /*.speed_up             =*/ false,
+                    /*.speed_up         =*/ false,
-                    /*.audio_ctx            =*/ 0,
+                    /*.audio_ctx        =*/ 0,
-                    /*.prompt_tokens        =*/ nullptr,
+                    /*.prompt_tokens    =*/ nullptr,
-                    /*.prompt_n_tokens      =*/ 0,
+                    /*.prompt_n_tokens  =*/ 0,
-                    /*.language             =*/ "en",
+                    /*.language         =*/ "en",
-                    /*.greedy               =*/ {
+                    /*.greedy           =*/ {
                        /*.n_past =*/ -1,
                    },
-                    /*.beam_search          =*/ {
+                    /*.beam_search      =*/ {
                        /*.n_past     =*/ 0,
                        /*.beam_width =*/ 10,
                        /*.n_best     =*/ 5,
                    },
-                    /*.new_segment_callback =*/ nullptr,
+                    /*.new_segment_callback           =*/ nullptr,
                    /*.new_segment_callback_user_data =*/ nullptr,
                };
            } break;
@ -2761,7 +2768,7 @@ int whisper_full(
                //        ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
                //        ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
-                if (params.print_special_tokens == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
+                if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
                } else {
                    text += whisper_token_to_str(ctx, tokens_cur[i].id);
                }
--- a/whisper.h
+++ b/whisper.h
@ -167,6 +167,7 @@ extern "C" {
    // Performance information
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
    ////////////////////////////////////////////////////////////////////////////
@ -192,7 +193,7 @@ extern "C" {
        bool translate;
        bool no_context;
        bool single_segment; // force single segment output (useful for streaming)
-        bool print_special_tokens;
+        bool print_special;
        bool print_progress;
        bool print_realtime;
        bool print_timestamps;