Merge branch 'ggerganov:master' into master

3 years ago · c1808cd641
parent fc0e984846 3c390ffe38
commit c1808cd641
31 changed files with 2153 additions and 832 deletions
--- a/.gitignore
+++ b/.gitignore
@ -13,10 +13,12 @@ build-sanitize-thread/

 main
 stream
+command
 bench
 sync.sh
 compile_commands.json

+examples/arm_neon.h
 examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
 examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
--- a/7
+++ b/7
@ -134,7 +134,7 @@ libwhisper.so: ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
-	rm -f *.o main stream bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command bench libwhisper.a libwhisper.so

 #
 # Examples
@ -149,6 +149,9 @@ main: examples/main/main.cpp ggml.o whisper.o
 stream: examples/stream/stream.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

+# voice-command example: same recipe shape as `stream` above, including the $(CC_SDL) flags
+command: examples/command/command.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

@ -198,7 +201,7 @@ tiny.en tiny base.en base small.en small medium.en medium large: main
 	@echo ""
 	@for f in samples/*.wav; do \
 		echo "----------------------------------------------" ; \
-		echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
+		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
 	    echo "----------------------------------------------" ; \
 		echo "" ; \
 		./main -m models/ggml-$@.bin -f $$f ; \
--- a/README.md
+++ b/README.md
@ -36,9 +36,11 @@ As an example, here is a video of running the model on an iPhone 13 device - ful

 https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4

-Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
+You can also easily make your own offline voice assistant application:
+
+https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

-https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4
+Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

 ## Implementation details

@ -100,27 +102,27 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
 usage: ./main [options] file0.wav file1.wav ...

 options:
-  -h,       --help           show this help message and exit
-  -s SEED,  --seed SEED      RNG seed (default: -1)
-  -t N,     --threads N      number of threads to use during computation (default: 4)
-  -p N,     --processors N   number of processors to use during computation (default: 1)
-  -ot N,    --offset-t N     time offset in milliseconds (default: 0)
-  -on N,    --offset-n N     segment index offset (default: 0)
-  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)
-  -ml N,    --max-len N      maximum segment length in characters (default: 0)
-  -wt N,    --word-thold N   word timestamp probability threshold (default: 0.010000)
-  -v,       --verbose        verbose output
-            --translate      translate from source language to english
-  -otxt,    --output-txt     output result in a text file
-  -ovtt,    --output-vtt     output result in a vtt file
-  -osrt,    --output-srt     output result in a srt file
-  -owts,    --output-words   output script for generating karaoke video
-  -ps,      --print_special  print special tokens
-  -pc,      --print_colors   print colors
-  -nt,      --no_timestamps  do not print timestamps
-  -l LANG,  --language LANG  spoken language (default: en)
-  -m FNAME, --model FNAME    model path (default: models/ggml-base.en.bin)
-  -f FNAME, --file FNAME     input WAV file path
+  -h,       --help          [default] show this help message and exit
+  -t N,     --threads N     [4      ] number of threads to use during computation
+  -p N,     --processors N  [1      ] number of processors to use during computation
+  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -on N,    --offset-n N    [0      ] segment index offset
+  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
+  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -tr,      --translate     [false  ] translate from source language to english
+  -otxt,    --output-txt    [false  ] output result in a text file
+  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -osrt,    --output-srt    [false  ] output result in a srt file
+  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -ps,      --print-special [false  ] print special tokens
+  -pc,      --print-colors  [false  ] print colors
+  -nt,      --no-timestamps [true   ] do not print timestamps
+  -l LANG,  --language LANG [en     ] spoken language
+  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -f FNAME, --file FNAME    [       ] input WAV file path

 bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
@ -152,13 +154,13 @@ whisper_model_load: n_text_layer  = 6
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 2
-whisper_model_load: mem_required  = 670.00 MB
 whisper_model_load: adding 1607 extra tokens
+whisper_model_load: mem_required  =  506.00 MB
 whisper_model_load: ggml ctx size =  140.60 MB
 whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB

-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |

 main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

--- a/bindings/javascript/emscripten.cpp
+++ b/bindings/javascript/emscripten.cpp
@ -6,10 +6,16 @@
 #include <vector>
 #include <thread>

+std::thread g_worker;
+
 std::vector<struct whisper_context *> g_contexts(4, nullptr);

 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
+        if (g_worker.joinable()) {
+            g_worker.join();
+        }
+
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
@ -25,6 +31,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
    }));

    emscripten::function("free", emscripten::optional_override([](size_t index) {
+        if (g_worker.joinable()) {
+            g_worker.join();
+        }
+
        --index;

        if (index < g_contexts.size()) {
@ -34,6 +44,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
    }));

    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
+        if (g_worker.joinable()) {
+            g_worker.join();
+        }
+
        --index;

        if (index >= g_contexts.size()) {
@ -49,7 +63,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        params.print_realtime   = true;
        params.print_progress   = false;
        params.print_timestamps = true;
-        params.print_special_tokens = false;
+        params.print_special    = false;
        params.translate        = translate;
        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
@ -80,10 +94,15 @@ EMSCRIPTEN_BINDINGS(whisper) {
            printf("\n");
        }

-        int ret = whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
-
+        // run the worker
+        {
+            g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
+                whisper_reset_timings(g_contexts[index]);
+                whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
                whisper_print_timings(g_contexts[index]);
+            });
+        }

-        return ret;
+        return 0;
    }));
 }
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -20,9 +20,11 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
    add_subdirectory(whisper.wasm)
+    add_subdirectory(stream.wasm)
    add_subdirectory(talk.wasm)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
+    add_subdirectory(command)
    add_subdirectory(bench)
 endif()
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -17,14 +17,13 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

-        if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
+        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
-        } else {
+        }
+        else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
+        else if (arg == "-m" || arg == "--model")   { params.model     = argv[++i]; }
+        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
@ -39,9 +38,9 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n",                                  params.model.c_str());
    fprintf(stderr, "\n");
 }

--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -0,0 +1,7 @@
+# The `command` example is only built when SDL2 support is enabled.
+if (NOT WHISPER_SUPPORT_SDL2)
+    return()
+endif()
+
+# command
+set(TARGET command)
+add_executable(${TARGET} command.cpp)
+target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/command/README.md
+++ b/examples/command/README.md
@ -0,0 +1,28 @@
+# command
+
+This is a basic Voice Assistant example that accepts voice commands from the microphone.
+More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
+
+```bash
+# Run with default arguments and small model
+./command -m ./models/ggml-small.en.bin -t 8
+
+# On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
+./command -m ./models/ggml-tiny.en.bin -ac 768 -t 4 -c 0
+```
+
+https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
+
+## Building
+
+The `command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2 on Linux
+sudo apt-get install libsdl2-dev
+
+# Install SDL2 on Mac OS
+brew install sdl2
+
+make command
+```
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -0,0 +1,655 @@
+// Voice assistant example
+//
+// Speak short text commands to the microphone.
+// This program will detect your voice commands and convert them to text.
+//
+// ref: https://github.com/ggerganov/whisper.cpp/issues/171
+//
+
+#include "whisper.h"
+
+#include <SDL.h>
+#include <SDL_audio.h>
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <mutex>
+#include <regex>
+#include <string>
+#include <thread>
+#include <vector>
+
+// command-line parameters
+// Runtime options of the `command` example; the defaults below are what the usage text reports.
+struct whisper_params {
+    // number of threads to use during computation (at most 4 by default)
+    int32_t n_threads{std::min(4, (int32_t) std::thread::hardware_concurrency())};
+    // prompt duration in milliseconds
+    int32_t prompt_ms{5000};
+    // command duration in milliseconds
+    int32_t command_ms{4000};
+    // capture device ID (negative -> default capture device)
+    int32_t capture_id{-1};
+    // maximum number of tokens per audio chunk
+    int32_t max_tokens{32};
+    // audio context size (0 - all)
+    int32_t audio_ctx{0};
+
+    // voice activity detection threshold
+    float vad_thold{0.6f};
+    // high-pass frequency cutoff
+    float freq_thold{100.0f};
+
+    // speed up audio by x2 (reduced accuracy)
+    bool speed_up{false};
+    // translate from source language to english
+    bool translate{false};
+    // not settable from the command line in this file
+    bool no_context{true};
+    // print special tokens
+    bool print_special{false};
+    // print sound energy (for debugging)
+    bool print_energy{false};
+    // not settable from the command line in this file
+    bool no_timestamps{true};
+
+    // spoken language
+    std::string language{"en"};
+    // model path
+    std::string model{"models/ggml-base.en.bin"};
+    // text output file name
+    std::string fname_out{""};
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command line into `params`.
+//
+// -h/--help and unknown arguments print the usage text and exit with status 0.
+// Returns false (after printing an error and the usage text) when an option that
+// needs a value is the last argument; returns true otherwise.
+// Note: std::stoi / std::stof still throw on non-numeric values.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    // options that consume the following argument as their value
+    static const char * const k_value_opts[] = {
+        "-t",   "--threads",
+        "-pms", "--prompt-ms",
+        "-cms", "--command-ms",
+        "-c",   "--capture",
+        "-mt",  "--max-tokens",
+        "-ac",  "--audio-ctx",
+        "-vth", "--vad-thold",
+        "-fth", "--freq-thold",
+        "-l",   "--language",
+        "-m",   "--model",
+        "-f",   "--file",
+    };
+
+    for (int i = 1; i < argc; i++) {
+        const std::string arg = argv[i];
+
+        // without this check argv[++i] below would read argv[argc] (a null pointer)
+        if (i + 1 >= argc) {
+            for (const char * opt : k_value_opts) {
+                if (arg == opt) {
+                    fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+                    whisper_print_usage(argc, argv, params);
+                    return false;
+                }
+            }
+        }
+
+        if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
+        else if (arg == "-pms" || arg == "--prompt-ms")     { params.prompt_ms     = std::stoi(argv[++i]); }
+        else if (arg == "-cms" || arg == "--command-ms")    { params.command_ms    = std::stoi(argv[++i]); }
+        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
+        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
+        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
+        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
+        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
+        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
+        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
+        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
+        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
+        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
+        else if (arg == "-m"   || arg == "--model")         { params.model         = argv[++i]; }
+        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Write the help text to stderr; the bracketed column shows the current value of each option.
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
+    (void) argc; // not needed to print the help
+
+    FILE * out = stderr;
+
+    // textual form of a boolean option
+    const auto flag = [](bool v) { return v ? "true" : "false"; };
+
+    fprintf(out, "\n");
+    fprintf(out, "usage: %s [options]\n", argv[0]);
+    fprintf(out, "\n");
+    fprintf(out, "options:\n");
+    fprintf(out, "  -h,       --help          [default] show this help message and exit\n");
+    fprintf(out, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(out, "  -pms N,   --prompt-ms N   [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
+    fprintf(out, "  -cms N,   --command-ms N  [%-7d] command duration in milliseconds\n", params.command_ms);
+    fprintf(out, "  -c ID,    --capture ID    [%-7d] capture device ID\n", params.capture_id);
+    fprintf(out, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
+    fprintf(out, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n", params.audio_ctx);
+    fprintf(out, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n", params.vad_thold);
+    fprintf(out, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
+    fprintf(out, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n", flag(params.speed_up));
+    fprintf(out, "  -tr,      --translate     [%-7s] translate from source language to english\n", flag(params.translate));
+    fprintf(out, "  -ps,      --print-special [%-7s] print special tokens\n", flag(params.print_special));
+    fprintf(out, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n", flag(params.print_energy));
+    fprintf(out, "  -l LANG,  --language LANG [%-7s] spoken language\n", params.language.c_str());
+    fprintf(out, "  -m FNAME, --model FNAME   [%-7s] model path\n", params.model.c_str());
+    fprintf(out, "  -f FNAME, --file FNAME    [%-7s] text output file name\n", params.fname_out.c_str());
+    fprintf(out, "\n");
+}
+
+//
+// SDL Audio capture
+//
+
+// Captures microphone audio through an SDL callback and keeps the most recent
+// samples in a ring buffer guarded by a mutex.
+class audio_async {
+public:
+    // len_ms: length of the ring buffer in milliseconds
+    audio_async(int len_ms);
+    ~audio_async();
+
+    // open the capture device (negative capture_id -> default device) and size the buffer
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    bool pause();
+    // discard the buffered audio (only valid while running)
+    bool clear();
+
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // copies up to the last `ms` milliseconds into `audio`; ms <= 0 means the whole buffer length
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    // SDL device id of the opened capture device, 0 when none is open
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    int m_len_ms = 0;
+    int m_sample_rate = 0;
+
+    // true between resume() and pause()
+    bool       m_running = false;
+    // guards m_audio, m_audio_pos and m_audio_len
+    std::mutex m_mutex;
+
+    // ring buffer storage
+    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
+    // next write index into m_audio
+    size_t             m_audio_pos = 0;
+    // number of valid samples currently stored in m_audio
+    size_t             m_audio_len = 0;
+};
+
+// Only records the requested buffer length; the buffer itself is sized in init().
+audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
+}
+
+// Close the capture device if one was opened.
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initialize SDL audio, open a capture device and size the ring buffer.
+//
+// capture_id:  index of the capture device to open; a negative value opens the default device
+// sample_rate: requested capture frequency
+//
+// Returns false if SDL cannot be initialized or no capture device can be opened.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // list the available capture devices so the user can pick a capture_id
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    // request mono 32-bit float samples; SDL hands each chunk to audio_async::callback
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    // NOTE(review): SDL_GetAudioDeviceName() is passed straight to "%s" - confirm it cannot return NULL for an invalid capture_id
+    if (capture_id >= 0) {
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        m_dev_id_in = 0;
+
+        return false;
+    } else {
+        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+                capture_spec_requested.format);
+        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+                capture_spec_requested.channels);
+        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+    }
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // ring buffer holds m_len_ms milliseconds of audio at the obtained sample rate
+    m_audio.resize((m_sample_rate*m_len_ms)/1000);
+
+    return true;
+}
+
+// Unpause the capture device. Fails if no device is open or capture is already running.
+bool audio_async::resume() {
+    const bool have_device = m_dev_id_in != 0;
+
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    if (m_running) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    // pause_on == 0 -> start delivering audio to the callback
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+    m_running = true;
+
+    return true;
+}
+
+// Pause the capture device. Fails if no device is open or capture is not running.
+bool audio_async::pause() {
+    const bool have_device = m_dev_id_in != 0;
+
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    // pause_on == 1 -> stop delivering audio to the callback
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+    m_running = false;
+
+    return true;
+}
+
+// Drop all buffered audio by resetting the ring buffer position and length.
+// Fails if no device is open or capture is not running.
+bool audio_async::clear() {
+    const bool have_device = m_dev_id_in != 0;
+
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the SDL callback writes these fields under the same mutex
+    std::lock_guard<std::mutex> guard(m_mutex);
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+// callback to be called by SDL
+//
+// Appends the `len` bytes of float samples in `stream` to the ring buffer,
+// overwriting the oldest samples once the buffer is full.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running || len <= 0) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    const size_t capacity = m_audio.size();
+    if (capacity == 0) {
+        // nothing to write into (and the modulo below would divide by zero)
+        return;
+    }
+
+    size_t n_samples = (size_t) len / sizeof(float);
+
+    // a chunk larger than the whole ring buffer: keep only its newest samples
+    if (n_samples > capacity) {
+        stream    += (n_samples - capacity) * sizeof(float);
+        n_samples  = capacity;
+    }
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    // samples that fit before the end of the buffer; the rest wraps around to the start
+    const size_t n0 = std::min(n_samples, capacity - m_audio_pos);
+
+    memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
+
+    if (n0 < n_samples) {
+        // `stream` is a byte pointer, so the sample count must be scaled to a byte offset
+        memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+    }
+
+    m_audio_pos = (m_audio_pos + n_samples) % capacity;
+    m_audio_len = std::min(m_audio_len + n_samples, capacity);
+}
+
+// Copy the most recent `ms` milliseconds of captured audio into `result`
+// (ms <= 0 -> the full buffer length). Fewer samples are returned when the
+// buffer holds less. `result` is left untouched when no device is open or
+// capture is not running.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    if (ms <= 0) {
+        ms = m_len_ms;
+    }
+
+    // never hand out more than what has been captured so far
+    const size_t n_wanted = (m_sample_rate * ms) / 1000;
+    const size_t n_out    = n_wanted > m_audio_len ? m_audio_len : n_wanted;
+
+    result.resize(n_out);
+
+    // index of the oldest requested sample, wrapped into the ring buffer
+    int first = m_audio_pos - n_out;
+    if (first < 0) {
+        first += m_audio.size();
+    }
+
+    const bool   wraps  = first + n_out > m_audio.size();
+    const size_t n_head = wraps ? m_audio.size() - first : n_out;
+
+    memcpy(result.data(), &m_audio[first], n_head * sizeof(float));
+
+    if (wraps) {
+        // remaining samples continue from the start of the ring buffer
+        memcpy(&result[n_head], &m_audio[0], (n_out - n_head) * sizeof(float));
+    }
+}
+
+///////////////////////////
+
+// Return a copy of `s` with leading and trailing whitespace removed.
+std::string trim(const std::string & s) {
+    return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
+}
+
+// Filter `data` in place with a single-pole recurrence; data[0] is left unchanged.
+// An empty vector is returned untouched (reading data[0] would be out of bounds).
+//
+// NOTE(review): alpha = dt / (rc + dt) and the use of the already-filtered data[i - 1]
+// differ from the textbook RC high-pass (alpha = rc / (rc + dt), raw previous input).
+// Left as is because the VAD threshold may be tuned against it - confirm before changing.
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    if (data.empty()) {
+        return;
+    }
+
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+// Energy-based end-of-speech check.
+//
+// Compares the mean absolute amplitude of the last `last_ms` milliseconds with that of
+// the whole buffer and returns true when the tail is NOT louder than vad_thold times the
+// overall level. Returns false when the buffer is not longer than the tail window.
+// When freq_thold > 0 the samples in `pcmf32` are high-pass filtered in place first.
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_total = pcmf32.size();
+    const int n_tail  = (sample_rate * last_ms) / 1000;
+
+    // not enough samples - assume no speech
+    if (n_tail >= n_total) {
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all  = 0.0f;
+    float energy_last = 0.0f;
+
+    // first index that belongs to the tail window
+    const size_t tail_begin = n_total - n_tail;
+
+    for (size_t i = 0; i < (size_t) n_total; ++i) {
+        const float magnitude = fabsf(pcmf32[i]);
+
+        energy_all += magnitude;
+        if (i >= tail_begin) {
+            energy_last += magnitude;
+        }
+    }
+
+    energy_all  /= n_total;
+    energy_last /= n_tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    return !(energy_last > vad_thold*energy_all);
+}
+
+// Run whisper on `pcmf32` and return the concatenated text of all segments.
+//
+// prob: set to the mean of token.p over all tokens (0 when there are none)
+// t_ms: set to the elapsed wall-clock time of the call in milliseconds
+//
+// Returns an empty string, with prob and t_ms left at 0, when whisper_full() fails.
+std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
+    using clock = std::chrono::high_resolution_clock;
+
+    const auto t_begin = clock::now();
+
+    t_ms = 0;
+    prob = 0.0f;
+
+    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+    // output / printing
+    wparams.print_progress   = false;
+    wparams.print_realtime   = false;
+    wparams.print_special    = params.print_special;
+    wparams.print_timestamps = !params.no_timestamps;
+
+    // decoding
+    wparams.translate        = params.translate;
+    wparams.language         = params.language.c_str();
+    wparams.n_threads        = params.n_threads;
+    wparams.max_tokens       = params.max_tokens;
+    wparams.no_context       = true;
+    wparams.single_segment   = true;
+
+    wparams.audio_ctx        = params.audio_ctx;
+    wparams.speed_up         = params.speed_up;
+
+    const int rc = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
+    if (rc != 0) {
+        return "";
+    }
+
+    std::string text_all;
+    int n_probs = 0;
+
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int iseg = 0; iseg < n_segments; ++iseg) {
+        text_all += whisper_full_get_segment_text(ctx, iseg);
+
+        // accumulate the token probabilities of this segment
+        const int n_tokens = whisper_full_n_tokens(ctx, iseg);
+        for (int itok = 0; itok < n_tokens; ++itok) {
+            prob += whisper_full_get_token_data(ctx, iseg, itok).p;
+            ++n_probs;
+        }
+    }
+
+    if (n_probs > 0) {
+        prob /= n_probs;
+    }
+
+    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(clock::now() - t_begin).count();
+
+    return text_all;
+}
+
+// compute similarity between two strings using Levenshtein distance
+// compute similarity between two strings using Levenshtein distance
+//
+// Returns 1 - distance / max(len(s0), len(s1)): 1.0 for identical strings, 0.0 when
+// every character has to be edited. Two empty strings are treated as identical.
+float similarity(const std::string & s0, const std::string & s1) {
+    if (s0.empty() && s1.empty()) {
+        // avoid 0/0 below
+        return 1.0f;
+    }
+
+    const size_t len0 = s0.size() + 1;
+    const size_t len1 = s1.size() + 1;
+
+    std::vector<int> col(len1, 0);
+    std::vector<int> prevCol(len1, 0);
+
+    // row 0: distance from the empty prefix of s0 to each prefix of s1
+    for (size_t j = 0; j < len1; j++) {
+        prevCol[j] = j;
+    }
+
+    // rows 1..len(s0); starting at 1 keeps s0[i - 1] in bounds
+    for (size_t i = 1; i < len0; i++) {
+        col[0] = i;
+        for (size_t j = 1; j < len1; j++) {
+            const int cost = s0[i - 1] == s1[j - 1] ? 0 : 1;
+
+            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + cost);
+        }
+        col.swap(prevCol);
+    }
+
+    // after the final swap the last computed row lives in prevCol
+    const float dist = prevCol[len1 - 1];
+
+    return 1.0f - (dist / std::max(s0.size(), s1.size()));
+}
+
+// Voice assistant loop:
+//   1. load the model and open the capture device
+//   2. ask the user to say the activation phrase and keep its audio once recognized
+//   3. for every later utterance, transcribe "prompt audio + new audio" and print
+//      the text that follows the prompt as the command
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    if (whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params);
+        exit(0);
+    }
+
+    // whisper init
+
+    struct whisper_context * ctx = whisper_init(params.model.c_str());
+
+    // every call below dereferences the context, so bail out early on a failed load
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: failed to initialize whisper context from '%s'\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    // print some info about the processing
+    {
+        fprintf(stderr, "\n");
+        if (!whisper_is_multilingual(ctx)) {
+            if (params.language != "en" || params.translate) {
+                params.language = "en";
+                params.translate = false;
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                __func__,
+                params.n_threads,
+                params.language.c_str(),
+                params.translate ? "translate" : "transcribe",
+                params.no_timestamps ? 0 : 1);
+
+        fprintf(stderr, "\n");
+    }
+
+
+    // init audio
+
+    audio_async audio(30*1000);
+    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
+        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
+        whisper_free(ctx); // do not leak the model on this error path
+        return 1;
+    }
+
+    audio.resume();
+
+    bool is_running  = true;
+    bool have_prompt = false;
+    bool ask_prompt = true;
+
+    // mean token probability of the prompt alone / of the latest prompt + command
+    float prob0 = 0.0f;
+    float prob  = 0.0f;
+
+    std::vector<float> pcmf32_cur;
+    std::vector<float> pcmf32_prompt;
+
+    const std::string k_prompt = "Ok Whisper, start listening for commands.";
+
+    // main loop
+    while (is_running) {
+        // handle Ctrl + C
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }
+
+            if (!is_running) {
+                break;
+            }
+        }
+
+        // delay
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+        if (ask_prompt) {
+            fprintf(stdout, "\n");
+            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
+            fprintf(stdout, "\n");
+
+            ask_prompt = false;
+        }
+
+        int64_t t_ms = 0;
+
+        {
+            // look at the last 2 seconds; vad_simple() reports true once the final second went quiet
+            audio.get(2000, pcmf32_cur);
+
+            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
+
+                if (!have_prompt) {
+                    audio.get(params.prompt_ms, pcmf32_cur);
+
+                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
+
+                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
+
+                    const float sim = similarity(txt, k_prompt);
+
+                    // accept only transcriptions of about the prompt's length that are at least 80% similar
+                    if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
+                        fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
+                        ask_prompt = true;
+                    } else {
+                        fprintf(stdout, "\n");
+                        fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
+                        fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
+                        fprintf(stdout, "\n");
+
+                        // save the audio for the prompt
+                        pcmf32_prompt = pcmf32_cur;
+                        have_prompt = true;
+                    }
+                } else {
+                    audio.get(params.command_ms, pcmf32_cur);
+
+                    // prepend the prompt audio
+                    pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
+
+                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
+
+                    prob = 100.0f*(prob - prob0);
+
+                    //fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
+
+                    // find the prompt in the text
+                    float best_sim = 0.0f;
+                    size_t best_len = 0;
+                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
+                        const auto prompt = txt.substr(0, n);
+
+                        const float sim = similarity(prompt, k_prompt);
+
+                        //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
+
+                        if (sim > best_sim) {
+                            best_sim = sim;
+                            best_len = n;
+                        }
+                    }
+
+                    // best_len can exceed the transcription length when the text is short;
+                    // substr() would then throw std::out_of_range
+                    best_len = std::min(best_len, txt.size());
+
+                    const std::string command = ::trim(txt.substr(best_len));
+
+                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
+                    fprintf(stdout, "\n");
+                }
+
+                audio.clear();
+            }
+        }
+    }
+
+    audio.pause();
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -0,0 +1,182 @@
+// Common Javascript functions used by the examples
+
+// Reinterpret the contents of the typed array `src` as a typed array of class `type`.
+// The elements are first copied into a freshly allocated ArrayBuffer of src.byteLength
+// bytes, so the returned array does not share memory with `src`.
+function convertTypedArray(src, type) {
+    var SrcArray = src.constructor;
+    var bytes = new ArrayBuffer(src.byteLength);
+
+    // copy through a view that has the same element type as the source
+    var sameTypeView = new SrcArray(bytes);
+    sameTypeView.set(src);
+
+    return new type(bytes);
+}
+
+// Print helper shared by the examples.
+// Looks up the element with id 'output' once, then returns a function that
+// - joins multiple arguments with a single space
+// - logs the text to the console
+// - appends the text plus a newline to the element (if it exists) and scrolls it to the bottom
+var printTextarea = (function() {
+    var element = document.getElementById('output');
+    if (element) element.value = ''; // clear browser cache
+    return function(text) {
+        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
+        console.log(text);
+        if (element) {
+            element.value += text + "\n";
+            element.scrollTop = element.scrollHeight; // focus on bottom
+        }
+    };
+})();
+
+// Ask the user for confirmation and, if it is given, delete the IndexedDB database named `dbName`.
+async function clearCache() {
+    var confirmed = confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.');
+    if (!confirmed) {
+        return;
+    }
+
+    indexedDB.deleteDatabase(dbName);
+}
+
+// Download `url` with the Fetch API and return the whole body as a single Uint8Array.
+// - cbPrint(message) receives status lines
+// - cbProgress(fraction) is called after every received chunk, but only when the
+//   response carries a content-length header
+// Resolves to undefined when the response is not OK.
+async function fetchRemote(url, cbProgress, cbPrint) {
+    cbPrint('fetchRemote: downloading with fetch()...');
+
+    var request = {
+        method: 'GET',
+        headers: {
+            'Content-Type': 'application/octet-stream',
+        },
+    };
+
+    const response = await fetch(url, request);
+    if (!response.ok) {
+        cbPrint('fetchRemote: failed to fetch ' + url);
+        return;
+    }
+
+    const lengthHeader = response.headers.get('content-length');
+    const expected = parseInt(lengthHeader, 10);
+    const stream = response.body.getReader();
+
+    var parts = [];
+    var received = 0;
+    var lastStep = -1; // last progress step (in tenths) that was printed
+
+    for (;;) {
+        const result = await stream.read();
+        if (result.done) {
+            break;
+        }
+
+        parts.push(result.value);
+        received += result.value.length;
+
+        // without a content-length header there is nothing to report progress against
+        if (!lengthHeader) {
+            continue;
+        }
+
+        cbProgress(received/expected);
+
+        var step = Math.round((received / expected) * 10);
+        if (step != lastStep) {
+            cbPrint('fetchRemote: fetching ' + 10*step + '% ...');
+            lastStep = step;
+        }
+    }
+
+    // concatenate the chunks into one contiguous array
+    var merged = new Uint8Array(received);
+    var offset = 0;
+
+    parts.forEach(function (part) {
+        merged.set(part, offset);
+        offset += part.length;
+    });
+
+    return merged;
+}
+
+// load remote data
+// - check if the data is already in the IndexedDB
+// - if not, fetch it from the remote URL and store it in the IndexedDB
+//
+// url        - remote location of the data; also the key used in the 'models' object store
+// dst        - passed through unchanged as the first argument of cbReady
+// size_mb    - size shown to the user in the download confirmation dialog
+// cbProgress - forwarded to fetchRemote() to report download progress
+// cbReady    - called as cbReady(dst, data) once the data is available
+// cbCancel   - called without arguments when the user declines the download or an IndexedDB step fails
+// cbPrint    - called with status messages
+//
+// The database name and version are taken from the outer `dbName` and `dbVersion` variables.
+function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
+    // query the storage quota and print it
+    navigator.storage.estimate().then(function (estimate) {
+        cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
+        cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
+    });
+
+    // check if the data is already in the IndexedDB
+    var rq = indexedDB.open(dbName, dbVersion);
+
+    rq.onupgradeneeded = function (event) {
+        var db = event.target.result;
+        if (db.version == 1) {
+            var os = db.createObjectStore('models', { autoIncrement: false });
+            cbPrint('loadRemote: created IndexedDB ' + db.name + ' version ' + db.version);
+        } else {
+            // clear the database
+            var os = event.currentTarget.transaction.objectStore('models');
+            os.clear();
+            cbPrint('loadRemote: cleared IndexedDB ' + db.name + ' version ' + db.version);
+        }
+    };
+
+    rq.onsuccess = function (event) {
+        var db = event.target.result;
+        var tx = db.transaction(['models'], 'readonly');
+        var os = tx.objectStore('models');
+        var rq = os.get(url);
+
+        rq.onsuccess = function (event) {
+            if (rq.result) {
+                cbPrint('loadRemote: "' + url + '" is already in the IndexedDB');
+                cbReady(dst, rq.result);
+            } else {
+                // data is not in the IndexedDB
+                cbPrint('loadRemote: "' + url + '" is not in the IndexedDB');
+
+                // alert and ask the user to confirm
+                if (!confirm(
+                    'You are about to download ' + size_mb + ' MB of data.\n' +
+                    'The model data will be cached in the browser for future use.\n\n' +
+                    'Press OK to continue.')) {
+                    cbCancel();
+                    return;
+                }
+
+                fetchRemote(url, cbProgress, cbPrint).then(function (data) {
+                    if (data) {
+                        // store the data in the IndexedDB
+                        var rq = indexedDB.open(dbName, dbVersion);
+                        rq.onsuccess = function (event) {
+                            var db = event.target.result;
+                            var tx = db.transaction(['models'], 'readwrite');
+                            var os = tx.objectStore('models');
+                            var rq = os.put(data, url);
+
+                            rq.onsuccess = function (event) {
+                                cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
+                                cbReady(dst, data);
+                            };
+
+                            rq.onerror = function (event) {
+                                cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB');
+                                cbCancel();
+                            };
+                        };
+                    }
+                });
+            }
+        };
+
+        rq.onerror = function (event) {
+            cbPrint('loadRemote: failed to get data from the IndexedDB');
+            cbCancel();
+        };
+    };
+
+    rq.onerror = function (event) {
+        cbPrint('loadRemote: failed to open IndexedDB');
+        cbCancel();
+    };
+
+    rq.onblocked = function (event) {
+        cbPrint('loadRemote: failed to open IndexedDB: blocked');
+        cbCancel();
+    };
+
+    rq.onabort = function (event) {
+        cbPrint('loadRemote: failed to open IndexedDB: abort');
+        // report the failure to the caller, same as the error and blocked handlers
+        cbCancel();
+    };
+}
+
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -6,29 +6,28 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
 ```
 ./main -h

-usage: ./bin/main [options] file0.wav file1.wav ...
-
-  -h,       --help           show this help message and exit
-  -s SEED,  --seed SEED      RNG seed (default: -1)
-  -t N,     --threads N      number of threads to use during computation (default: 4)
-  -p N,     --processors N   number of processors to use during computation (default: 1)
-  -ot N,    --offset-t N     time offset in milliseconds (default: 0)
-  -on N,    --offset-n N     segment index offset (default: 0)
-  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)
-  -ml N,    --max-len N      maximum segment length in characters (default: 0)
-  -wt N,    --word-thold N   word timestamp probability threshold (default: 0.010000)
-  -v,       --verbose        verbose output
-            --translate      translate from source language to english
-  -otxt,    --output-txt     output result in a text file
-  -ovtt,    --output-vtt     output result in a vtt file
-  -osrt,    --output-srt     output result in a srt file
-  -owts,    --output-words   output script for generating karaoke video
-  -ps,      --print_special  print special tokens
-  -pc,      --print_colors   print colors
-  -nt,      --no_timestamps  do not print timestamps
-  -l LANG,  --language LANG  spoken language (default: en)
-  -m FNAME, --model FNAME    model path (default: models/ggml-base.en.bin)
-  -f FNAME, --file FNAME     input WAV file path
-  -h,       --help           show this help message and exit
+usage: ./main [options] file0.wav file1.wav ...

+options:
+  -h,       --help          [default] show this help message and exit
+  -t N,     --threads N     [4      ] number of threads to use during computation
+  -p N,     --processors N  [1      ] number of processors to use during computation
+  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -on N,    --offset-n N    [0      ] segment index offset
+  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
+  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -tr,      --translate     [false  ] translate from source language to english
+  -otxt,    --output-txt    [false  ] output result in a text file
+  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -osrt,    --output-srt    [false  ] output result in a srt file
+  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -ps,      --print-special [false  ] print special tokens
+  -pc,      --print-colors  [false  ] print colors
+  -nt,      --no-timestamps [true   ] do not print timestamps
+  -l LANG,  --language LANG [en     ] spoken language
+  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -f FNAME, --file FNAME    [       ] input WAV file path
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -36,6 +36,10 @@ std::string to_timestamp(int64_t t, bool comma = false) {
    return std::string(buf);
 }

+// Map a timestamp to an index into a PCM buffer holding n_samples samples.
+// t is scaled by WHISPER_SAMPLE_RATE/100, i.e. it is taken to be in units of 1/100 s.
+// The result is clamped to [0, n_samples - 1] (and is 0 when n_samples is 0).
+int timestamp_to_sample(int64_t t, int n_samples) {
+    const int last   = n_samples - 1;
+    const int sample = (int) ((t*WHISPER_SAMPLE_RATE)/100);
+
+    if (sample > last) {
+        return std::max(0, last);
+    }
+
+    return std::max(0, sample);
+}
+
 // helper function to replace substrings
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; ; pos += replace.length()) {
@ -48,7 +52,6 @@ void replace_all(std::string & s, const std::string & search, const std::string

 // command-line parameters
 struct whisper_params {
-    int32_t seed         = -1; // RNG seed, not used currently
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_processors = 1;
    int32_t offset_t_ms  = 0;
@ -60,13 +63,13 @@ struct whisper_params {
    float word_thold = 0.01f;

    bool speed_up      = false;
-    bool verbose              = false;
    bool translate     = false;
+    bool diarize       = false;
    bool output_txt    = false;
    bool output_vtt    = false;
    bool output_srt    = false;
    bool output_wts    = false;
-    bool print_special_tokens = false;
+    bool print_special = false;
    bool print_colors  = false;
    bool no_timestamps = false;

@ -87,59 +90,32 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
            continue;
        }

-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-p" || arg == "--processors") {
-            params.n_processors = std::stoi(argv[++i]);
-        } else if (arg == "-ot" || arg == "--offset-t") {
-            params.offset_t_ms = std::stoi(argv[++i]);
-        } else if (arg == "-on" || arg == "--offset-n") {
-            params.offset_n = std::stoi(argv[++i]);
-        } else if (arg == "-d" || arg == "--duration") {
-            params.duration_ms = std::stoi(argv[++i]);
-        } else if (arg == "-mc" || arg == "--max-context") {
-            params.max_context = std::stoi(argv[++i]);
-        } else if (arg == "-ml" || arg == "--max-len") {
-            params.max_len = std::stoi(argv[++i]);
-        } else if (arg == "-wt" || arg == "--word-thold") {
-            params.word_thold = std::stof(argv[++i]);
-        } else if (arg == "-su" || arg == "--speed-up") {
-            params.speed_up = true;
-        } else if (arg == "-v" || arg == "--verbose") {
-            params.verbose = true;
-        } else if (arg == "--translate") {
-            params.translate = true;
-        } else if (arg == "-l" || arg == "--language") {
-            params.language = argv[++i];
-            if (whisper_lang_id(params.language.c_str()) == -1) {
-                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
-        } else if (arg == "-otxt" || arg == "--output-txt") {
-            params.output_txt = true;
-        } else if (arg == "-ovtt" || arg == "--output-vtt") {
-            params.output_vtt = true;
-        } else if (arg == "-osrt" || arg == "--output-srt") {
-            params.output_srt = true;
-        } else if (arg == "-owts" || arg == "--output-words") {
-            params.output_wts = true;
-        } else if (arg == "-ps" || arg == "--print_special") {
-            params.print_special_tokens = true;
-        } else if (arg == "-pc" || arg == "--print_colors") {
-            params.print_colors = true;
-        } else if (arg == "-nt" || arg == "--no_timestamps") {
-            params.no_timestamps = true;
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-f" || arg == "--file") {
-            params.fname_inp.push_back(argv[++i]);
-        } else if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        } else {
+        else if (arg == "-t"    || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
+        else if (arg == "-p"    || arg == "--processors")    { params.n_processors  = std::stoi(argv[++i]); }
+        else if (arg == "-ot"   || arg == "--offset-t")      { params.offset_t_ms   = std::stoi(argv[++i]); }
+        else if (arg == "-on"   || arg == "--offset-n")      { params.offset_n      = std::stoi(argv[++i]); }
+        else if (arg == "-d"    || arg == "--duration")      { params.duration_ms   = std::stoi(argv[++i]); }
+        else if (arg == "-mc"   || arg == "--max-context")   { params.max_context   = std::stoi(argv[++i]); }
+        else if (arg == "-ml"   || arg == "--max-len")       { params.max_len       = std::stoi(argv[++i]); }
+        else if (arg == "-wt"   || arg == "--word-thold")    { params.word_thold    = std::stof(argv[++i]); }
+        else if (arg == "-su"   || arg == "--speed-up")      { params.speed_up      = true; }
+        else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
+        else if (arg == "-di"   || arg == "--diarize")       { params.diarize       = true; }
+        else if (arg == "-otxt" || arg == "--output-txt")    { params.output_txt    = true; }
+        else if (arg == "-ovtt" || arg == "--output-vtt")    { params.output_vtt    = true; }
+        else if (arg == "-osrt" || arg == "--output-srt")    { params.output_srt    = true; }
+        else if (arg == "-owts" || arg == "--output-words")  { params.output_wts    = true; }
+        else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
+        else if (arg == "-pc"   || arg == "--print-colors")  { params.print_colors  = true; }
+        else if (arg == "-nt"   || arg == "--no-timestamps") { params.no_timestamps = true; }
+        else if (arg == "-l"    || arg == "--language")      { params.language      = argv[++i]; }
+        else if (arg == "-m"    || arg == "--model")         { params.model         = argv[++i]; }
+        else if (arg == "-f"    || arg == "--file")          { params.fname_inp.push_back(argv[++i]); }
+        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
@ -154,34 +130,40 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
-    fprintf(stderr, "  -s SEED,  --seed SEED      RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p N,     --processors N   number of processors to use during computation (default: %d)\n", params.n_processors);
-    fprintf(stderr, "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n", params.offset_t_ms);
-    fprintf(stderr, "  -on N,    --offset-n N     segment index offset (default: %d)\n", params.offset_n);
-    fprintf(stderr, "  -d  N,    --duration N     duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
-    fprintf(stderr, "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n");
-    fprintf(stderr, "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n", params.max_len);
-    fprintf(stderr, "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n", params.word_thold);
-    fprintf(stderr, "  -su,      --speed-up       speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -v,       --verbose        verbose output\n");
-    fprintf(stderr, "            --translate      translate from source language to english\n");
-    fprintf(stderr, "  -otxt,    --output-txt     output result in a text file\n");
-    fprintf(stderr, "  -ovtt,    --output-vtt     output result in a vtt file\n");
-    fprintf(stderr, "  -osrt,    --output-srt     output result in a srt file\n");
-    fprintf(stderr, "  -owts,    --output-words   output script for generating karaoke video\n");
-    fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
-    fprintf(stderr, "  -pc,      --print_colors   print colors\n");
-    fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
-    fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
-    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME     input WAV file path\n");
+    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n",    params.n_threads);
+    fprintf(stderr, "  -p N,     --processors N  [%-7d] number of processors to use during computation\n", params.n_processors);
+    fprintf(stderr, "  -ot N,    --offset-t N    [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
+    fprintf(stderr, "  -on N,    --offset-n N    [%-7d] segment index offset\n",                           params.offset_n);
+    fprintf(stderr, "  -d  N,    --duration N    [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
+    fprintf(stderr, "  -mc N,    --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
+    fprintf(stderr, "  -ml N,    --max-len N     [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -wt N,    --word-thold N  [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
+    fprintf(stderr, "  -di,      --diarize       [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -otxt,    --output-txt    [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
+    fprintf(stderr, "  -ovtt,    --output-vtt    [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
+    fprintf(stderr, "  -osrt,    --output-srt    [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
+    fprintf(stderr, "  -owts,    --output-words  [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
+    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
+    fprintf(stderr, "  -pc,      --print-colors  [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    // the bracket shows the option's current value, like every other boolean option above
+    // (the previous inverted ternary reported "true" while no_timestamps was false)
+    fprintf(stderr, "  -nt,      --no-timestamps [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
+    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                                params.language.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                     params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] input WAV file path\n",                            "");
    fprintf(stderr, "\n");
 }

+struct whisper_print_user_data { // bundle handed to whisper_print_segment_callback through its void * user_data argument
+    const whisper_params * params; // command-line parameters (not owned)
+
+    const std::vector<std::vector<float>> * pcmf32s; // per-channel F32 PCM (not owned); main() fills it only when params.diarize is set
+};
+
 void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
-    const whisper_params & params = *(whisper_params *) user_data;
+    const auto & params  = *((whisper_print_user_data *) user_data)->params;
+    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

    const int n_segments = whisper_full_n_segments(ctx);

@ -195,7 +177,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
        if (params.no_timestamps) {
            if (params.print_colors) {
                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special_tokens == false) {
+                    if (params.print_special == false) {
                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
                        if (id >= whisper_token_eot(ctx)) {
                            continue;
@ -218,10 +200,37 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

+            std::string speaker = "";
+
+            if (params.diarize && pcmf32s.size() == 2) {
+                const int64_t n_samples = pcmf32s[0].size();
+
+                const int64_t is0 = timestamp_to_sample(t0, n_samples);
+                const int64_t is1 = timestamp_to_sample(t1, n_samples);
+
+                double energy0 = 0.0f;
+                double energy1 = 0.0f;
+
+                for (int64_t j = is0; j < is1; j++) {
+                    energy0 += fabs(pcmf32s[0][j]);
+                    energy1 += fabs(pcmf32s[1][j]);
+                }
+
+                if (energy0 > 1.1*energy1) {
+                    speaker = "(speaker 0)";
+                } else if (energy1 > 1.1*energy0) {
+                    speaker = "(speaker 1)";
+                } else {
+                    speaker = "(speaker ?)";
+                }
+
+                //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
+            }
+
            if (params.print_colors) {
                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special_tokens == false) {
+                    if (params.print_special == false) {
                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
                        if (id >= whisper_token_eot(ctx)) {
                            continue;
@ -233,13 +242,13 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi

                    // clamp to the last valid index: the scaled value reaches k_colors.size() when p is 1.0,
                    // which would index one past the end of k_colors below
                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));

-                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+                    printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
                }
                printf("\n");
            } else {
                const char * text = whisper_full_get_segment_text(ctx, i);

-                printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                printf("[%s --> %s]  %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
            }
        }
    }
@ -267,7 +276,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return 9;
+        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -432,16 +441,18 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        whisper_print_usage(argc, argv, params);
        return 2;
    }

+    if (whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params);
+        exit(0);
+    }
+
    // whisper init

    struct whisper_context * ctx = whisper_init(params.model.c_str());
@ -454,53 +465,60 @@ int main(int argc, char ** argv) {
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];

+        std::vector<float> pcmf32; // mono-channel F32 PCM
+        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+
        // WAV input
-        std::vector<float> pcmf32;
        {
            drwav wav;
+            std::vector<uint8_t> wav_data; // used for pipe input from stdin

            if (fname_inp == "-") {
-                std::vector<uint8_t> wav_data;
                {
                    uint8_t buf[1024];
                    while (true)
                    {
                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                        if (n == 0)
-                        {
+                        if (n == 0) {
                            break;
                        }
                        wav_data.insert(wav_data.end(), buf, buf + n);
                    }
                }

-                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false)
-                {
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false) {
                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
                    return 4;
                }
+
+                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
            }
            else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
-                return 4;
+                return 5;
            }

            if (wav.channels != 1 && wav.channels != 2) {
                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
-                return 5;
+                return 6;
+            }
+
+            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
+                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
+                return 6;
            }

            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
-                return 6;
+                return 8;
            }

            if (wav.bitsPerSample != 16) {
                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
-                return 7;
+                return 9;
            }

-            int n = wav.totalPCMFrameCount;
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);

            std::vector<int16_t> pcm16;
            pcm16.resize(n*wav.channels);
@ -518,6 +536,18 @@ int main(int argc, char ** argv) {
                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
                }
            }
+
+            // keep the two channels separate (as float) for diarization
+            //
+            // guarded on the channel count: pcm16 holds n*wav.channels samples, so it only contains
+            // interleaved channel pairs for stereo input. A mono file can still get here with -di
+            // when timestamps are disabled, and indexing it as stereo would read past the end of pcm16.
+            // pcmf32s then stays empty, which the segment callback checks for (pcmf32s.size() == 2).
+            if (params.diarize && wav.channels == 2) {
+                pcmf32s.resize(2);
+
+                pcmf32s[0].resize(n);
+                pcmf32s[1].resize(n);
+                for (uint64_t i = 0; i < n; i++) { // same type as n, avoids a signed/unsigned comparison
+                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+                }
+            }
        }

        // print system information
@ -555,7 +585,7 @@ int main(int argc, char ** argv) {
            wparams.print_realtime   = false;
            wparams.print_progress   = false;
            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special_tokens = params.print_special_tokens;
+            wparams.print_special    = params.print_special;
            wparams.translate        = params.translate;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;
@ -569,15 +599,17 @@ int main(int argc, char ** argv) {

            wparams.speed_up         = params.speed_up;

+            whisper_print_user_data user_data = { &params, &pcmf32s };
+
            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &params;
+                wparams.new_segment_callback_user_data = &user_data;
            }

            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                return 8;
+                return 10;
            }
        }

--- a/examples/stream.wasm/CMakeLists.txt
+++ b/examples/stream.wasm/CMakeLists.txt
@ -0,0 +1,47 @@
+#
+# libstream - executable target built from emscripten.cpp and linked against the whisper library
+#
+
+set(TARGET libstream)
+
+add_executable(${TARGET}
+    emscripten.cpp
+    )
+
+target_link_libraries(${TARGET} PRIVATE
+    whisper
+    )
+
+unset(EXTRA_FLAGS) # start empty; only the single-file option below adds to it
+
+if (WHISPER_WASM_SINGLE_FILE) # adds SINGLE_FILE=1 to the link flags and copies libstream.js to stream.wasm/stream.js after the build
+    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
+    message(STATUS "Embedding WASM inside stream.js")
+
+    add_custom_command(
+        TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+        ${CMAKE_BINARY_DIR}/bin/libstream.js
+        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
+        )
+endif()
+
+set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
+    --bind \
+    -s USE_PTHREADS=1 \
+    -s PTHREAD_POOL_SIZE=8 \
+    -s INITIAL_MEMORY=1024MB \
+    -s TOTAL_MEMORY=1024MB \
+    -s FORCE_FILESYSTEM=1 \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    ${EXTRA_FLAGS} \
+    ")
+
+#
+# stream.wasm - page files (index.html, helpers.js) written to the stream.wasm directory of the runtime output
+#
+
+set(TARGET stream.wasm)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
--- a/examples/stream.wasm/README.md
+++ b/examples/stream.wasm/README.md
@ -0,0 +1,20 @@
+# stream.wasm
+
+Real-time transcription in the browser using WebAssembly
+
+Online demo: https://whisper.ggerganov.com/stream/
+
+## Build instructions
+
+```bash
+# build using Emscripten (v3.1.2)
+git clone https://github.com/ggerganov/whisper.cpp
+cd whisper.cpp
+mkdir build-em && cd build-em
+emcmake cmake ..
+make -j
+
+# copy the produced page to your HTTP path
+cp bin/stream.wasm/*       /path/to/html/
+cp bin/libstream.worker.js /path/to/html/
+```
--- a/examples/stream.wasm/emscripten.cpp
+++ b/examples/stream.wasm/emscripten.cpp
@ -0,0 +1,213 @@
+#include "ggml.h"
+#include "whisper.h"
+
+#include <emscripten.h>
+#include <emscripten/bind.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+constexpr int N_THREAD = 8;
+
+std::vector<struct whisper_context *> g_contexts(4, nullptr);
+
+std::mutex g_mutex;
+std::thread g_worker;
+
+std::atomic<bool> g_running(false);
+
+std::string g_status        = "";
+std::string g_status_forced = "";
+std::string g_transcribed   = "";
+
+std::vector<float> g_pcmf32;
+
+// Replace the status string reported by get_status().
+// g_status is only written while g_mutex is held.
+void stream_set_status(const std::string & status) {
+    const std::lock_guard<std::mutex> guard(g_mutex);
+
+    g_status = status;
+}
+
+// Worker-thread entry point for the transcription loop.
+//
+// While g_running is set, takes the most recent audio (at most 5 seconds)
+// from g_pcmf32, runs whisper_full() on it with the context stored in
+// g_contexts[index] and publishes the text of the last segment through
+// g_transcribed. Progress is reported via stream_set_status().
+//
+// The loop also ends when whisper_full() fails. On exit the context in
+// g_contexts[index] is freed and the slot is reset to nullptr.
+//
+// index - zero-based slot in g_contexts
+void stream_main(size_t index) {
+    stream_set_status("loading data ...");
+
+    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
+
+    // hardware_concurrency() is allowed to return 0 when the value is unknown - always use at least 1 thread
+    wparams.n_threads        = std::max(1, std::min(N_THREAD, (int) std::thread::hardware_concurrency()));
+    wparams.offset_ms        = 0;
+    wparams.translate        = false;
+    wparams.no_context       = true;
+    wparams.single_segment   = true;
+    wparams.print_realtime   = false;
+    wparams.print_progress   = false;
+    wparams.print_timestamps = true;
+    wparams.print_special    = false;
+
+    wparams.max_tokens       = 32;
+    wparams.audio_ctx        = 768; // partial encoder context for better performance
+
+    wparams.language         = "en";
+
+    // report the number of threads that will actually be used
+    printf("stream: using %d threads\n", wparams.n_threads);
+
+    std::vector<float> pcmf32;
+
+    // whisper context
+    auto & ctx = g_contexts[index];
+
+    // 5 seconds interval
+    const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
+
+    while (g_running) {
+        stream_set_status("waiting for audio ...");
+
+        {
+            std::unique_lock<std::mutex> lock(g_mutex);
+
+            // not enough new audio yet - release the lock and poll again in 10 ms
+            if (g_pcmf32.size() < 1024) {
+                lock.unlock();
+
+                std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+                continue;
+            }
+
+            // keep only the last window_samples samples and consume the shared buffer
+            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
+            g_pcmf32.clear();
+        }
+
+        {
+            const auto t_start = std::chrono::high_resolution_clock::now();
+
+            stream_set_status("running whisper ...");
+
+            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
+            if (ret != 0) {
+                printf("whisper_full() failed: %d\n", ret);
+                break;
+            }
+
+            const auto t_end = std::chrono::high_resolution_clock::now();
+
+            printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
+        }
+
+        {
+            std::string text_heard;
+
+            {
+                // only the last segment is reported
+                // when there are no segments at all, n_segments - 1 would be -1 - do not access it
+                const int n_segments = whisper_full_n_segments(ctx);
+                if (n_segments > 0) {
+                    const char * text = whisper_full_get_segment_text(ctx, n_segments - 1);
+
+                    printf("transcribed: %s\n", text);
+
+                    text_heard += text;
+                }
+            }
+
+            {
+                std::lock_guard<std::mutex> lock(g_mutex);
+                g_transcribed = text_heard;
+            }
+        }
+    }
+
+    // release the context owned by this worker
+    if (index < g_contexts.size()) {
+        whisper_free(g_contexts[index]);
+        g_contexts[index] = nullptr;
+    }
+}
+
+// JavaScript API of the module (exposed on the Emscripten Module object):
+//
+//   init(path_model)        -> 1-based instance handle, 0 on failure
+//   free(index)             -> asks the worker thread to stop
+//   set_audio(index, audio) -> 0 on success, -1 invalid handle, -2 no context for this handle
+//   get_transcribed()       -> latest transcribed text (returned once, then cleared)
+//   get_status()            -> forced status if set, otherwise the worker status
+//   set_status(status)      -> sets the forced status ("" to clear it)
+EMSCRIPTEN_BINDINGS(stream) {
+    // load the model into the first free slot of g_contexts and start the worker thread
+    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
+        for (size_t i = 0; i < g_contexts.size(); ++i) {
+            if (g_contexts[i] == nullptr) {
+                g_contexts[i] = whisper_init(path_model.c_str());
+                if (g_contexts[i] != nullptr) {
+                    // there is a single worker thread - stop and reap the previous one first
+                    // g_running must be cleared before join(), otherwise a worker that is
+                    // still inside its loop never exits and join() blocks forever
+                    if (g_worker.joinable()) {
+                        g_running = false;
+                        g_worker.join();
+                    }
+
+                    g_running = true;
+                    g_worker = std::thread([i]() {
+                        stream_main(i);
+                    });
+
+                    // handles are 1-based so that 0 can signal failure
+                    return i + 1;
+                } else {
+                    return (size_t) 0;
+                }
+            }
+        }
+
+        return (size_t) 0;
+    }));
+
+    // the index is not used: there is only one worker and it is controlled by the global flag
+    // the worker frees its own context when it leaves the loop
+    emscripten::function("free", emscripten::optional_override([](size_t index) {
+        g_running = false;
+    }));
+
+    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
+        // convert the 1-based handle to a slot index (handle 0 wraps around and is rejected below)
+        --index;
+
+        if (index >= g_contexts.size()) {
+            return -1;
+        }
+
+        if (g_contexts[index] == nullptr) {
+            return -2;
+        }
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            const int n = audio["length"].as<int>();
+
+            emscripten::val heap = emscripten::val::module_property("HEAPU8");
+            emscripten::val memory = heap["buffer"];
+
+            g_pcmf32.resize(n);
+
+            // create an array of the same type as `audio` that views the WASM heap at
+            // g_pcmf32.data() and copy the samples into it
+            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
+            memoryView.call<void>("set", audio);
+        }
+
+        return 0;
+    }));
+
+    emscripten::function("get_transcribed", emscripten::optional_override([]() {
+        std::string transcribed;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            transcribed = std::move(g_transcribed);
+            // a moved-from string is only guaranteed to be valid, not empty
+            g_transcribed.clear();
+        }
+
+        return transcribed;
+    }));
+
+    emscripten::function("get_status", emscripten::optional_override([]() {
+        std::string status;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            status = g_status_forced.empty() ? g_status : g_status_forced;
+        }
+
+        return status;
+    }));
+
+    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            g_status_forced = status;
+        }
+    }));
+}
--- a/examples/stream.wasm/index-tmpl.html
+++ b/examples/stream.wasm/index-tmpl.html
@ -0,0 +1,385 @@
+<!doctype html>
+<html lang="en-us">
+    <head>
+        <title>stream : Real-time Whisper transcription in WebAssembly</title>
+
+        <style>
+            #output {
+                width: 100%;
+                height: 100%;
+                margin: 0 auto;
+                margin-top: 10px;
+                border-left: 0px;
+                border-right: 0px;
+                padding-left: 0px;
+                padding-right: 0px;
+                display: block;
+                background-color: black;
+                color: white;
+                font-size: 10px;
+                font-family: 'Lucida Console', Monaco, monospace;
+                outline: none;
+                white-space: pre;
+                overflow-wrap: normal;
+                overflow-x: scroll;
+            }
+        </style>
+    </head>
+    <body>
+        <div id="main-container">
+            <b>stream : Real-time Whisper transcription in WebAssembly</b>
+
+            <br><br>
+
+            You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
+
+            <br><br>
+
+            <hr>
+
+            Select the model you would like to use, click the "Start" button and start speaking
+
+            <br><br>
+
+            <div id="model-whisper">
+                Whisper model: <span id="model-whisper-status"></span>
+                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <span id="fetch-whisper-progress"></span>
+
+                <!--
+                    <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+                -->
+            </div>
+
+            <br>
+
+            <div id="input">
+                <button id="start"  onclick="onStart()" disabled>Start</button>
+                <button id="stop"   onclick="onStop()" disabled>Stop</button>
+                <button id="clear"  onclick="clearCache()">Clear Cache</button>
+            </div>
+
+            <br>
+
+            <div id="state">
+                Status: <b><span id="state-status">not started</span></b>
+
+                <pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
+            </div>
+
+            <hr>
+
+            Debug output:
+            <textarea id="output" rows="20"></textarea>
+
+            <br>
+
+            <b>Troubleshooting</b>
+
+            <br><br>
+
+            The page does some heavy computations, so make sure:
+
+            <ul>
+                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
+                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
+                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
+            </ul>
+
+            <div class="cell-version">
+                <span>
+                    |
+                    Build time: <span class="nav-link">@GIT_DATE@</span> |
+                    Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
+                    Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
+                    <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
+                </span>
+            </div>
+        </div>
+
+        <script type="text/javascript" src="helpers.js"></script>
+        <script type='text/javascript'>
+            const kRestartRecording_s = 15;
+            const kSampleRate = 16000;
+
+            window.AudioContext = window.AudioContext || window.webkitAudioContext;
+            window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
+
+            // web audio context
+            var context = null;
+
+            // audio data
+            var audio = null;
+            var audio0 = null;
+
+            // the stream instance
+            var instance = null;
+
+            // model name
+            var model_whisper = null;
+
+            var Module = {
+                print: printTextarea,
+                printErr: printTextarea,
+                setStatus: function(text) {
+                    printTextarea('js: ' + text);
+                },
+                monitorRunDependencies: function(left) {
+                },
+                preRun: function() {
+                    printTextarea('js: Preparing ...');
+                },
+                postRun: function() {
+                    printTextarea('js: Initialized successfully!');
+                }
+            };
+
+            //
+            // fetch models
+            //
+
+            let dbVersion = 1
+            let dbName    = 'whisper.ggerganov.com';
+            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
+
+            function storeFS(fname, buf) {
+                // write to WASM file using FS_createDataFile
+                // if the file exists, delete it
+                try {
+                    Module.FS_unlink(fname);
+                } catch (e) {
+                    // ignore
+                }
+
+                Module.FS_createDataFile("/", fname, buf, true, true);
+
+                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
+
+                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
+
+                if (model_whisper != null) {
+                    document.getElementById('start').disabled = false;
+                    document.getElementById('stop' ).disabled = true;
+                }
+            }
+
+            function loadWhisper(model) {
+                let urls = {
+                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                };
+
+                let sizes = {
+                    'tiny.en': 75,
+                    'base.en': 142,
+                };
+
+                let url     = urls[model];
+                let dst     = 'whisper.bin';
+                let size_mb = sizes[model];
+
+                model_whisper = model;
+
+                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
+
+                cbProgress = function(p) {
+                    let el = document.getElementById('fetch-whisper-progress');
+                    el.innerHTML = Math.round(100*p) + '%';
+                };
+
+                cbCancel = function() {
+                    var el;
+                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
+                };
+
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
+            }
+
+            //
+            // microphone
+            //
+
+            var mediaRecorder = null;
+            var doRecording = false;
+            var startTime = 0;
+
+            function stopRecording() {
+                Module.set_status("paused");
+                doRecording = false;
+                audio0 = null;
+                audio = null;
+                context = null;
+            }
+
+            function startRecording() {
+                if (!context) {
+                    context = new AudioContext({
+                        sampleRate: 16000,
+                        channelCount: 1,
+                        echoCancellation: false,
+                        autoGainControl:  true,
+                        noiseSuppression: true,
+                    });
+                }
+
+                Module.set_status("");
+
+                document.getElementById('start').disabled = true;
+                document.getElementById('stop').disabled = false;
+
+                doRecording = true;
+                startTime = Date.now();
+
+                var chunks = [];
+                var stream = null;
+
+                navigator.mediaDevices.getUserMedia({audio: true, video: false})
+                    .then(function(s) {
+                        stream = s;
+                        mediaRecorder = new MediaRecorder(stream);
+                        mediaRecorder.ondataavailable = function(e) {
+                            chunks.push(e.data);
+
+                            var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
+                            var reader = new FileReader();
+
+                            reader.onload = function(event) {
+                                var buf = new Uint8Array(reader.result);
+
+                                if (!context) {
+                                    return;
+                                }
+                                context.decodeAudioData(buf.buffer, function(audioBuffer) {
+                                    var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
+                                    var source = offlineContext.createBufferSource();
+                                    source.buffer = audioBuffer;
+                                    source.connect(offlineContext.destination);
+                                    source.start(0);
+
+                                    offlineContext.startRendering().then(function(renderedBuffer) {
+                                        audio = renderedBuffer.getChannelData(0);
+
+                                        //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
+
+                                        var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
+                                        if (audio0 != null) {
+                                            audioAll.set(audio0, 0);
+                                        }
+                                        audioAll.set(audio, audio0 == null ? 0 : audio0.length);
+
+                                        if (instance) {
+                                            Module.set_audio(instance, audioAll);
+                                        }
+                                    });
+                                }, function(e) {
+                                    audio = null;
+                                });
+                            }
+
+                            reader.readAsArrayBuffer(blob);
+                        };
+
+                        mediaRecorder.onstop = function(e) {
+                            if (doRecording) {
+                                setTimeout(function() {
+                                    startRecording();
+                                });
+                            }
+                        };
+
+                        mediaRecorder.start(5000);
+                    })
+                    .catch(function(err) {
+                        printTextarea('js: error getting audio stream: ' + err);
+                    });
+
+                var interval = setInterval(function() {
+                    if (!doRecording) {
+                        clearInterval(interval);
+                        mediaRecorder.stop();
+                        stream.getTracks().forEach(function(track) {
+                            track.stop();
+                        });
+
+                        document.getElementById('start').disabled = false;
+                        document.getElementById('stop').disabled  = true;
+
+                        mediaRecorder = null;
+                    }
+
+                    // if audio length is more than kRestartRecording_s seconds, restart recording
+                    if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
+                        if (doRecording) {
+                            //printTextarea('js: restarting recording');
+
+                            clearInterval(interval);
+                            audio0 = audio;
+                            audio = null;
+                            mediaRecorder.stop();
+                            stream.getTracks().forEach(function(track) {
+                                track.stop();
+                            });
+                        }
+                    }
+                }, 250);
+            }
+
+            //
+            // main
+            //
+
+            var nLines = 0;
+            var intervalUpdate = null;
+            var transcribedAll = '';
+
+            function onStart() {
+                if (!instance) {
+                    instance = Module.init('whisper.bin');
+
+                    if (instance) {
+                        printTextarea("js: whisper initialized, instance: " + instance);
+                    }
+                }
+
+                if (!instance) {
+                    printTextarea("js: failed to initialize whisper");
+                    return;
+                }
+
+                startRecording();
+
+                intervalUpdate = setInterval(function() {
+                    var transcribed = Module.get_transcribed();
+
+                    if (transcribed != null && transcribed.length > 1) {
+                        transcribedAll += transcribed + '<br>';
+                        nLines++;
+
+                        // if more than 10 lines, remove the first line
+                        if (nLines > 10) {
+                            var i = transcribedAll.indexOf('<br>');
+                            if (i > 0) {
+                                transcribedAll = transcribedAll.substring(i + 4);
+                                nLines--;
+                            }
+                        }
+                    }
+
+                    document.getElementById('state-status').innerHTML = Module.get_status();
+                    document.getElementById('state-transcribed').innerHTML = transcribedAll;
+                }, 100);
+            }
+
+            // Handler of the "Stop" button: delegates to stopRecording()
+            function onStop() {
+                stopRecording();
+            }
+
+        </script>
+        <script type="text/javascript" src="stream.js"></script>
+    </body>
+</html>
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@ -21,3 +21,7 @@ brew install sdl2

 make stream
 ```
+
+## Web version
+
+This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -4,11 +4,6 @@

 #include "whisper.h"

-// third-party utilities
-// use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-
 #include <SDL.h>
 #include <SDL_audio.h>

@ -35,7 +30,6 @@ std::string to_timestamp(int64_t t) {

 // command-line parameters
 struct whisper_params {
-    int32_t seed       = -1; // RNG seed, not used currently
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t step_ms    = 3000;
    int32_t length_ms  = 10000;
@ -44,10 +38,9 @@ struct whisper_params {
    int32_t audio_ctx  = 0;

    bool speed_up      = false;
-    bool verbose              = false;
    bool translate     = false;
    bool no_context    = true;
-    bool print_special_tokens = false;
+    bool print_special = false;
    bool no_timestamps = true;

    std::string language  = "en";
@ -61,47 +54,24 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "--step") {
-            params.step_ms = std::stoi(argv[++i]);
-        } else if (arg == "--length") {
-            params.length_ms = std::stoi(argv[++i]);
-        } else if (arg == "-c" || arg == "--capture") {
-            params.capture_id = std::stoi(argv[++i]);
-        } else if (arg == "-mt" || arg == "--max_tokens") {
-            params.max_tokens = std::stoi(argv[++i]);
-        } else if (arg == "-ac" || arg == "--audio_ctx") {
-            params.audio_ctx = std::stoi(argv[++i]);
-        } else if (arg == "-su" || arg == "--speed-up") {
-            params.speed_up = true;
-        } else if (arg == "-v" || arg == "--verbose") {
-            params.verbose = true;
-        } else if (arg == "--translate") {
-            params.translate = true;
-        } else if (arg == "-kc" || arg == "--keep-context") {
-            params.no_context = false;
-        } else if (arg == "-l" || arg == "--language") {
-            params.language = argv[++i];
-            if (whisper_lang_id(params.language.c_str()) == -1) {
-                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
-        } else if (arg == "-ps" || arg == "--print_special") {
-            params.print_special_tokens = true;
-        } else if (arg == "-nt" || arg == "--no_timestamps") {
-            params.no_timestamps = true;
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-f" || arg == "--file") {
-            params.fname_out = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        } else {
+        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
+        else if (                 arg == "--step")          { params.step_ms       = std::stoi(argv[++i]); }
+        else if (                 arg == "--length")        { params.length_ms     = std::stoi(argv[++i]); }
+        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
+        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
+        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
+        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
+        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
+        else if (arg == "-kc"  || arg == "--keep-context")  { params.no_context    = false; }
+        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
+        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
+        else if (arg == "-m"   || arg == "--model")         { params.model         = argv[++i]; }
+        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
+        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
@ -116,23 +86,20 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
-    fprintf(stderr, "  -s SEED,  --seed SEED      RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "            --step N         audio step size in milliseconds (default: %d)\n", params.step_ms);
-    fprintf(stderr, "            --length N       audio length in milliseconds (default: %d)\n", params.length_ms);
-    fprintf(stderr, "  -c ID,    --capture ID     capture device ID (default: -1)\n");
-    fprintf(stderr, "  -mt N,    --max_tokens N   maximum number of tokens per audio chunk (default: %d)\n", params.max_tokens);
-    fprintf(stderr, "  -ac N,    --audio_ctx N    audio context size (default: %d, 0 - all)\n", params.audio_ctx);
-    fprintf(stderr, "  -su,      --speed-up       speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -v,       --verbose        verbose output\n");
-    fprintf(stderr, "            --translate      translate from source language to english\n");
-    fprintf(stderr, "  -kc,      --keep-context   keep text context from earlier audio (default: false)\n");
-    fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
-    fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
-    fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
-    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME     text output file name (default: no output to file)\n");
+    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(stderr, "            --step N        [%-7d] audio step size in milliseconds\n",             params.step_ms);
+    fprintf(stderr, "            --length N      [%-7d] audio length in milliseconds\n",                params.length_ms);
+    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
+    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
+    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
+    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
+    fprintf(stderr, "  -kc,      --keep-context  [%-7s] keep context between audio chunks\n",           params.no_context ? "false" : "true");
+    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
+    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                  params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "\n");
 }

@ -148,7 +115,6 @@ bool audio_sdl_init(const int capture_id) {
        return false;
    }

-    if (g_dev_id_in == 0) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);

    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
@ -165,9 +131,7 @@ bool audio_sdl_init(const int capture_id) {
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
        }
    }
-    }

-    if (g_dev_id_in == 0) {
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;

@ -196,8 +160,6 @@ bool audio_sdl_init(const int capture_id) {
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
    }
-    }
-

    return true;
 }
@ -211,10 +173,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
    // init audio

    if (!audio_sdl_init(params.capture_id)) {
@ -222,6 +180,12 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    if (whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params);
+        exit(0);
+    }
+
    // whisper init

    struct whisper_context * ctx = whisper_init(params.model.c_str());
@ -280,7 +244,8 @@ int main(int argc, char ** argv) {

    // main audio loop
    while (is_running) {
-        // process SDL events:
+        // handle Ctrl + C
+        {
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
                switch (event.type) {
@ -296,6 +261,11 @@ int main(int argc, char ** argv) {
            if (!is_running) {
                break;
            }
+        }
+
+        if (!is_running) {
+            break;
+        }

        // process new audio
        if (n_iter > 0 && SDL_GetQueuedAudioSize(g_dev_id_in) > 2*n_samples*sizeof(float)) {
@ -332,7 +302,7 @@ int main(int argc, char ** argv) {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

            wparams.print_progress   = false;
-            wparams.print_special_tokens = params.print_special_tokens;
+            wparams.print_special    = params.print_special;
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
@ -414,6 +384,10 @@ int main(int argc, char ** argv) {
        }
    }

+    if (g_dev_id_in >= 0) {
+        SDL_CloseAudioDevice(g_dev_id_in);
+    }
+
    whisper_print_timings(ctx);
    whisper_free(ctx);

--- a/examples/talk.wasm/CMakeLists.txt
+++ b/examples/talk.wasm/CMakeLists.txt
@ -45,3 +45,4 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
 set(TARGET talk.wasm)

 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
--- a/examples/talk.wasm/README.md
+++ b/examples/talk.wasm/README.md
@ -2,9 +2,9 @@

 Talk with an Artificial Intelligence in your browser:

-https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4
+[https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)

-Online demo: https://talk.ggerganov.com
+Online demo: https://whisper.ggerganov.com/talk/

 ## How it works?

@ -50,6 +50,21 @@ on a phone or a tablet. Hopefully, in the near future this will become supported
 - Better UI (contributions are welcome)
 - Better GPT-2 prompting

+## Build instructions
+
+```bash
+# build using Emscripten (v3.1.2)
+git clone https://github.com/ggerganov/whisper.cpp
+cd whisper.cpp
+mkdir build-em && cd build-em
+emcmake cmake ..
+make -j
+
+# copy the produced page to your HTTP path
+cp bin/talk.wasm/*       /path/to/html/
+cp bin/libtalk.worker.js /path/to/html/
+```
+
 ## Feedback

 If you have any comments or ideas for improvement, please drop a comment in the following discussion:
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@ -59,10 +59,10 @@ void talk_main(size_t index) {
    wparams.print_realtime   = false;
    wparams.print_progress   = false;
    wparams.print_timestamps = true;
-    wparams.print_special_tokens = false;
+    wparams.print_special    = false;

    wparams.max_tokens       = 32;
-    wparams.audio_ctx            = 768;
+    wparams.audio_ctx        = 768; // partial encoder context for better performance

    wparams.language         = "en";

@ -76,8 +76,8 @@ void talk_main(size_t index) {
    auto & ctx = g_contexts[index];

    const int64_t step_samples   = 2*WHISPER_SAMPLE_RATE;
-    const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
    const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
+    const int64_t step_ms        = (step_samples*1000)/WHISPER_SAMPLE_RATE;

    auto t_last = std::chrono::high_resolution_clock::now();

@ -111,7 +111,7 @@ void talk_main(size_t index) {
            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
        }

-        // if energy in during last second is above threshold, then skip
+        // VAD: if energy during the last second is above threshold, then skip
        {
            float energy_all = 0.0f;
            float energy_1s  = 0.0f;
@ -133,13 +133,11 @@ void talk_main(size_t index) {
            }
        }

-        talk_set_status("processing ...");
-
-        g_force_speak = false;
+        talk_set_status("processing audio (whisper)...");

        t_last = t_now;

-        {
+        if (!g_force_speak) {
            const auto t_start = std::chrono::high_resolution_clock::now();

            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
@ -156,6 +154,7 @@ void talk_main(size_t index) {
        {
            std::string text_heard;

+            if (!g_force_speak) {
                const int n_segments = whisper_full_n_segments(ctx);
                for (int i = n_segments - 1; i < n_segments; ++i) {
                    const char * text = whisper_full_get_segment_text(ctx, i);
@ -167,6 +166,9 @@ void talk_main(size_t index) {

                    text_heard += text;
                }
+            }
+
+            g_force_speak = false;

            // remove text between brackets using regex
            {
@ -190,7 +192,7 @@ void talk_main(size_t index) {
            text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
            text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");

-            talk_set_status("'" + text_heard + "' - thinking how to respond ...");
+            talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");

            const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());

--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -812,9 +812,9 @@ I'm fine, thanks. How are you?
 Thanks, I'm fine too. What are you doing?
 I'm just sitting here.
 It's a lovely day, isn't it?
-Yes, it is.
-Did you know that I'm a robot?
-I wasn't aware of that.
+Yes, it is. I love the weather this time of year.
+I wish it would rain a little bit.
+Me too.
 )";

    std::mt19937 rng;
--- a/examples/talk.wasm/index-tmpl.html
+++ b/examples/talk.wasm/index-tmpl.html
@ -35,7 +35,7 @@

            <ul>
                <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
-                <li><a href="https://github.com/ggerganov/ggml/tree/master/examples/gpt-2">OpenAI's GPT-2</a> to generate text responses</li>
+                <li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
                <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
            </ul>

@ -51,7 +51,7 @@
            <br><br>

            <div id="model-whisper">
-                <span id="model-whisper-status">Whisper model:</span>
+                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <span id="fetch-whisper-progress"></span>
@ -64,7 +64,7 @@
            <br>

            <div id="model-gpt-2">
-                <span id="model-gpt-2-status">GPT-2 model:</span>
+                GPT-2 model: <span id="model-gpt-2-status"></span>
                <button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
                <!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
                <span id="fetch-gpt-2-progress"></span>
@ -143,7 +143,7 @@

            <br><br>

-            Here is a short video of the demo in action: <a href="https://youtu.be/2om-7tFMaNs">https://youtu.be/2om-7tFMaNs</a>
+            Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>

            <br><br>

@ -158,20 +158,8 @@
            </div>
        </div>

+        <script type="text/javascript" src="helpers.js"></script>
        <script type='text/javascript'>
-            var printTextarea = (function() {
-                    var element = document.getElementById('output');
-                    if (element) element.alue = ''; // clear browser cache
-                    return function(text) {
-                        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
-                        console.log(text);
-                        if (element) {
-                            element.value += text + "\n";
-                            element.scrollTop = element.scrollHeight; // focus on bottom
-                        }
-                    };
-                })();
-
            const kRestartRecording_s = 15;
            const kSampleRate = 16000;

@ -218,6 +206,7 @@
                    if (voices.length == 0) {
                        el.innerHTML = '<option value="0">No voices available</option>';
                    } else {
+                        // populate voice list
                        var n = 0;
                        voices.forEach(function(voice, i) {
                            if (!voice.lang.startsWith('en')) return;
@ -245,17 +234,14 @@
                }
            };

-            // helper function
-            function convertTypedArray(src, type) {
-                var buffer = new ArrayBuffer(src.byteLength);
-                var baseView = new src.constructor(buffer).set(src);
-                return new type(buffer);
-            }
-
            //
            // fetch models
            //

+            let dbVersion = 1
+            let dbName    = 'whisper.ggerganov.com';
+            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
+
            function storeFS(fname, buf) {
                // write to WASM file using FS_createDataFile
                // if the file exists, delete it
@ -267,180 +253,25 @@

                Module.FS_createDataFile("/", fname, buf, true, true);

-                printTextarea('js: stored model: ' + fname + ' size: ' + buf.length);
+                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);

                if (fname == 'whisper.bin') {
-                    document.getElementById('model-whisper').innerHTML = 'Whisper model: loaded "' + model_whisper + '"!';
+                    document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
                } else if (fname == 'gpt-2.bin') {
-                    document.getElementById('model-gpt-2').innerHTML = 'GPT-2 model: loaded "' + model_gpt_2 + '"!';
+                    document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
                }

                if (model_whisper != null && model_gpt_2 != null) {
                    document.getElementById('start').disabled = false;
-                    document.getElementById('stop').disabled  = false;
+                    document.getElementById('stop' ).disabled = false;
                    document.getElementById('voice').disabled = false;
                }
            }

-            let dbVersion = 1
-            let dbName    = 'talk.ggerganov.com';
-            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
-
-            // fetch a remote file from remote URL using the Fetch API
-            async function fetchRemote(url, elProgress) {
-                printTextarea('js: downloading with fetch()...');
-
-                const response = await fetch(
-                    url,
-                    {
-                        method: 'GET',
-                        headers: {
-                            'Content-Type': 'application/octet-stream',
-                        },
-                    }
-                );
-
-                if (!response.ok) {
-                    printTextarea('js: failed to fetch ' + url);
-                    return;
-                }
-
-                const contentLength = response.headers.get('content-length');
-                const total = parseInt(contentLength, 10);
-                const reader = response.body.getReader();
-
-                var chunks = [];
-                var receivedLength = 0;
-                var progressLast = -1;
-
-                while (true) {
-                    const { done, value } = await reader.read();
-
-                    if (done) {
-                        break;
-                    }
-
-                    chunks.push(value);
-                    receivedLength += value.length;
-
-                    if (contentLength) {
-                        // update progress bar element with the new percentage
-                        elProgress.innerHTML = Math.round((receivedLength / total) * 100) + '%';
-
-                        var progressCur = Math.round((receivedLength / total) * 10);
-                        if (progressCur != progressLast) {
-                            printTextarea('js: fetching ' + 10*progressCur + '% ...');
-                            progressLast = progressCur;
-                        }
-                    }
-                }
-
-                var chunksAll = new Uint8Array(receivedLength);
-                var position = 0;
-                for (var chunk of chunks) {
-                    chunksAll.set(chunk, position);
-                    position += chunk.length;
-                }
-
-                return chunksAll;
-            }
-
-            // load remote data
-            // - check if the data is already in the IndexedDB
-            // - if not, fetch it from the remote URL and store it in the IndexedDB
-            // - store it in WASM memory
-            function loadRemote(url, dst, elProgress, size_mb) {
-                // query the storage quota and print it
-                navigator.storage.estimate().then(function (estimate) {
-                    printTextarea('js: storage quota: ' + estimate.quota + ' bytes');
-                    printTextarea('js: storage usage: ' + estimate.usage + ' bytes');
-                });
-
-                // check if the data is already in the IndexedDB
-                var request = indexedDB.open(dbName, dbVersion);
-
-                request.onupgradeneeded = function (event) {
-                    var db = event.target.result;
-                    if (db.version == 1) {
-                        var objectStore = db.createObjectStore('models', { autoIncrement: false });
-                        printTextarea('js: created IndexedDB ' + db.name + ' version ' + db.version);
-                    } else {
-                        // clear the database
-                        var objectStore = event.currentTarget.transaction.objectStore('models');
-                        objectStore.clear();
-                        printTextarea('js: cleared IndexedDB ' + db.name + ' version ' + db.version);
-                    }
-                };
-
-                request.onsuccess = function (event) {
-                    var db = event.target.result;
-                    var transaction = db.transaction(['models'], 'readonly');
-                    var objectStore = transaction.objectStore('models');
-                    var request = objectStore.get(url);
-
-                    request.onsuccess = function (event) {
-                        if (request.result) {
-                            printTextarea('js: "' + url + '" is already in the IndexedDB');
-                            storeFS(dst, request.result);
-                        } else {
-                            // data is not in the IndexedDB
-                            printTextarea('js: "' + url + '" is not in the IndexedDB');
-
-                            // alert and ask the user to confirm
-                            if (!confirm('You are about to download ' + size_mb + ' MB of data.\nThe model data will be cached in the browser for future use.\n\nPress OK to continue.')) {
-                                var el;
-                                el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
-                                el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
-                                el = document.getElementById('fetch-gpt-2-small') ;    if (el) el.style.display = 'inline-block';
-                                return;
-                            }
-
-                            fetchRemote(url, elProgress).then(function (data) {
-                                if (data) {
-                                    // store the data in the IndexedDB
-                                    var request = indexedDB.open(dbName, dbVersion);
-                                    request.onsuccess = function (event) {
-                                        var db = event.target.result;
-                                        var transaction = db.transaction(['models'], 'readwrite');
-                                        var objectStore = transaction.objectStore('models');
-                                        var request = objectStore.put(data, url);
-
-                                        request.onsuccess = function (event) {
-                                            printTextarea('js: "' + url + '" stored in the IndexedDB');
-                                            storeFS(dst, data);
-                                        };
-
-                                        request.onerror = function (event) {
-                                            printTextarea('js: failed to store "' + url + '" in the IndexedDB');
-                                        };
-                                    };
-                                }
-                            });
-                        }
-                    };
-
-                    request.onerror = function (event) {
-                        printTextarea('js: failed to get data from the IndexedDB');
-                    };
-                };
-
-                request.onerror = function (event) {
-                    printTextarea('js: failed to open IndexedDB');
-                };
-
-                request.onblocked = function (event) {
-                    printTextarea('js: failed to open IndexedDB: blocked');
-                };
-
-                request.onabort = function (event) {
-                    printTextarea('js: failed to open IndexedDB: abort');
-                };
-            }
-
            function loadWhisper(model) {
                let urls = {
-                    'tiny.en': 'https://talk.ggerganov.com/ggml-model-whisper-tiny.en.bin',
-                    'base.en': 'https://talk.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
                };

                let sizes = {
@ -450,22 +281,33 @@

                let url     = urls[model];
                let dst     = 'whisper.bin';
-                let el      = document.getElementById('fetch-whisper-progress');
                let size_mb = sizes[model];

                model_whisper = model;

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('model-whisper-status').innerHTML = 'Whisper model: loading "' + model + '" ... ';
+                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
+
+                cbProgress = function(p) {
+                    let el = document.getElementById('fetch-whisper-progress');
+                    el.innerHTML = Math.round(100*p) + '%';
+                };
+
+                cbCancel = function() {
+                    var el;
+                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
+                };

-                loadRemote(url, dst, el, size_mb);
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }

            function loadGPT2(model) {
                let urls = {
-                    'small':  'https://talk.ggerganov.com/ggml-model-gpt-2-117M.bin',
-                    'medium': 'https://talk.ggerganov.com/ggml-model-gpt-2-345M.bin',
+                    'small':  'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
+                    'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
                };

                let sizes = {
@ -475,15 +317,25 @@

                let url     = urls[model];
                let dst     = 'gpt-2.bin';
-                let el      = document.getElementById('fetch-gpt-2-progress');
                let size_mb = sizes[model];

                model_gpt_2 = model;

                document.getElementById('fetch-gpt-2-small').style.display = 'none';
-                document.getElementById('model-gpt-2-status').innerHTML = 'GPT-2 model: loading "' + model + '" ... ';
+                document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
+
+                cbProgress = function(p) {
+                    let el = document.getElementById('fetch-gpt-2-progress');
+                    el.innerHTML = Math.round(100*p) + '%';
+                };

-                loadRemote(url, dst, el, size_mb);
+                cbCancel = function() {
+                    var el;
+                    el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
+                };
+
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }

            //
@ -507,7 +359,7 @@
                    context = new AudioContext({
                        sampleRate: 16000,
                        channelCount: 1,
-                        echoCancellation: true,
+                        echoCancellation: false,
                        autoGainControl:  true,
                        noiseSuppression: true,
                    });
@ -652,12 +504,6 @@
                Module.force_speak(instance);
            }

-            async function clearCache() {
-                if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
-                    indexedDB.deleteDatabase(dbName);
-                }
-            }
-
            //
            // main
            //
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -164,7 +164,7 @@ void AudioInputCallback(void * inUserData,
    params.print_realtime   = true;
    params.print_progress   = false;
    params.print_timestamps = true;
-    params.print_special_tokens = false;
+    params.print_special    = false;
    params.translate        = false;
    params.language         = "en";
    params.n_threads        = 4;
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -1,4 +1,5 @@
 set(TARGET whisper.wasm)

 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
 configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js  COPYONLY)
--- a/examples/whisper.wasm/README.md
+++ b/examples/whisper.wasm/README.md
@ -26,10 +26,9 @@ Link: https://whisper.ggerganov.com

 ![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png)

-
 ## Build instructions

 ```bash
-# build using Emscripten
+# build using Emscripten (v3.1.2)
 git clone https://github.com/ggerganov/whisper.cpp
 cd whisper.cpp
@ -38,6 +37,6 @@ emcmake cmake ..
 make -j

 # copy the produced page to your HTTP path
-cp bin/whisper.wasm/index.html /path/to/html/
-cp bin/whisper.wasm/whisper.js /path/to/html/
+cp bin/whisper.wasm/*       /path/to/html/
 cp bin/libwhisper.worker.js /path/to/html/
+```
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -45,13 +45,14 @@
            <br><br><hr>

            <div id="model">
-                Model:
+                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-tiny"    onclick="loadWhisper('tiny')">tiny (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <button id="fetch-whisper-base"    onclick="loadWhisper('base')">base (142 MB)</button>
                <span id="fetch-whisper-progress"></span>
-                <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+
+                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
            </div>

            <br>
@ -185,6 +186,7 @@
            </div>
        </div>

+        <script type="text/javascript" src="helpers.js"></script>
        <script type='text/javascript'>
            // TODO: convert audio buffer to WAV
            function setAudio(audio) {
@ -204,28 +206,15 @@
            function changeInput(input) {
                if (input == 'file') {
                    document.getElementById('input_file').style.display = 'block';
-                    document.getElementById('input_mic').style.display = 'none';
-                    document.getElementById('progress').style.display = 'none';
+                    document.getElementById('input_mic' ).style.display = 'none';
+                    document.getElementById('progress'  ).style.display = 'none';
                } else {
                    document.getElementById('input_file').style.display = 'none';
-                    document.getElementById('input_mic').style.display = 'block';
-                    document.getElementById('progress').style.display = 'block';
+                    document.getElementById('input_mic' ).style.display = 'block';
+                    document.getElementById('progress'  ).style.display = 'block';
                }
            }

-            var printTextarea = (function() {
-                    var element = document.getElementById('output');
-                    if (element) element.alue = ''; // clear browser cache
-                    return function(text) {
-                        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
-                        console.log(text);
-                        if (element) {
-                            element.value += text + "\n";
-                            element.scrollTop = element.scrollHeight; // focus on bottom
-                        }
-                    };
-                })();
-
            var Module = {
                print: printTextarea,
                printErr: printTextarea,
@ -250,7 +239,7 @@

            // the whisper instance
            var instance = null;
-            var model_fname = '';
+            var model_whisper = '';

            // helper function
            function convertTypedArray(src, type) {
@ -278,8 +267,11 @@

                Module.FS_createDataFile("/", fname, buf, true, true);

-                model_fname = fname;
-                printTextarea('js: stored model: ' + fname + ' size: ' + buf.length);
+                model_whisper = fname;
+
+                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
+
+                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
            }

            function loadFile(event, fname) {
@ -288,8 +280,8 @@
                    return;
                }

-                printTextarea("js: loading model: " + file.name + ", size: " + file.size + " bytes");
-                printTextarea('js: please wait ...');
+                printTextarea("loadFile: loading model: " + file.name + ", size: " + file.size + " bytes");
+                printTextarea('loadFile: please wait ...');

                var reader = new FileReader();
                reader.onload = function(event) {
@ -300,160 +292,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('fetch-whisper-tiny').style.display = 'none';
-                document.getElementById('fetch-whisper-base').style.display = 'none';
-            }
-
-            // fetch a remote file from remote URL using the Fetch API
-            async function fetchRemote(url, elProgress) {
-                printTextarea('js: downloading with fetch()...');
-
-                const response = await fetch(
-                    url,
-                    {
-                        method: 'GET',
-                        headers: {
-                            'Content-Type': 'application/octet-stream',
-                        },
-                    }
-                );
-
-                if (!response.ok) {
-                    printTextarea('js: failed to fetch ' + url);
-                    return;
-                }
-
-                const contentLength = response.headers.get('content-length');
-                const total = parseInt(contentLength, 10);
-                const reader = response.body.getReader();
-
-                var chunks = [];
-                var receivedLength = 0;
-                var progressLast = -1;
-
-                while (true) {
-                    const { done, value } = await reader.read();
-
-                    if (done) {
-                        break;
-                    }
-
-                    chunks.push(value);
-                    receivedLength += value.length;
-
-                    if (contentLength) {
-                        // update progress bar element with the new percentage
-                        elProgress.innerHTML = Math.round((receivedLength / total) * 100) + '%';
-
-                        var progressCur = Math.round((receivedLength / total) * 10);
-                        if (progressCur != progressLast) {
-                            printTextarea('js: fetching ' + 10*progressCur + '% ...');
-                            progressLast = progressCur;
-                        }
-                    }
-                }
-
-                var chunksAll = new Uint8Array(receivedLength);
-                var position = 0;
-                for (var chunk of chunks) {
-                    chunksAll.set(chunk, position);
-                    position += chunk.length;
-                }
-
-                return chunksAll;
-            }
-
-            // load remote data
-            // - check if the data is already in the IndexedDB
-            // - if not, fetch it from the remote URL and store it in the IndexedDB
-            // - store it in WASM memory
-            function loadRemote(url, dst, elProgress, size_mb) {
-                // query the storage quota and print it
-                navigator.storage.estimate().then(function (estimate) {
-                    printTextarea('js: storage quota: ' + estimate.quota + ' bytes');
-                    printTextarea('js: storage usage: ' + estimate.usage + ' bytes');
-                });
-
-                // check if the data is already in the IndexedDB
-                var request = indexedDB.open(dbName, dbVersion);
-
-                request.onupgradeneeded = function (event) {
-                    var db = event.target.result;
-                    if (db.version == 1) {
-                        var objectStore = db.createObjectStore('models', { autoIncrement: false });
-                        printTextarea('js: created IndexedDB ' + db.name + ' version ' + db.version);
-                    } else {
-                        // clear the database
-                        var objectStore = event.currentTarget.transaction.objectStore('models');
-                        objectStore.clear();
-                        printTextarea('js: cleared IndexedDB ' + db.name + ' version ' + db.version);
-                    }
-                };
-
-                request.onsuccess = function (event) {
-                    var db = event.target.result;
-                    var transaction = db.transaction(['models'], 'readonly');
-                    var objectStore = transaction.objectStore('models');
-                    var request = objectStore.get(url);
-
-                    request.onsuccess = function (event) {
-                        if (request.result) {
-                            printTextarea('js: "' + url + '" is already in the IndexedDB');
-                            storeFS(dst, request.result);
-                        } else {
-                            // data is not in the IndexedDB
-                            printTextarea('js: "' + url + '" is not in the IndexedDB');
-
-                            // alert and ask the user to confirm
-                            if (!confirm('You are about to download ' + size_mb + ' MB of data.\nThe model data will be cached in the browser for future use.\n\nPress OK to continue.')) {
-                                var el;
-                                el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
-                                el = document.getElementById('fetch-whisper-tiny'); if (el) el.style.display = 'inline-block';
-                                el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
-                                el = document.getElementById('fetch-whisper-base'); if (el) el.style.display = 'inline-block';
-                                return;
-                            }
-
-                            fetchRemote(url, elProgress).then(function (data) {
-                                if (data) {
-                                    // store the data in the IndexedDB
-                                    var request = indexedDB.open(dbName, dbVersion);
-                                    request.onsuccess = function (event) {
-                                        var db = event.target.result;
-                                        var transaction = db.transaction(['models'], 'readwrite');
-                                        var objectStore = transaction.objectStore('models');
-                                        var request = objectStore.put(data, url);
-
-                                        request.onsuccess = function (event) {
-                                            printTextarea('js: "' + url + '" stored in the IndexedDB');
-                                            storeFS(dst, data);
-                                        };
-
-                                        request.onerror = function (event) {
-                                            printTextarea('js: failed to store "' + url + '" in the IndexedDB');
-                                        };
-                                    };
-                                }
-                            });
-                        }
-                    };
-
-                    request.onerror = function (event) {
-                        printTextarea('js: failed to get data from the IndexedDB');
-                    };
-                };
-
-                request.onerror = function (event) {
-                    printTextarea('js: failed to open IndexedDB');
-                };
-
-                request.onblocked = function (event) {
-                    printTextarea('js: failed to open IndexedDB: blocked');
-                };
-
-                request.onabort = function (event) {
-                    printTextarea('js: failed to open IndexedDB: abort');
-                };
+                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
+                document.getElementById('whisper-file'         ).style.display = 'none';
+                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
            }

            function loadWhisper(model) {
@ -473,17 +315,33 @@

                let url     = urls[model];
                let dst     = 'whisper.bin';
-                let el      = document.getElementById('fetch-whisper-progress');
                let size_mb = sizes[model];

                model_whisper = model;

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('fetch-whisper-tiny').style.display = 'none';
-                document.getElementById('fetch-whisper-base').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
+                document.getElementById('whisper-file'         ).style.display = 'none';
+                document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
+
+                cbProgress = function(p) {
+                    let el = document.getElementById('fetch-whisper-progress');
+                    el.innerHTML = Math.round(100*p) + '%';
+                };
+
+                cbCancel = function() {
+                    var el;
+                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-tiny'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('whisper-file'         ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
+                };

-                loadRemote(url, dst, el, size_mb);
+                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
            }

            //
@ -651,7 +509,7 @@

                    if (instance) {
                        printTextarea("js: whisper initialized, instance: " + instance);
-                        document.getElementById('model').innerHTML = 'Model loaded: ' + model_fname;
+                        document.getElementById('model').innerHTML = 'Model loaded: ' + model_whisper;
                    }
                }

@ -668,7 +526,6 @@
                if (instance) {
                    printTextarea('');
                    printTextarea('js: processing - this might take a while ...');
-                    printTextarea('js: the page will be unresponsive until the processing is completed');
                    printTextarea('');

                    setTimeout(function() {
--- a/models/README.md
+++ b/models/README.md
@ -41,5 +41,24 @@ https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main

 ## Model files for testing purposes

-The model files pefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for testing purposes.
-They are directly included in this repository for convenience and the Github Actions CI uses them to run various sanitizer tests.
+The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for
+testing purposes. They are directly included in this repository for convenience and the Github Actions CI uses them to
+run various sanitizer tests.
+
+## Fine-tuned models
+
+There are community efforts for creating fine-tuned Whisper models using extra training data. For example, this
+[blog post](https://huggingface.co/blog/fine-tune-whisper) describes a method for fine-tuning using the Hugging Face (HF)
+Transformer implementation of Whisper. The produced models are in a slightly different format compared to the original
+OpenAI format. To read the HF models you can use the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script like this:
+
+```bash
+git clone https://github.com/openai/whisper
+git clone https://github.com/ggerganov/whisper.cpp
+
+# clone HF fine-tuned model (this is just an example)
+git clone https://huggingface.co/openai/whisper-medium
+
+# convert the model to ggml
+python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
+```
--- a/whisper.cpp
+++ b/whisper.cpp
@ -518,15 +518,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
        wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
        wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
        wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-                   wctx.buf_model->size() +
-                   wctx.buf_memory.size() +
-                   wctx.buf_compute.size() +
-                   wctx.buf_compute_layer.size();
-
-        fprintf(stderr, "%s: mem_required  = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
    }

    // load mel filters
@ -599,11 +590,21 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
        }
    }

+    {
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+                   wctx.buf_model->size() +
+                   wctx.buf_memory.size() +
+                   wctx.buf_compute.size() +
+                   wctx.buf_compute_layer.size();
+
+        fprintf(stderr, "%s: mem_required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+    }
+
    // for the big tensors, we have the option to store the data in 16-bit floats
    // in order to save memory and also to speed up the computation
    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;

-
    size_t ctx_size = 0;
    size_t ctx_mem_size = 0;

@ -722,7 +723,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead

-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }

    // create the ggml context
@ -983,7 +984,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            ggml_nbytes(model.memory_k)       + ggml_nbytes(model.memory_v) +
            ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);

-        fprintf(stderr, "%s: memory size = %8.2f MB\n", __func__, memory_size/1024.0/1024.0);
+        fprintf(stderr, "%s: memory size   = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
    }

    // load weights
@ -1047,7 +1048,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            model.n_loaded++;
        }

-        fprintf(stderr, "%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+        fprintf(stderr, "%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);

        if (model.n_loaded == 0) {
            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
@ -2379,6 +2380,12 @@ void whisper_print_timings(struct whisper_context * ctx) {
    fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }

+void whisper_reset_timings(struct whisper_context * ctx) {
+    ctx->t_sample_us = 0;
+    ctx->t_encode_us = 0;
+    ctx->t_decode_us = 0;
+}
+
 ////////////////////////////////////////////////////////////////////////////

 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2398,7 +2405,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                    /*.translate        =*/ false,
                    /*.no_context       =*/ false,
                    /*.single_segment   =*/ false,
-                    /*.print_special_tokens =*/ false,
+                    /*.print_special    =*/ false,
                    /*.print_progress   =*/ true,
                    /*.print_realtime   =*/ false,
                    /*.print_timestamps =*/ true,
@ -2444,7 +2451,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                    /*.translate        =*/ false,
                    /*.no_context       =*/ false,
                    /*.single_segment   =*/ false,
-                    /*.print_special_tokens =*/ false,
+                    /*.print_special    =*/ false,
                    /*.print_progress   =*/ true,
                    /*.print_realtime   =*/ false,
                    /*.print_timestamps =*/ true,
@ -2761,7 +2768,7 @@ int whisper_full(
                //        ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
                //        ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);

-                if (params.print_special_tokens == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
+                if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
                } else {
                    text += whisper_token_to_str(ctx, tokens_cur[i].id);
                }
--- a/whisper.h
+++ b/whisper.h
@ -167,6 +167,7 @@ extern "C" {

    // Performance information
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
+    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

    ////////////////////////////////////////////////////////////////////////////

@ -192,7 +193,7 @@ extern "C" {
        bool translate;
        bool no_context;
        bool single_segment; // force single segment output (useful for streaming)
-        bool print_special_tokens;
+        bool print_special;
        bool print_progress;
        bool print_realtime;
        bool print_timestamps;