From cb70b07db5a5a1ee41aa6ed4859e35908fc2d120 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 10:05:37 +0200 Subject: [PATCH 01/18] livestream.sh : simple tool to transcribe audio livestreams (#185) --- examples/livestream.sh | 69 +++++++++++++++++++++++++++++ examples/stream.wasm/emscripten.cpp | 2 +- examples/talk.wasm/emscripten.cpp | 2 +- 3 files changed, 71 insertions(+), 2 deletions(-) create mode 100755 examples/livestream.sh diff --git a/examples/livestream.sh b/examples/livestream.sh new file mode 100755 index 0000000..18893a3 --- /dev/null +++ b/examples/livestream.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals +# Idea by @semiformal-net +# ref: https://github.com/ggerganov/whisper.cpp/issues/185 +# +# TODO: +# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a +# way to produce a continuous stream of audio chunks. +# + +url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8" +step_ms=10000 +model="base.en" + +if [ -z "$1" ]; then + echo "Usage: $0 stream_url [step_ms] [model]" + echo "" + echo " Example:" + echo " $0 $url $step_ms $model" + echo "" + echo "No url specified, using default: $url" +else + url="$1" +fi + +if [ -n "$2" ]; then + step_ms="$2" +fi + +if [ -n "$3" ]; then + model="$3" +fi + +# Whisper models +models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +running=1 + +trap "running=0" SIGINT SIGTERM + +printf "[+] Transcribing stream with model '$model', step_ms $step_ms (press Ctrl+C to stop):\n\n" + +while [ $running -eq 1 ]; do + ffmpeg -y -re -probesize 32 -i $url -ar 16000 -ac 1 -c:a pcm_s16le -t ${step_ms}ms /tmp/whisper-live0.wav > /dev/null 2> /tmp/whisper-live.err + if [ $? 
-ne 0 ]; then + printf "Error: ffmpeg failed to capture audio stream\n" + exit 1 + fi + mv /tmp/whisper-live0.wav /tmp/whisper-live.wav + ./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1 & +done diff --git a/examples/stream.wasm/emscripten.cpp b/examples/stream.wasm/emscripten.cpp index f8e3e27..b75eee3 100644 --- a/examples/stream.wasm/emscripten.cpp +++ b/examples/stream.wasm/emscripten.cpp @@ -51,7 +51,7 @@ void stream_main(size_t index) { wparams.language = "en"; - printf("stream: using %d threads\n", N_THREAD); + printf("stream: using %d threads\n", wparams.n_threads); std::vector<float> pcmf32; diff --git a/examples/talk.wasm/emscripten.cpp b/examples/talk.wasm/emscripten.cpp index 501c459..c82f469 100644 --- a/examples/talk.wasm/emscripten.cpp +++ b/examples/talk.wasm/emscripten.cpp @@ -68,7 +68,7 @@ void talk_main(size_t index) { g_gpt2 = gpt2_init("gpt-2.bin"); - printf("talk: using %d threads\n", N_THREAD); + printf("talk: using %d threads\n", wparams.n_threads); std::vector<float> pcmf32; From c536ff40051502d5692cf3467d40add40a2ca45f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 10:22:42 +0200 Subject: [PATCH 02/18] minor : add comment for using "generate_karaoke.sh" --- examples/generate-karaoke.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/generate-karaoke.sh b/examples/generate-karaoke.sh index c5e7c44..7062c9a 100755 --- a/examples/generate-karaoke.sh +++ b/examples/generate-karaoke.sh @@ -1,5 +1,16 @@ #!/bin/bash +# Simple tool to record audio from the microphone and generate a karaoke video +# Usage: +# +# cd whisper.cpp +# make +# +# ./examples/generate-karaoke.sh [model] [step_ms] +# +# Press Ctrl+C to stop recording +# + executable="./main" model="base.en" model_path="models/ggml-$model.bin" From 68ecadbbc9ff7a969545983f8fa3ca91d4f6e614 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 11:40:06 +0200 Subject: [PATCH 03/18] command.wasm : add voice assistant example for the Web (#171) Same as the command-line tool "command", but runs in the browser Also, added helper script "extra/deploy-wasm.sh" and fixed some timing constants for the WASM examples.
--- README.md | 2 +- examples/CMakeLists.txt | 1 + examples/command.wasm/CMakeLists.txt | 47 +++ examples/command.wasm/README.md | 23 ++ examples/command.wasm/emscripten.cpp | 408 ++++++++++++++++++++++++++ examples/command.wasm/index-tmpl.html | 386 ++++++++++++++++++++++++ examples/command/README.md | 2 + examples/command/command.cpp | 2 +- examples/stream.wasm/index-tmpl.html | 19 +- examples/talk.wasm/index-tmpl.html | 19 +- examples/whisper.wasm/index-tmpl.html | 28 +- extra/deploy-wasm.sh | 30 ++ 12 files changed, 939 insertions(+), 28 deletions(-) create mode 100644 examples/command.wasm/CMakeLists.txt create mode 100644 examples/command.wasm/README.md create mode 100644 examples/command.wasm/emscripten.cpp create mode 100644 examples/command.wasm/index-tmpl.html create mode 100755 extra/deploy-wasm.sh diff --git a/README.md b/README.md index 5c22979..ab0d882 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ As an example, here is a video of running the model on an iPhone 13 device - ful https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4 -You can also easily make your own offline voice assistant application: +You can also easily make your own offline voice assistant application: [command](examples/command) https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e798d1f..b03694e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,6 +21,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) add_subdirectory(whisper.wasm) add_subdirectory(stream.wasm) + add_subdirectory(command.wasm) add_subdirectory(talk.wasm) else() add_subdirectory(main) diff --git a/examples/command.wasm/CMakeLists.txt b/examples/command.wasm/CMakeLists.txt new file mode 100644 index 0000000..27fd0ab --- /dev/null +++ b/examples/command.wasm/CMakeLists.txt @@ -0,0 +1,47 @@ +# +# libcommand +# + +set(TARGET libcommand) + +add_executable(${TARGET} + emscripten.cpp + ) + +target_link_libraries(${TARGET} PRIVATE + whisper + ) + +unset(EXTRA_FLAGS) + +if (WHISPER_WASM_SINGLE_FILE) + set(EXTRA_FLAGS "-s SINGLE_FILE=1") + message(STATUS "Embedding WASM inside command.js") + + add_custom_command( + TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_BINARY_DIR}/bin/libcommand.js + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js + ) +endif() + +set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ + --bind \ + -s USE_PTHREADS=1 \ + -s PTHREAD_POOL_SIZE=8 \ + -s INITIAL_MEMORY=1024MB \ + -s TOTAL_MEMORY=1024MB \ + -s FORCE_FILESYSTEM=1 \ + -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ + ${EXTRA_FLAGS} \ + ") + +# +# command.wasm +# + +set(TARGET command.wasm) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY) diff --git a/examples/command.wasm/README.md b/examples/command.wasm/README.md new file mode 100644 index 0000000..a6e0cf1 --- /dev/null +++ b/examples/command.wasm/README.md @@ -0,0 +1,23 @@ +# command.wasm + +This is a basic Voice Assistant example that accepts voice commands from the microphone. +It runs in fully in the browser via WebAseembly. 
+ +Online demo: https://whisper.ggerganov.com/command/ + +Terminal version: https://github.com/ggerganov/whisper.cpp/examples/command + +## Build instructions + +```bash +# build using Emscripten (v3.1.2) +git clone https://github.com/ggerganov/whisper.cpp +cd whisper.cpp +mkdir build-em && cd build-em +emcmake cmake .. +make -j + +# copy the produced page to your HTTP path +cp bin/command.wasm/* /path/to/html/ +cp bin/libcommand.worker.js /path/to/html/ +``` diff --git a/examples/command.wasm/emscripten.cpp b/examples/command.wasm/emscripten.cpp new file mode 100644 index 0000000..d4bbb21 --- /dev/null +++ b/examples/command.wasm/emscripten.cpp @@ -0,0 +1,408 @@ +#include "ggml.h" +#include "whisper.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +constexpr int N_THREAD = 8; + +std::vector g_contexts(4, nullptr); + +std::mutex g_mutex; +std::thread g_worker; + +std::atomic g_running(false); + +std::string g_status = ""; +std::string g_status_forced = ""; +std::string g_transcribed = ""; + +std::vector g_pcmf32; + +static std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +static void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { + const float rc = 1.0f / (2.0f * M_PI * cutoff); + const float dt = 1.0f / sample_rate; + const float alpha = dt / (rc + dt); + + float y = data[0]; + + for (size_t i = 1; i < data.size(); i++) { + y = alpha * (y + data[i] - data[i - 1]); + data[i] = y; + } +} + +// compute similarity between two strings using Levenshtein distance +static float similarity(const std::string & s0, const std::string & s1) { + const size_t len0 = s0.size() + 1; + const size_t len1 = s1.size() + 1; + + std::vector col(len1, 0); + std::vector prevCol(len1, 0); + + for (size_t i = 0; i < len1; i++) { + prevCol[i] = i; + } + + for (size_t i = 0; i < len0; i++) { + col[0] = i; + for (size_t j = 1; j < len1; j++) { + col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 
0 : 1)); + } + col.swap(prevCol); + } + + const float dist = prevCol[len1 - 1]; + + return 1.0f - (dist / std::max(s0.size(), s1.size())); +} + +void command_set_status(const std::string & status) { + std::lock_guard lock(g_mutex); + g_status = status; +} + +bool command_vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { + const int n_samples = pcmf32.size(); + const int n_samples_last = (sample_rate * last_ms) / 1000; + + if (n_samples_last >= n_samples) { + // not enough samples - assume no speech + return false; + } + + if (freq_thold > 0.0f) { + high_pass_filter(pcmf32, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + float energy_last = 0.0f; + + for (size_t i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + + if (i >= n_samples - n_samples_last) { + energy_last += fabsf(pcmf32[i]); + } + } + + energy_all /= n_samples; + energy_last /= n_samples_last; + + if (verbose) { + fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); + } + + if (energy_last > vad_thold*energy_all) { + return false; + } + + return true; +} + +std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector & pcmf32, float & prob, int64_t & t_ms) { + const auto t_start = std::chrono::high_resolution_clock::now(); + + prob = 0.0f; + t_ms = 0; + + if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { + return ""; + } + + int prob_n = 0; + std::string result; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + + result += text; + + const int n_tokens = whisper_full_n_tokens(ctx, i); + for (int j = 0; j < n_tokens; ++j) { + const auto token = whisper_full_get_token_data(ctx, i, j); + + prob += token.p; + ++prob_n; + } + } + + if (prob_n > 0) { + prob /= prob_n; + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + t_ms = std::chrono::duration_cast(t_end - t_start).count(); + + return result; +} + +void command_get_audio(int ms, int sample_rate, std::vector & audio) { + const int64_t n_samples = (ms * sample_rate) / 1000; + + int64_t n_take = 0; + if (g_pcmf32.size() < n_samples) { + n_take = g_pcmf32.size(); + } else { + n_take = n_samples; + } + + audio.resize(n_take); + std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin()); +} + +void command_main(size_t index) { + command_set_status("loading data ..."); + + struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); + + wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); + wparams.offset_ms = 0; + wparams.translate = false; + wparams.no_context = true; + wparams.single_segment = true; + wparams.print_realtime = false; + wparams.print_progress = false; + wparams.print_timestamps = true; + wparams.print_special = false; + + wparams.max_tokens = 32; + wparams.audio_ctx = 768; // partial encoder context for better performance + + wparams.language = "en"; + + printf("command: using %d threads\n", wparams.n_threads); + + bool is_running = true; + bool have_prompt = false; + bool ask_prompt = true; + bool print_energy = false; + + float prob0 = 0.0f; + float prob = 0.0f; + + std::vector pcmf32_cur; + std::vector pcmf32_prompt; + + const std::string k_prompt = "Ok Whisper, start listening for commands."; + + // 
whisper context + auto & ctx = g_contexts[index]; + + const int32_t vad_ms = 2000; + const int32_t prompt_ms = 5000; + const int32_t command_ms = 4000; + + const float vad_thold = 0.1f; + const float freq_thold = -1.0f; + + while (g_running) { + // delay + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + if (ask_prompt) { + fprintf(stdout, "\n"); + fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m"); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str()); + command_set_status(txt); + } + + ask_prompt = false; + } + + int64_t t_ms = 0; + + { + command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) { + fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); + command_set_status("Speech detected! Processing ..."); + + if (!have_prompt) { + command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms)); + + fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms); + + const float sim = similarity(txt, k_prompt); + + if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) { + fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__); + ask_prompt = true; + } else { + fprintf(stdout, "\n"); + fprintf(stdout, "%s: The prompt has been recognized!\n", __func__); + fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Success! 
Waiting for voice commands ..."); + command_set_status(txt); + } + + // save the audio for the prompt + pcmf32_prompt = pcmf32_cur; + have_prompt = true; + } + } else { + command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + // prepend the prompt audio + pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end()); + + const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms)); + + prob = 100.0f*(prob - prob0); + + fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str()); + + // find the prompt in the text + float best_sim = 0.0f; + size_t best_len = 0; + for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) { + const auto prompt = txt.substr(0, n); + + const float sim = similarity(prompt, k_prompt); + + //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim); + + if (sim > best_sim) { + best_sim = sim; + best_len = n; + } + } + + const std::string command = ::trim(txt.substr(best_len)); + + fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms); + command_set_status(txt); + } + { + std::lock_guard lock(g_mutex); + g_transcribed = command; + } + } + + g_pcmf32.clear(); + } + } + } + + if (index < g_contexts.size()) { + whisper_free(g_contexts[index]); + g_contexts[index] = nullptr; + } +} + +EMSCRIPTEN_BINDINGS(command) { + emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { + for (size_t i = 0; i < g_contexts.size(); ++i) { + if (g_contexts[i] == nullptr) { + g_contexts[i] = whisper_init(path_model.c_str()); + if (g_contexts[i] != nullptr) { + g_running = true; + if (g_worker.joinable()) { + g_worker.join(); + } + g_worker = std::thread([i]() { + command_main(i); + }); + + return i + 1; + } else { + return (size_t) 0; + } + } + } + + return (size_t) 0; + })); + + emscripten::function("free", emscripten::optional_override([](size_t index) { + if (g_running) { + g_running = false; + } + })); + + emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { + --index; + + if (index >= g_contexts.size()) { + return -1; + } + + if (g_contexts[index] == nullptr) { + return -2; + } + + { + std::lock_guard lock(g_mutex); + const int n = audio["length"].as(); + + emscripten::val heap = emscripten::val::module_property("HEAPU8"); + emscripten::val memory = heap["buffer"]; + + g_pcmf32.resize(n); + + emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast(g_pcmf32.data()), n); + memoryView.call("set", audio); + } + + return 0; + })); + + emscripten::function("get_transcribed", emscripten::optional_override([]() { + std::string transcribed; + + { + std::lock_guard lock(g_mutex); + transcribed = std::move(g_transcribed); + } + + return transcribed; + })); + + emscripten::function("get_status", emscripten::optional_override([]() { + std::string status; + + { + std::lock_guard lock(g_mutex); + status = g_status_forced.empty() ? 
g_status : g_status_forced; + } + + return status; + })); + + emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { + { + std::lock_guard lock(g_mutex); + g_status_forced = status; + } + })); +} diff --git a/examples/command.wasm/index-tmpl.html b/examples/command.wasm/index-tmpl.html new file mode 100644 index 0000000..08670a1 --- /dev/null +++ b/examples/command.wasm/index-tmpl.html @@ -0,0 +1,386 @@ + + + + command : Voice assistant example using Whisper + WebAssembly + + + + +
[examples/command.wasm/index-tmpl.html: page markup and inline script omitted; the visible page text is:]

command : Voice assistant example using Whisper + WebAssembly

You can find more about this project on GitHub.

Select the model you would like to use, click the "Start" button and follow the instructions.

Whisper model: (model selection buttons)

Status: not started

[The recognized voice commands will be displayed here]

Debug output:

Troubleshooting

The page does some heavy computations, so make sure:
• To use a modern web browser (e.g. Chrome, Firefox)
• To use a fast desktop or laptop computer (i.e. not a mobile phone)
• Your browser supports WASM Fixed-width SIMD

| Build time: @GIT_DATE@ | Commit hash: @GIT_SHA1@ | Commit subject: @GIT_COMMIT_SUBJECT@ | Source Code |
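Since the template's script is not reproduced above, here is a minimal sketch of how a page can drive the bindings exported by `emscripten.cpp` (`init`, `set_audio`, `get_transcribed`, `get_status`). This is an illustration, not the actual template code: the element id, the polling interval, and the microphone plumbing that produces 16 kHz mono `Float32Array` samples are assumptions.

```javascript
// Hypothetical usage sketch of the libcommand module (command.js built above).
// Assumes the Emscripten runtime has finished loading and that onAudio() is
// fed 16 kHz mono Float32Array samples from the Web Audio API (not shown).

let instance = 0;

function startCommand(modelPath) {
    // init() returns a 1-based instance index, or 0 on failure
    instance = Module.init(modelPath);
    if (!instance) {
        console.error('command: failed to initialize whisper context');
        return;
    }

    // poll the background worker for status updates and recognized commands
    setInterval(function () {
        const status      = Module.get_status();
        const transcribed = Module.get_transcribed(); // cleared on each call

        if (transcribed && transcribed.length > 0) {
            console.log('command: recognized "' + transcribed + '"');
        }

        // 'state-status' is a hypothetical element id, not one from the template
        document.getElementById('state-status').innerHTML = status;
    }, 100);
}

// feed fresh microphone audio to the module
function onAudio(samples) {
    if (instance) {
        Module.set_audio(instance, samples);
    }
}
```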
diff --git a/examples/command/README.md b/examples/command/README.md index 3ef7368..de8b61c 100644 --- a/examples/command/README.md +++ b/examples/command/README.md @@ -13,6 +13,8 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/ https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4 +Web version: https://github.com/ggerganov/whisper.cpp/examples/command.wasm + ## Building The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this: diff --git a/examples/command/command.cpp b/examples/command/command.cpp index 2e47be0..9cc6dce 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -535,7 +535,7 @@ int main(int argc, char ** argv) { bool is_running = true; bool have_prompt = false; - bool ask_prompt = true; + bool ask_prompt = true; float prob0 = 0.0f; float prob = 0.0f; diff --git a/examples/stream.wasm/index-tmpl.html b/examples/stream.wasm/index-tmpl.html index cd72b6f..2033d96 100644 --- a/examples/stream.wasm/index-tmpl.html +++ b/examples/stream.wasm/index-tmpl.html @@ -100,12 +100,6 @@