From cb70b07db5a5a1ee41aa6ed4859e35908fc2d120 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 10:05:37 +0200 Subject: [PATCH 01/18] livestream.sh : simple tool to transcribe audio livestreams (#185) --- examples/livestream.sh | 69 +++++++++++++++++++++++++++++ examples/stream.wasm/emscripten.cpp | 2 +- examples/talk.wasm/emscripten.cpp | 2 +- 3 files changed, 71 insertions(+), 2 deletions(-) create mode 100755 examples/livestream.sh diff --git a/examples/livestream.sh b/examples/livestream.sh new file mode 100755 index 0000000..18893a3 --- /dev/null +++ b/examples/livestream.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals +# Idea by @semiformal-net +# ref: https://github.com/ggerganov/whisper.cpp/issues/185 +# +# TODO: +# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a +# way to produce a continuous stream of audio chunks. +# + +url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8" +step_ms=10000 +model="base.en" + +if [ -z "$1" ]; then + echo "Usage: $0 stream_url [step_ms] [model]" + echo "" + echo " Example:" + echo " $0 $url $step_ms $model" + echo "" + echo "No url specified, using default: $url" +else + url="$1" +fi + +if [ -n "$2" ]; then + step_ms="$2" +fi + +if [ -n "$3" ]; then + model="$3" +fi + +# Whisper models +models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +running=1 + +trap "running=0" SIGINT SIGTERM + +printf "[+] Transcribing stream with model '$model', step_ms $step_ms (press Ctrl+C to stop):\n\n" + +while [ $running -eq 1 ]; do + ffmpeg -y -re -probesize 32 -i $url -ar 16000 -ac 1 -c:a pcm_s16le -t ${step_ms}ms /tmp/whisper-live0.wav > /dev/null 2> /tmp/whisper-live.err + if [ $? 
-ne 0 ]; then + printf "Error: ffmpeg failed to capture audio stream\n" + exit 1 + fi + mv /tmp/whisper-live0.wav /tmp/whisper-live.wav + ./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1 & +done diff --git a/examples/stream.wasm/emscripten.cpp b/examples/stream.wasm/emscripten.cpp index f8e3e27..b75eee3 100644 --- a/examples/stream.wasm/emscripten.cpp +++ b/examples/stream.wasm/emscripten.cpp @@ -51,7 +51,7 @@ void stream_main(size_t index) { wparams.language = "en"; - printf("stream: using %d threads\n", N_THREAD); + printf("stream: using %d threads\n", wparams.n_threads); std::vector<float> pcmf32; diff --git a/examples/talk.wasm/emscripten.cpp b/examples/talk.wasm/emscripten.cpp index 501c459..c82f469 100644 --- a/examples/talk.wasm/emscripten.cpp +++ b/examples/talk.wasm/emscripten.cpp @@ -68,7 +68,7 @@ void talk_main(size_t index) { g_gpt2 = gpt2_init("gpt-2.bin"); - printf("talk: using %d threads\n", N_THREAD); + printf("talk: using %d threads\n", wparams.n_threads); std::vector<float> pcmf32; From c536ff40051502d5692cf3467d40add40a2ca45f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 10:22:42 +0200 Subject: [PATCH 02/18] minor : add comment for using "generate_karaoke.sh" --- examples/generate-karaoke.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/generate-karaoke.sh b/examples/generate-karaoke.sh index c5e7c44..7062c9a 100755 --- a/examples/generate-karaoke.sh +++ b/examples/generate-karaoke.sh @@ -1,5 +1,16 @@ #!/bin/bash +# Simple tool to record audio from the microphone and generate a karaoke video +# Usage: +# +# cd whisper.cpp +# make +# +# ./examples/generate-karaoke.sh [model] [step_ms] +# +# Press Ctrl+C to stop recording +# + executable="./main" model="base.en" model_path="models/ggml-$model.bin" From 68ecadbbc9ff7a969545983f8fa3ca91d4f6e614 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 11:40:06 +0200 Subject: [PATCH 03/18] command.wasm : add voice assistant example for the Web (#171) Same as the command-line tool "command", but runs in the browser Also, added helper script "extra/deploy-wasm.sh" and fixed some timing constants for the WASM examples.
--- README.md | 2 +- examples/CMakeLists.txt | 1 + examples/command.wasm/CMakeLists.txt | 47 +++ examples/command.wasm/README.md | 23 ++ examples/command.wasm/emscripten.cpp | 408 ++++++++++++++++++++++++++ examples/command.wasm/index-tmpl.html | 386 ++++++++++++++++++++++++ examples/command/README.md | 2 + examples/command/command.cpp | 2 +- examples/stream.wasm/index-tmpl.html | 19 +- examples/talk.wasm/index-tmpl.html | 19 +- examples/whisper.wasm/index-tmpl.html | 28 +- extra/deploy-wasm.sh | 30 ++ 12 files changed, 939 insertions(+), 28 deletions(-) create mode 100644 examples/command.wasm/CMakeLists.txt create mode 100644 examples/command.wasm/README.md create mode 100644 examples/command.wasm/emscripten.cpp create mode 100644 examples/command.wasm/index-tmpl.html create mode 100755 extra/deploy-wasm.sh diff --git a/README.md b/README.md index 5c22979..ab0d882 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ As an example, here is a video of running the model on an iPhone 13 device - ful https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4 -You can also easily make your own offline voice assistant application: +You can also easily make your own offline voice assistant application: [command](examples/command) https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e798d1f..b03694e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,6 +21,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) add_subdirectory(whisper.wasm) add_subdirectory(stream.wasm) + add_subdirectory(command.wasm) add_subdirectory(talk.wasm) else() add_subdirectory(main) diff --git a/examples/command.wasm/CMakeLists.txt b/examples/command.wasm/CMakeLists.txt new file mode 100644 index 0000000..27fd0ab --- /dev/null +++ b/examples/command.wasm/CMakeLists.txt @@ -0,0 +1,47 @@ +# +# libcommand +# + +set(TARGET libcommand) + +add_executable(${TARGET} + emscripten.cpp + ) + +target_link_libraries(${TARGET} PRIVATE + whisper + ) + +unset(EXTRA_FLAGS) + +if (WHISPER_WASM_SINGLE_FILE) + set(EXTRA_FLAGS "-s SINGLE_FILE=1") + message(STATUS "Embedding WASM inside command.js") + + add_custom_command( + TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_BINARY_DIR}/bin/libcommand.js + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js + ) +endif() + +set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ + --bind \ + -s USE_PTHREADS=1 \ + -s PTHREAD_POOL_SIZE=8 \ + -s INITIAL_MEMORY=1024MB \ + -s TOTAL_MEMORY=1024MB \ + -s FORCE_FILESYSTEM=1 \ + -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ + ${EXTRA_FLAGS} \ + ") + +# +# command.wasm +# + +set(TARGET command.wasm) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY) diff --git a/examples/command.wasm/README.md b/examples/command.wasm/README.md new file mode 100644 index 0000000..a6e0cf1 --- /dev/null +++ b/examples/command.wasm/README.md @@ -0,0 +1,23 @@ +# command.wasm + +This is a basic Voice Assistant example that accepts voice commands from the microphone. +It runs in fully in the browser via WebAseembly. 
+ +Online demo: https://whisper.ggerganov.com/command/ + +Terminal version: https://github.com/ggerganov/whisper.cpp/examples/command + +## Build instructions + +```bash +# build using Emscripten (v3.1.2) +git clone https://github.com/ggerganov/whisper.cpp +cd whisper.cpp +mkdir build-em && cd build-em +emcmake cmake .. +make -j + +# copy the produced page to your HTTP path +cp bin/command.wasm/* /path/to/html/ +cp bin/libcommand.worker.js /path/to/html/ +``` diff --git a/examples/command.wasm/emscripten.cpp b/examples/command.wasm/emscripten.cpp new file mode 100644 index 0000000..d4bbb21 --- /dev/null +++ b/examples/command.wasm/emscripten.cpp @@ -0,0 +1,408 @@ +#include "ggml.h" +#include "whisper.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +constexpr int N_THREAD = 8; + +std::vector g_contexts(4, nullptr); + +std::mutex g_mutex; +std::thread g_worker; + +std::atomic g_running(false); + +std::string g_status = ""; +std::string g_status_forced = ""; +std::string g_transcribed = ""; + +std::vector g_pcmf32; + +static std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +static void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { + const float rc = 1.0f / (2.0f * M_PI * cutoff); + const float dt = 1.0f / sample_rate; + const float alpha = dt / (rc + dt); + + float y = data[0]; + + for (size_t i = 1; i < data.size(); i++) { + y = alpha * (y + data[i] - data[i - 1]); + data[i] = y; + } +} + +// compute similarity between two strings using Levenshtein distance +static float similarity(const std::string & s0, const std::string & s1) { + const size_t len0 = s0.size() + 1; + const size_t len1 = s1.size() + 1; + + std::vector col(len1, 0); + std::vector prevCol(len1, 0); + + for (size_t i = 0; i < len1; i++) { + prevCol[i] = i; + } + + for (size_t i = 0; i < len0; i++) { + col[0] = i; + for (size_t j = 1; j < len1; j++) { + col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 
0 : 1)); + } + col.swap(prevCol); + } + + const float dist = prevCol[len1 - 1]; + + return 1.0f - (dist / std::max(s0.size(), s1.size())); +} + +void command_set_status(const std::string & status) { + std::lock_guard lock(g_mutex); + g_status = status; +} + +bool command_vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { + const int n_samples = pcmf32.size(); + const int n_samples_last = (sample_rate * last_ms) / 1000; + + if (n_samples_last >= n_samples) { + // not enough samples - assume no speech + return false; + } + + if (freq_thold > 0.0f) { + high_pass_filter(pcmf32, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + float energy_last = 0.0f; + + for (size_t i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + + if (i >= n_samples - n_samples_last) { + energy_last += fabsf(pcmf32[i]); + } + } + + energy_all /= n_samples; + energy_last /= n_samples_last; + + if (verbose) { + fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); + } + + if (energy_last > vad_thold*energy_all) { + return false; + } + + return true; +} + +std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector & pcmf32, float & prob, int64_t & t_ms) { + const auto t_start = std::chrono::high_resolution_clock::now(); + + prob = 0.0f; + t_ms = 0; + + if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { + return ""; + } + + int prob_n = 0; + std::string result; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + + result += text; + + const int n_tokens = whisper_full_n_tokens(ctx, i); + for (int j = 0; j < n_tokens; ++j) { + const auto token = whisper_full_get_token_data(ctx, i, j); + + prob += token.p; + ++prob_n; + } + } + + if (prob_n > 0) { + prob /= prob_n; + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + t_ms = std::chrono::duration_cast(t_end - t_start).count(); + + return result; +} + +void command_get_audio(int ms, int sample_rate, std::vector & audio) { + const int64_t n_samples = (ms * sample_rate) / 1000; + + int64_t n_take = 0; + if (g_pcmf32.size() < n_samples) { + n_take = g_pcmf32.size(); + } else { + n_take = n_samples; + } + + audio.resize(n_take); + std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin()); +} + +void command_main(size_t index) { + command_set_status("loading data ..."); + + struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); + + wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); + wparams.offset_ms = 0; + wparams.translate = false; + wparams.no_context = true; + wparams.single_segment = true; + wparams.print_realtime = false; + wparams.print_progress = false; + wparams.print_timestamps = true; + wparams.print_special = false; + + wparams.max_tokens = 32; + wparams.audio_ctx = 768; // partial encoder context for better performance + + wparams.language = "en"; + + printf("command: using %d threads\n", wparams.n_threads); + + bool is_running = true; + bool have_prompt = false; + bool ask_prompt = true; + bool print_energy = false; + + float prob0 = 0.0f; + float prob = 0.0f; + + std::vector pcmf32_cur; + std::vector pcmf32_prompt; + + const std::string k_prompt = "Ok Whisper, start listening for commands."; + + // 
whisper context + auto & ctx = g_contexts[index]; + + const int32_t vad_ms = 2000; + const int32_t prompt_ms = 5000; + const int32_t command_ms = 4000; + + const float vad_thold = 0.1f; + const float freq_thold = -1.0f; + + while (g_running) { + // delay + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + if (ask_prompt) { + fprintf(stdout, "\n"); + fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m"); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str()); + command_set_status(txt); + } + + ask_prompt = false; + } + + int64_t t_ms = 0; + + { + command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) { + fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); + command_set_status("Speech detected! Processing ..."); + + if (!have_prompt) { + command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms)); + + fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms); + + const float sim = similarity(txt, k_prompt); + + if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) { + fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__); + ask_prompt = true; + } else { + fprintf(stdout, "\n"); + fprintf(stdout, "%s: The prompt has been recognized!\n", __func__); + fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Success! 
Waiting for voice commands ..."); + command_set_status(txt); + } + + // save the audio for the prompt + pcmf32_prompt = pcmf32_cur; + have_prompt = true; + } + } else { + command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + // prepend the prompt audio + pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end()); + + const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms)); + + prob = 100.0f*(prob - prob0); + + fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str()); + + // find the prompt in the text + float best_sim = 0.0f; + size_t best_len = 0; + for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) { + const auto prompt = txt.substr(0, n); + + const float sim = similarity(prompt, k_prompt); + + //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim); + + if (sim > best_sim) { + best_sim = sim; + best_len = n; + } + } + + const std::string command = ::trim(txt.substr(best_len)); + + fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms); + command_set_status(txt); + } + { + std::lock_guard lock(g_mutex); + g_transcribed = command; + } + } + + g_pcmf32.clear(); + } + } + } + + if (index < g_contexts.size()) { + whisper_free(g_contexts[index]); + g_contexts[index] = nullptr; + } +} + +EMSCRIPTEN_BINDINGS(command) { + emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { + for (size_t i = 0; i < g_contexts.size(); ++i) { + if (g_contexts[i] == nullptr) { + g_contexts[i] = whisper_init(path_model.c_str()); + if (g_contexts[i] != nullptr) { + g_running = true; + if (g_worker.joinable()) { + g_worker.join(); + } + g_worker = std::thread([i]() { + command_main(i); + }); + + return i + 1; + } else { + return (size_t) 0; + } + } + } + + return (size_t) 0; + })); + + emscripten::function("free", emscripten::optional_override([](size_t index) { + if (g_running) { + g_running = false; + } + })); + + emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { + --index; + + if (index >= g_contexts.size()) { + return -1; + } + + if (g_contexts[index] == nullptr) { + return -2; + } + + { + std::lock_guard lock(g_mutex); + const int n = audio["length"].as(); + + emscripten::val heap = emscripten::val::module_property("HEAPU8"); + emscripten::val memory = heap["buffer"]; + + g_pcmf32.resize(n); + + emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast(g_pcmf32.data()), n); + memoryView.call("set", audio); + } + + return 0; + })); + + emscripten::function("get_transcribed", emscripten::optional_override([]() { + std::string transcribed; + + { + std::lock_guard lock(g_mutex); + transcribed = std::move(g_transcribed); + } + + return transcribed; + })); + + emscripten::function("get_status", emscripten::optional_override([]() { + std::string status; + + { + std::lock_guard lock(g_mutex); + status = g_status_forced.empty() ? 
g_status : g_status_forced; + } + + return status; + })); + + emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { + { + std::lock_guard lock(g_mutex); + g_status_forced = status; + } + })); +} diff --git a/examples/command.wasm/index-tmpl.html b/examples/command.wasm/index-tmpl.html new file mode 100644 index 0000000..08670a1 --- /dev/null +++ b/examples/command.wasm/index-tmpl.html @@ -0,0 +1,386 @@ + + + + command : Voice assistant example using Whisper + WebAssembly + + + + +
[examples/command.wasm/index-tmpl.html: page markup and inline script omitted; the visible page text is:]

command : Voice assistant example using Whisper + WebAssembly

You can find more about this project on GitHub.

Select the model you would like to use, click the "Start" button and follow the instructions.

Whisper model: (model selection buttons)

Status: not started

[The recognized voice commands will be displayed here]

Debug output:

Troubleshooting

The page does some heavy computations, so make sure:
• To use a modern web browser (e.g. Chrome, Firefox)
• To use a fast desktop or laptop computer (i.e. not a mobile phone)
• Your browser supports WASM Fixed-width SIMD

| Build time: @GIT_DATE@ | Commit hash: @GIT_SHA1@ | Commit subject: @GIT_COMMIT_SUBJECT@ | Source Code |
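Since the template's script is not reproduced above, here is a minimal sketch of how a page can drive the bindings exported by `emscripten.cpp` (`init`, `set_audio`, `get_transcribed`, `get_status`). This is an illustration, not the actual template code: the element id, the polling interval, and the microphone plumbing that produces 16 kHz mono `Float32Array` samples are assumptions.

```javascript
// Hypothetical usage sketch of the libcommand module (command.js built above).
// Assumes the Emscripten runtime has finished loading and that onAudio() is
// fed 16 kHz mono Float32Array samples from the Web Audio API (not shown).

let instance = 0;

function startCommand(modelPath) {
    // init() returns a 1-based instance index, or 0 on failure
    instance = Module.init(modelPath);
    if (!instance) {
        console.error('command: failed to initialize whisper context');
        return;
    }

    // poll the background worker for status updates and recognized commands
    setInterval(function () {
        const status      = Module.get_status();
        const transcribed = Module.get_transcribed(); // cleared on each call

        if (transcribed && transcribed.length > 0) {
            console.log('command: recognized "' + transcribed + '"');
        }

        // 'state-status' is a hypothetical element id, not one from the template
        document.getElementById('state-status').innerHTML = status;
    }, 100);
}

// feed fresh microphone audio to the module
function onAudio(samples) {
    if (instance) {
        Module.set_audio(instance, samples);
    }
}
```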
diff --git a/examples/command/README.md b/examples/command/README.md index 3ef7368..de8b61c 100644 --- a/examples/command/README.md +++ b/examples/command/README.md @@ -13,6 +13,8 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/ https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4 +Web version: https://github.com/ggerganov/whisper.cpp/examples/command.wasm + ## Building The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this: diff --git a/examples/command/command.cpp b/examples/command/command.cpp index 2e47be0..9cc6dce 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -535,7 +535,7 @@ int main(int argc, char ** argv) { bool is_running = true; bool have_prompt = false; - bool ask_prompt = true; + bool ask_prompt = true; float prob0 = 0.0f; float prob = 0.0f; diff --git a/examples/stream.wasm/index-tmpl.html b/examples/stream.wasm/index-tmpl.html index cd72b6f..2033d96 100644 --- a/examples/stream.wasm/index-tmpl.html +++ b/examples/stream.wasm/index-tmpl.html @@ -100,12 +100,6 @@