diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 7d7e534..e798d1f 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
     add_subdirectory(whisper.wasm)
+    add_subdirectory(stream.wasm)
     add_subdirectory(talk.wasm)
 else()
     add_subdirectory(main)
diff --git a/examples/helpers.js b/examples/helpers.js
index a16a5e3..071b747 100644
--- a/examples/helpers.js
+++ b/examples/helpers.js
@@ -19,6 +19,12 @@ var printTextarea = (function() {
     };
 })();
 
+async function clearCache() {
+    if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
+        indexedDB.deleteDatabase(dbName);
+    }
+}
+
 // fetch a remote file from remote URL using the Fetch API
 async function fetchRemote(url, cbProgress, cbPrint) {
     cbPrint('fetchRemote: downloading with fetch()...');
diff --git a/examples/stream.wasm/CMakeLists.txt b/examples/stream.wasm/CMakeLists.txt
new file mode 100644
index 0000000..a69b4cb
--- /dev/null
+++ b/examples/stream.wasm/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# libstream
+#
+
+set(TARGET libstream)
+
+add_executable(${TARGET}
+    emscripten.cpp
+    )
+
+target_link_libraries(${TARGET} PRIVATE
+    whisper
+    )
+
+unset(EXTRA_FLAGS)
+
+if (WHISPER_WASM_SINGLE_FILE)
+    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
+    message(STATUS "Embedding WASM inside stream.js")
+
+    add_custom_command(
+        TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+        ${CMAKE_BINARY_DIR}/bin/libstream.js
+        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
+        )
+endif()
+
+set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
+    --bind \
+    -s USE_PTHREADS=1 \
+    -s PTHREAD_POOL_SIZE=8 \
+    -s INITIAL_MEMORY=1024MB \
+    -s TOTAL_MEMORY=1024MB \
+    -s FORCE_FILESYSTEM=1 \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    ${EXTRA_FLAGS} \
+    ")
+
+#
+# stream.wasm
+#
+
+set(TARGET stream.wasm)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js   ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
diff --git a/examples/stream.wasm/README.md b/examples/stream.wasm/README.md
new file mode 100644
index 0000000..593ae37
--- /dev/null
+++ b/examples/stream.wasm/README.md
@@ -0,0 +1,20 @@
+# stream.wasm
+
+Real-time transcription in the browser using WebAssembly
+
+Online demo: https://whisper.ggerganov.com/stream/
+
+## Build instructions
+
+```bash
+# build using Emscripten (v3.1.2)
+git clone https://github.com/ggerganov/whisper.cpp
+cd whisper.cpp
+mkdir build-em && cd build-em
+emcmake cmake ..
+make -j
+
+# copy the produced page to your HTTP path
+cp bin/stream.wasm/*       /path/to/html/
+cp bin/libstream.worker.js /path/to/html/
+```
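Note that the build links with `-s USE_PTHREADS=1`, and browsers only expose the `SharedArrayBuffer` that pthreads builds rely on to cross-origin-isolated pages. If the copied files are not served with the COOP/COEP headers, the page may fail to start its thread pool. A minimal sketch of a static server that sets the two headers (assuming Node.js is available; the script name, port, and MIME table are illustrative, not part of this patch):

```js
// serve.js - hypothetical helper, not part of whisper.cpp
// usage: node serve.js /path/to/html
const http = require('http');
const fs   = require('fs');
const path = require('path');

const root = process.argv[2] || '.';
const mime = { '.html': 'text/html', '.js': 'text/javascript', '.wasm': 'application/wasm' };

http.createServer((req, res) => {
    const file = path.join(root, req.url === '/' ? 'index.html' : req.url.split('?')[0]);
    fs.readFile(file, (err, data) => {
        if (err) { res.writeHead(404); res.end('not found'); return; }
        res.writeHead(200, {
            'Content-Type': mime[path.extname(file)] || 'application/octet-stream',
            // cross-origin isolation, required for SharedArrayBuffer:
            'Cross-Origin-Opener-Policy': 'same-origin',
            'Cross-Origin-Embedder-Policy': 'require-corp',
        });
        res.end(data);
    });
}).listen(8000, () => console.log('serving ' + root + ' at http://localhost:8000'));
```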
diff --git a/examples/stream.wasm/emscripten.cpp b/examples/stream.wasm/emscripten.cpp
new file mode 100644
index 0000000..f8e3e27
--- /dev/null
+++ b/examples/stream.wasm/emscripten.cpp
@@ -0,0 +1,213 @@
+#include "ggml.h"
+#include "whisper.h"
+
+#include <emscripten.h>
+#include <emscripten/bind.h>
+
+#include <atomic>
+#include <cmath>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+constexpr int N_THREAD = 8;
+
+std::vector<struct whisper_context *> g_contexts(4, nullptr);
+
+std::mutex g_mutex;
+std::thread g_worker;
+
+std::atomic<bool> g_running(false);
+
+std::string g_status        = "";
+std::string g_status_forced = "";
+std::string g_transcribed   = "";
+
+std::vector<float> g_pcmf32;
+
+void stream_set_status(const std::string & status) {
+    std::lock_guard<std::mutex> lock(g_mutex);
+    g_status = status;
+}
+
+void stream_main(size_t index) {
+    stream_set_status("loading data ...");
+
+    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
+
+    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
+    wparams.offset_ms        = 0;
+    wparams.translate        = false;
+    wparams.no_context       = true;
+    wparams.single_segment   = true;
+    wparams.print_realtime   = false;
+    wparams.print_progress   = false;
+    wparams.print_timestamps = true;
+    wparams.print_special    = false;
+
+    wparams.max_tokens       = 32;
+    wparams.audio_ctx        = 768; // partial encoder context for better performance
+
+    wparams.language         = "en";
+
+    printf("stream: using %d threads\n", N_THREAD);
+
+    std::vector<float> pcmf32;
+
+    // whisper context
+    auto & ctx = g_contexts[index];
+
+    // 5 seconds interval
+    const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
+
+    while (g_running) {
+        stream_set_status("waiting for audio ...");
+
+        {
+            std::unique_lock<std::mutex> lock(g_mutex);
+
+            if (g_pcmf32.size() < 1024) {
+                lock.unlock();
+
+                std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+                continue;
+            }
+
+            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
+            g_pcmf32.clear();
+        }
+
+        {
+            const auto t_start = std::chrono::high_resolution_clock::now();
+
+            stream_set_status("running whisper ...");
+
+            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
+            if (ret != 0) {
+                printf("whisper_full() failed: %d\n", ret);
+                break;
+            }
+
+            const auto t_end = std::chrono::high_resolution_clock::now();
+
+            printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
+        }
+
+        {
+            std::string text_heard;
+
+            {
+                const int n_segments = whisper_full_n_segments(ctx);
+                for (int i = n_segments - 1; i < n_segments; ++i) {
+                    const char * text = whisper_full_get_segment_text(ctx, i);
+
+                    const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+                    printf("transcribed: %s\n", text);
+
+                    text_heard += text;
+                }
+            }
+
+            {
+                std::lock_guard<std::mutex> lock(g_mutex);
+                g_transcribed = text_heard;
+            }
+        }
+    }
+
+    if (index < g_contexts.size()) {
+        whisper_free(g_contexts[index]);
+        g_contexts[index] = nullptr;
+    }
+}
+
+EMSCRIPTEN_BINDINGS(stream) {
+    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
+        for (size_t i = 0; i < g_contexts.size(); ++i) {
+            if (g_contexts[i] == nullptr) {
+                g_contexts[i] = whisper_init(path_model.c_str());
+                if (g_contexts[i] != nullptr) {
+                    g_running = true;
+                    if (g_worker.joinable()) {
+                        g_worker.join();
+                    }
+                    g_worker = std::thread([i]() {
+                        stream_main(i);
+                    });
+
+                    return i + 1;
+                } else {
+                    return (size_t) 0;
+                }
+            }
+        }
+
+        return (size_t) 0;
+    }));
+
+    emscripten::function("free", emscripten::optional_override([](size_t index) {
+        if (g_running) {
+            g_running = false;
+        }
+    }));
+
+    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
+        --index;
+
+        if (index >= g_contexts.size()) {
+            return -1;
+        }
+
+        if (g_contexts[index] == nullptr) {
+            return -2;
+        }
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            const int n = audio["length"].as<int>();
+
+            emscripten::val heap = emscripten::val::module_property("HEAPU8");
+            emscripten::val memory = heap["buffer"];
+
+            g_pcmf32.resize(n);
+
+            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
+            memoryView.call<void>("set", audio);
+        }
+
+        return 0;
+    }));
+
+    emscripten::function("get_transcribed", emscripten::optional_override([]() {
+        std::string transcribed;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            transcribed = std::move(g_transcribed);
+        }
+
+        return transcribed;
+    }));
+
+    emscripten::function("get_status", emscripten::optional_override([]() {
+        std::string status;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            status = g_status_forced.empty() ? g_status : g_status_forced;
+        }
+
+        return status;
+    }));
+
+    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            g_status_forced = status;
+        }
+    }));
+}
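For orientation, here is a sketch of how a page can drive the bindings exported above. The function names `init`, `set_audio`, `get_transcribed` and `get_status` come from the `EMSCRIPTEN_BINDINGS` block; the model filename, element id, and polling interval are illustrative assumptions, not part of this patch:

```js
// assumes stream.js is loaded and the model file has been placed in the
// Emscripten filesystem (e.g. via FS_createDataFile after fetchRemote)
var instance = Module.init('whisper.bin'); // returns index + 1, or 0 on failure

function onAudio(pcmf32) {
    // pcmf32: 16 kHz mono Float32Array; set_audio copies it into the WASM heap
    if (instance) {
        Module.set_audio(instance, pcmf32);
    }
}

// poll the worker thread for status and results
setInterval(function () {
    document.getElementById('state-status').innerHTML = Module.get_status(); // hypothetical element id

    var transcribed = Module.get_transcribed();
    if (transcribed != null && transcribed.length > 1) {
        printTextarea(transcribed); // printTextarea is defined in helpers.js
    }
}, 100);
```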
diff --git a/examples/stream.wasm/index-tmpl.html b/examples/stream.wasm/index-tmpl.html
new file mode 100644
index 0000000..cd72b6f
--- /dev/null
+++ b/examples/stream.wasm/index-tmpl.html
@@ -0,0 +1,385 @@
+[385-line HTML template; the markup and inline scripts were lost in this copy - only the visible page text survives:]
+
+stream : Real-time Whisper transcription in WebAssembly
+
+You can find more about this project on GitHub.
+
+Select the model you would like to use, click the "Start" button and start speaking
+
+Whisper model: [model selection controls]
+
+Status: not started
+
+[The transcribed text will be displayed here]
+
+Debug output:
+
+Troubleshooting
+
+The page does some heavy computations, so make sure:
+
+  - To use a modern web browser (e.g. Chrome, Firefox)
+  - To use a fast desktop or laptop computer (i.e. not a mobile phone)
+  - Your browser supports WASM Fixed-width SIMD
+
+Build time: @GIT_DATE@ | Commit hash: @GIT_SHA1@ | Commit subject: @GIT_COMMIT_SUBJECT@ | Source Code
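Since the template's inline scripts did not survive in this copy, it is worth noting what they must do: feed `set_audio` with 16 kHz mono `Float32Array` chunks (matching `WHISPER_SAMPLE_RATE`). A hedged sketch of such a capture path using the Web Audio API (the real page may buffer and resample differently; `onAudio` is the helper from the sketch above):

```js
const kSampleRate = 16000; // must match WHISPER_SAMPLE_RATE

navigator.mediaDevices.getUserMedia({ audio: true, video: false })
    .then(function (stream) {
        const context = new AudioContext({ sampleRate: kSampleRate });
        const source  = context.createMediaStreamSource(stream);

        // ScriptProcessorNode is deprecated but widely supported
        const processor = context.createScriptProcessor(4096, 1, 1);
        processor.onaudioprocess = function (e) {
            // mono samples in [-1, 1], handed to the worker via set_audio
            onAudio(new Float32Array(e.inputBuffer.getChannelData(0)));
        };

        source.connect(processor);
        processor.connect(context.destination);
    })
    .catch(function (err) {
        printTextarea('error: failed to open microphone: ' + err);
    });
```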
diff --git a/examples/stream/README.md b/examples/stream/README.md
index 8b4c8a6..1ca7ee2 100644
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@@ -21,3 +21,7 @@ brew install sdl2
 
 make stream
 ```
+
+## Web version
+
+This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
diff --git a/examples/talk.wasm/emscripten.cpp b/examples/talk.wasm/emscripten.cpp
index 7cc2b0e..501c459 100644
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@@ -61,10 +61,10 @@ void talk_main(size_t index) {
     wparams.print_timestamps = true;
     wparams.print_special    = false;
 
-    wparams.max_tokens = 32;
-    wparams.audio_ctx  = 768; // partial encoder context for better performance
+    wparams.max_tokens       = 32;
+    wparams.audio_ctx        = 768; // partial encoder context for better performance
 
-    wparams.language = "en";
+    wparams.language         = "en";
 
     g_gpt2 = gpt2_init("gpt-2.bin");
 
diff --git a/examples/talk.wasm/index-tmpl.html b/examples/talk.wasm/index-tmpl.html
index 8588e77..9b950f1 100644
--- a/examples/talk.wasm/index-tmpl.html
+++ b/examples/talk.wasm/index-tmpl.html
@@ -504,12 +504,6 @@
             Module.force_speak(instance);
         }
 
-        async function clearCache() {
-            if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
-                indexedDB.deleteDatabase(dbName);
-            }
-        }
-
         //
         // main
         //
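One detail about the `clearCache()` move into the shared `examples/helpers.js`: the function references a global `dbName` that `helpers.js` itself never defines, so each page that includes the helper must define that global before the button is used, e.g.:

```js
// defined by each page before clearCache() can run (value illustrative):
var dbName = 'whisper.ggerganov.com';
```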