commit
e4bd5d8ccc
@ -0,0 +1,47 @@
|
|||||||
|
#
|
||||||
|
# libcommand
|
||||||
|
#
|
||||||
|
|
||||||
|
# target name for the Emscripten build of the voice-command example
set(TARGET libcommand)

add_executable(${TARGET}
    emscripten.cpp
    )

target_link_libraries(${TARGET} PRIVATE
    whisper
    )

unset(EXTRA_FLAGS)

if (WHISPER_WASM_SINGLE_FILE)
    # embed the .wasm binary directly inside the generated .js file
    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
    message(STATUS "Embedding WASM inside command.js")

    # copy the single-file build output into the command.wasm page directory
    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libcommand.js
        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
        )
endif()

# Emscripten link flags: embind bindings, a pthread pool sized for whisper's
# worker threads, a fixed 1 GB heap, and filesystem support so the model
# file can be loaded from the virtual FS
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s INITIAL_MEMORY=1024MB \
    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")

#
# command.wasm
#

set(TARGET command.wasm)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js   ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
|
@ -0,0 +1,23 @@
|
|||||||
|
# command.wasm
|
||||||
|
|
||||||
|
This is a basic Voice Assistant example that accepts voice commands from the microphone.
|
||||||
|
It runs fully in the browser via WebAssembly.
|
||||||
|
|
||||||
|
Online demo: https://whisper.ggerganov.com/command/
|
||||||
|
|
||||||
|
Terminal version: [examples/command](/examples/command)
|
||||||
|
|
||||||
|
## Build instructions
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# build using Emscripten (v3.1.2)
|
||||||
|
git clone https://github.com/ggerganov/whisper.cpp
|
||||||
|
cd whisper.cpp
|
||||||
|
mkdir build-em && cd build-em
|
||||||
|
emcmake cmake ..
|
||||||
|
make -j
|
||||||
|
|
||||||
|
# copy the produced page to your HTTP path
|
||||||
|
cp bin/command.wasm/* /path/to/html/
|
||||||
|
cp bin/libcommand.worker.js /path/to/html/
|
||||||
|
```
|
@ -0,0 +1,408 @@
|
|||||||
|
#include "ggml.h"
|
||||||
|
#include "whisper.h"
|
||||||
|
|
||||||
|
#include <emscripten.h>
|
||||||
|
#include <emscripten/bind.h>
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <cmath>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
#include <regex>
|
||||||
|
|
||||||
|
// maximum number of threads used for whisper inference
constexpr int N_THREAD = 8;

// pool of whisper contexts; the "instance" id handed to JS is an index here + 1
std::vector<struct whisper_context *> g_contexts(4, nullptr);

// g_mutex guards the status/transcription strings and writes to g_pcmf32
// (see the "set_audio" binding)
std::mutex  g_mutex;
std::thread g_worker;

// set by the "init" binding, cleared by "free"; polled by the worker loop
std::atomic<bool> g_running(false);

// g_status is set by the worker; g_status_forced (set from JS) overrides it
std::string g_status        = "";
std::string g_status_forced = "";
// last recognized voice command, consumed by the "get_transcribed" binding
std::string g_transcribed   = "";

// most recent microphone audio pushed from JS (16 kHz mono float samples)
std::vector<float> g_pcmf32;
|
||||||
|
|
||||||
|
// return a copy of s with leading and trailing whitespace removed
static std::string trim(const std::string & s) {
    static const char * kWhitespace = " \t\n\r\f\v";

    const auto first = s.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // string is empty or all whitespace
        return "";
    }

    const auto last = s.find_last_not_of(kWhitespace);
    return s.substr(first, last - first + 1);
}
|
||||||
|
|
||||||
|
// Simple one-pole high-pass filter applied to data in place.
// cutoff is in Hz, sample_rate in samples/second. data[0] is left unchanged.
// FIX: the original read data[0] on an empty vector (undefined behavior);
// an empty input is now a no-op.
static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        return; // nothing to filter - also avoids reading data[0] below
    }

    const float rc    = 1.0f / (2.0f * M_PI * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    // y holds the previous filtered sample
    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}
|
||||||
|
|
||||||
|
// compute similarity between two strings using Levenshtein distance
|
||||||
|
// compute similarity between two strings using Levenshtein distance
// returns a value in [0, 1]: 1 means identical, 0 means completely different
// FIXES: the original outer loop started at i = 0 and read s0[i - 1]
// (i.e. s0[-1], undefined behavior); it also divided by zero when both
// strings were empty.
static float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // two empty strings are trivially identical (also avoids 0/0 below)
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    // single-row dynamic programming: prevCol is row i-1, col is row i
    std::vector<int> col    (len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    // start at i = 1: the i = 0 row is exactly the initialization above
    for (size_t i = 1; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
|
||||||
|
|
||||||
|
// update the status string shown to the UI
// (guarded by g_mutex; read back by the "get_status" binding)
void command_set_status(const std::string & status) {
    std::lock_guard<std::mutex> lock(g_mutex);
    g_status = status;
}
|
||||||
|
|
||||||
|
bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
||||||
|
const int n_samples = pcmf32.size();
|
||||||
|
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
||||||
|
|
||||||
|
if (n_samples_last >= n_samples) {
|
||||||
|
// not enough samples - assume no speech
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (freq_thold > 0.0f) {
|
||||||
|
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
||||||
|
}
|
||||||
|
|
||||||
|
float energy_all = 0.0f;
|
||||||
|
float energy_last = 0.0f;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < n_samples; i++) {
|
||||||
|
energy_all += fabsf(pcmf32[i]);
|
||||||
|
|
||||||
|
if (i >= n_samples - n_samples_last) {
|
||||||
|
energy_last += fabsf(pcmf32[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
energy_all /= n_samples;
|
||||||
|
energy_last /= n_samples_last;
|
||||||
|
|
||||||
|
if (verbose) {
|
||||||
|
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (energy_last > vad_thold*energy_all) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run full whisper inference on pcmf32 and return the concatenated text of
// all segments. prob is set to the average token probability across segments
// (left 0 if there are no tokens); t_ms is set to the wall-clock inference
// time in milliseconds. Returns "" if whisper_full fails.
std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    prob = 0.0f;
    t_ms = 0;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }

    int prob_n = 0;
    std::string result;

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);

        result += text;

        // accumulate per-token probabilities to report an average confidence
        const int n_tokens = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_tokens; ++j) {
            const auto token = whisper_full_get_token_data(ctx, i, j);

            prob += token.p;
            ++prob_n;
        }
    }

    if (prob_n > 0) {
        prob /= prob_n;
    }

    const auto t_end = std::chrono::high_resolution_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

    return result;
}
|
||||||
|
|
||||||
|
void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
|
||||||
|
const int64_t n_samples = (ms * sample_rate) / 1000;
|
||||||
|
|
||||||
|
int64_t n_take = 0;
|
||||||
|
if (g_pcmf32.size() < n_samples) {
|
||||||
|
n_take = g_pcmf32.size();
|
||||||
|
} else {
|
||||||
|
n_take = n_samples;
|
||||||
|
}
|
||||||
|
|
||||||
|
audio.resize(n_take);
|
||||||
|
std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Worker-thread entry point (spawned by the "init" binding). Runs until
// g_running is cleared by the "free" binding. Flow:
//   1) repeatedly prompt the user to say the activation phrase k_prompt
//   2) once the phrase is recognized (fuzzy-matched via similarity()),
//      keep transcribing short audio windows and publish each recognized
//      command into g_transcribed for the "get_transcribed" binding
// index selects the whisper context in g_contexts; the context is freed
// here when the loop exits.
void command_main(size_t index) {
    command_set_status("loading data ...");

    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
    wparams.offset_ms        = 0;
    wparams.translate        = false;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.print_realtime   = false;
    wparams.print_progress   = false;
    wparams.print_timestamps = true;
    wparams.print_special    = false;

    wparams.max_tokens       = 32;
    wparams.audio_ctx        = 768; // partial encoder context for better performance

    wparams.language         = "en";

    printf("command: using %d threads\n", wparams.n_threads);

    bool is_running   = true;
    bool have_prompt  = false;
    bool ask_prompt   = true;
    bool print_energy = false;

    // prob0 is the confidence measured on the activation phrase; command
    // confidences are later reported relative to it
    float prob0 = 0.0f;
    float prob  = 0.0f;

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

    // activation phrase the user must say before commands are accepted
    const std::string k_prompt = "Ok Whisper, start listening for commands.";

    // whisper context
    auto & ctx = g_contexts[index];

    // audio window lengths (ms) for voice-activity detection, the activation
    // phrase, and individual commands
    const int32_t vad_ms     = 2000;
    const int32_t prompt_ms  = 5000;
    const int32_t command_ms = 4000;

    const float vad_thold  = 0.1f;
    const float freq_thold = -1.0f; // <= 0 disables the VAD high-pass filter

    while (g_running) {
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        if (ask_prompt) {
            fprintf(stdout, "\n");
            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
            fprintf(stdout, "\n");

            {
                char txt[1024];
                snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
                command_set_status(txt);
            }

            ask_prompt = false;
        }

        int64_t t_ms = 0;

        {
            command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

            if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                command_set_status("Speech detected! Processing ...");

                if (!have_prompt) {
                    // still waiting for the activation phrase: grab a longer window
                    command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

                    const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));

                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);

                    const float sim = similarity(txt, k_prompt);

                    // accept only if both length and fuzzy similarity are close
                    // to the expected phrase
                    if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
                        fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
                        ask_prompt = true;
                    } else {
                        fprintf(stdout, "\n");
                        fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
                        fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
                        fprintf(stdout, "\n");

                        {
                            char txt[1024];
                            snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
                            command_set_status(txt);
                        }

                        // save the audio for the prompt
                        pcmf32_prompt = pcmf32_cur;
                        have_prompt = true;
                    }
                } else {
                    command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

                    // prepend the prompt audio
                    pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());

                    const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));

                    prob = 100.0f*(prob - prob0);

                    fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());

                    // find the prompt in the text: try cut points around the
                    // prompt length and keep the one with highest similarity
                    float best_sim = 0.0f;
                    size_t best_len = 0;
                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
                        const auto prompt = txt.substr(0, n);

                        const float sim = similarity(prompt, k_prompt);

                        //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);

                        if (sim > best_sim) {
                            best_sim = sim;
                            best_len = n;
                        }
                    }

                    // everything after the best cut point is the actual command
                    const std::string command = ::trim(txt.substr(best_len));

                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
                    fprintf(stdout, "\n");

                    {
                        char txt[1024];
                        snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
                        command_set_status(txt);
                    }
                    {
                        // publish for the "get_transcribed" binding
                        std::lock_guard<std::mutex> lock(g_mutex);
                        g_transcribed = command;
                    }
                }

                // discard the processed audio so it is not transcribed again
                g_pcmf32.clear();
            }
        }
    }

    // worker owns the context: release it on shutdown
    if (index < g_contexts.size()) {
        whisper_free(g_contexts[index]);
        g_contexts[index] = nullptr;
    }
}
|
||||||
|
|
||||||
|
EMSCRIPTEN_BINDINGS(command) {
    // init(path_model): load the model into the first free context slot and
    // start the worker thread. Returns the 1-based instance id, or 0 on failure.
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    // join a previous worker (if any) before starting a new one
                    if (g_worker.joinable()) {
                        g_worker.join();
                    }
                    g_worker = std::thread([i]() {
                        command_main(i);
                    });

                    return i + 1;
                } else {
                    return (size_t) 0;
                }
            }
        }

        return (size_t) 0;
    }));

    // free(index): signal the worker loop to stop; the whisper context itself
    // is released at the end of command_main.
    // NOTE(review): the index argument is currently unused - confirm whether
    // per-instance shutdown was intended
    emscripten::function("free", emscripten::optional_override([](size_t index) {
        if (g_running) {
            g_running = false;
        }
    }));

    // set_audio(index, audio): replace the shared microphone buffer g_pcmf32
    // with the contents of the given JS Float32Array (index is 1-based).
    // Returns 0 on success, -1 for a bad index, -2 if the slot has no context.
    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
        --index;

        if (index >= g_contexts.size()) {
            return -1;
        }

        if (g_contexts[index] == nullptr) {
            return -2;
        }

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            const int n = audio["length"].as<int>();

            emscripten::val heap = emscripten::val::module_property("HEAPU8");
            emscripten::val memory = heap["buffer"];

            g_pcmf32.resize(n);

            // build a typed-array view over the WASM heap at g_pcmf32's
            // storage and bulk-copy the JS audio data into it
            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
            memoryView.call<void>("set", audio);
        }

        return 0;
    }));

    // get_transcribed(): fetch-and-clear the last recognized command
    emscripten::function("get_transcribed", emscripten::optional_override([]() {
        std::string transcribed;

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            transcribed = std::move(g_transcribed);
        }

        return transcribed;
    }));

    // get_status(): current status string; a forced status (see set_status)
    // takes precedence over the worker's own status
    emscripten::function("get_status", emscripten::optional_override([]() {
        std::string status;

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            status = g_status_forced.empty() ? g_status : g_status_forced;
        }

        return status;
    }));

    // set_status(status): override the status returned by get_status;
    // pass "" to clear the override
    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            g_status_forced = status;
        }
    }));
}
|
@ -0,0 +1,386 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en-us">
|
||||||
|
<head>
|
||||||
|
<title>command : Voice assistant example using Whisper + WebAssembly</title>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
#output {
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
margin: 0 auto;
|
||||||
|
margin-top: 10px;
|
||||||
|
border-left: 0px;
|
||||||
|
border-right: 0px;
|
||||||
|
padding-left: 0px;
|
||||||
|
padding-right: 0px;
|
||||||
|
display: block;
|
||||||
|
background-color: black;
|
||||||
|
color: white;
|
||||||
|
font-size: 10px;
|
||||||
|
font-family: 'Lucida Console', Monaco, monospace;
|
||||||
|
outline: none;
|
||||||
|
white-space: pre;
|
||||||
|
overflow-wrap: normal;
|
||||||
|
overflow-x: scroll;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="main-container">
|
||||||
|
<b>command : Voice assistant example using Whisper + WebAssembly</b>
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
|
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">GitHub</a>.
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
|
||||||
|
Select the model you would like to use, click the "Start" button and follow the instructions.
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
|
<div id="model-whisper">
|
||||||
|
Whisper model: <span id="model-whisper-status"></span>
|
||||||
|
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||||
|
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||||
|
<span id="fetch-whisper-progress"></span>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||||
|
-->
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<br>
|
||||||
|
|
||||||
|
<div id="input">
|
||||||
|
<button id="start" onclick="onStart()" disabled>Start</button>
|
||||||
|
<button id="stop" onclick="onStop()" disabled>Stop</button>
|
||||||
|
<button id="clear" onclick="clearCache()">Clear Cache</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<br>
|
||||||
|
|
||||||
|
<div id="state">
|
||||||
|
Status: <b><span id="state-status">not started</span></b>
|
||||||
|
|
||||||
|
<pre id="state-transcribed">[The recognized voice commands will be displayed here]</pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
|
||||||
|
Debug output:
|
||||||
|
<textarea id="output" rows="20"></textarea>
|
||||||
|
|
||||||
|
<br>
|
||||||
|
|
||||||
|
<b>Troubleshooting</b>
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
|
The page does some heavy computations, so make sure:
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
|
||||||
|
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
|
||||||
|
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<div class="cell-version">
|
||||||
|
<span>
|
||||||
|
|
|
||||||
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
|
||||||
|
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
|
||||||
|
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
|
||||||
|
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">Source Code</a> |
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script type="text/javascript" src="helpers.js"></script>
|
||||||
|
<script type='text/javascript'>
|
||||||
|
// web audio context
|
||||||
|
var context = null;
|
||||||
|
|
||||||
|
// audio data
|
||||||
|
var audio = null;
|
||||||
|
var audio0 = null;
|
||||||
|
|
||||||
|
// the command instance
|
||||||
|
var instance = null;
|
||||||
|
|
||||||
|
// model name
|
||||||
|
var model_whisper = null;
|
||||||
|
|
||||||
|
var Module = {
|
||||||
|
print: printTextarea,
|
||||||
|
printErr: printTextarea,
|
||||||
|
setStatus: function(text) {
|
||||||
|
printTextarea('js: ' + text);
|
||||||
|
},
|
||||||
|
monitorRunDependencies: function(left) {
|
||||||
|
},
|
||||||
|
preRun: function() {
|
||||||
|
printTextarea('js: Preparing ...');
|
||||||
|
},
|
||||||
|
postRun: function() {
|
||||||
|
printTextarea('js: Initialized successfully!');
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//
|
||||||
|
// fetch models
|
||||||
|
//
|
||||||
|
|
||||||
|
let dbVersion = 1
|
||||||
|
let dbName = 'whisper.ggerganov.com';
|
||||||
|
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
|
||||||
|
|
||||||
|
// Store a fetched model file into the Emscripten virtual filesystem so the
// C++ side can open it by name; then flip the UI into the "ready" state.
function storeFS(fname, buf) {
    // write to WASM file using FS_createDataFile
    // if the file exists, delete it
    try {
        Module.FS_unlink(fname);
    } catch (e) {
        // ignore
    }

    Module.FS_createDataFile("/", fname, buf, true, true);

    printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);

    document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';

    // enable Start only once a model has actually been selected
    if (model_whisper != null) {
        document.getElementById('start').disabled = false;
        document.getElementById('stop' ).disabled = true;
    }
}
|
||||||
|
|
||||||
|
// Download the selected ggml model (loadRemote may serve it from the
// IndexedDB cache) and hand the bytes to storeFS. Updates the fetch UI.
function loadWhisper(model) {
    let urls = {
        'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
        'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
    };

    // approximate download sizes in MB, shown by loadRemote
    let sizes = {
        'tiny.en': 75,
        'base.en': 142,
    };

    let url     = urls[model];
    let dst     = 'whisper.bin';
    let size_mb = sizes[model];

    model_whisper = model;

    // hide the selection buttons while the download is in flight
    document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
    document.getElementById('fetch-whisper-base-en').style.display = 'none';
    document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

    // progress callback: show percent next to the model buttons
    cbProgress = function(p) {
        let el = document.getElementById('fetch-whisper-progress');
        el.innerHTML = Math.round(100*p) + '%';
    };

    // cancel callback: restore the model-selection buttons
    cbCancel = function() {
        var el;
        el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
        el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
        el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
    };

    loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// microphone
|
||||||
|
//
|
||||||
|
|
||||||
|
const kSampleRate = 16000;
|
||||||
|
const kRestartRecording_s = 120;
|
||||||
|
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
|
||||||
|
|
||||||
|
var mediaRecorder = null;
|
||||||
|
var doRecording = false;
|
||||||
|
var startTime = 0;
|
||||||
|
|
||||||
|
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
||||||
|
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
|
||||||
|
|
||||||
|
// Stop capturing: flag the recorder loop to exit and drop the audio buffers.
// The C++ side is told via set_status that transcription is paused.
function stopRecording() {
    Module.set_status("paused");
    doRecording = false;
    audio0 = null;
    audio = null;
    context = null;
}
|
||||||
|
|
||||||
|
// Start microphone capture at kSampleRate and keep feeding decoded PCM to
// the C++ instance via Module.set_audio. Recording is restarted every
// kRestartRecording_s seconds to bound the re-decoded buffer size; the
// already-captured samples are carried over in audio0.
function startRecording() {
    if (!context) {
        context = new AudioContext({
            sampleRate: kSampleRate,
            channelCount: 1,
            echoCancellation: false,
            autoGainControl: true,
            noiseSuppression: true,
        });
    }

    Module.set_status("");

    document.getElementById('start').disabled = true;
    document.getElementById('stop').disabled = false;

    doRecording = true;
    startTime = Date.now();

    var chunks = [];
    var stream = null;

    navigator.mediaDevices.getUserMedia({audio: true, video: false})
        .then(function(s) {
            stream = s;
            mediaRecorder = new MediaRecorder(stream);
            // fires every kIntervalAudio_ms; re-decodes ALL chunks so far so
            // "audio" always holds the full take since the last restart
            mediaRecorder.ondataavailable = function(e) {
                chunks.push(e.data);

                var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
                var reader = new FileReader();

                reader.onload = function(event) {
                    var buf = new Uint8Array(reader.result);

                    if (!context) {
                        return; // recording was stopped meanwhile
                    }
                    context.decodeAudioData(buf.buffer, function(audioBuffer) {
                        // resample/render offline to get raw PCM out of the compressed blob
                        var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
                        var source = offlineContext.createBufferSource();
                        source.buffer = audioBuffer;
                        source.connect(offlineContext.destination);
                        source.start(0);

                        offlineContext.startRendering().then(function(renderedBuffer) {
                            audio = renderedBuffer.getChannelData(0);

                            //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));

                            // concatenate the carried-over audio (audio0) with the new take
                            var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
                            if (audio0 != null) {
                                audioAll.set(audio0, 0);
                            }
                            audioAll.set(audio, audio0 == null ? 0 : audio0.length);

                            if (instance) {
                                Module.set_audio(instance, audioAll);
                            }
                        });
                    }, function(e) {
                        audio = null;
                    });
                }

                reader.readAsArrayBuffer(blob);
            };

            // auto-restart after a deliberate stop (see the restart logic below)
            mediaRecorder.onstop = function(e) {
                if (doRecording) {
                    setTimeout(function() {
                        startRecording();
                    });
                }
            };

            mediaRecorder.start(kIntervalAudio_ms);
        })
        .catch(function(err) {
            printTextarea('js: error getting audio stream: ' + err);
        });

    // watchdog: tears the recorder down when doRecording is cleared, and
    // restarts it when the current take grows past kRestartRecording_s
    var interval = setInterval(function() {
        if (!doRecording) {
            clearInterval(interval);
            mediaRecorder.stop();
            stream.getTracks().forEach(function(track) {
                track.stop();
            });

            document.getElementById('start').disabled = false;
            document.getElementById('stop').disabled = true;

            mediaRecorder = null;
        }

        // if audio length is more than kRestartRecording_s seconds, restart recording
        if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
            if (doRecording) {
                //printTextarea('js: restarting recording');

                clearInterval(interval);
                audio0 = audio;
                audio = null;
                mediaRecorder.stop();
                stream.getTracks().forEach(function(track) {
                    track.stop();
                });
            }
        }
    }, 100);
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// main
|
||||||
|
//
|
||||||
|
|
||||||
|
var nLines = 0;
|
||||||
|
var intervalUpdate = null;
|
||||||
|
var transcribedAll = '';
|
||||||
|
|
||||||
|
// "Start" button handler: lazily create the C++ instance, begin microphone
// capture, and poll the worker every 100 ms for status + transcribed text.
function onStart() {
    if (!instance) {
        instance = Module.init('whisper.bin');

        if (instance) {
            printTextarea("js: whisper initialized, instance: " + instance);
        }
    }

    if (!instance) {
        printTextarea("js: failed to initialize whisper");
        return;
    }

    startRecording();

    intervalUpdate = setInterval(function() {
        var transcribed = Module.get_transcribed();

        if (transcribed != null && transcribed.length > 1) {
            transcribedAll += transcribed + '<br>';
            nLines++;

            // if more than 10 lines, remove the first line
            if (nLines > 10) {
                var i = transcribedAll.indexOf('<br>');
                if (i > 0) {
                    transcribedAll = transcribedAll.substring(i + 4);
                    nLines--;
                }
            }
        }

        document.getElementById('state-status').innerHTML = Module.get_status();
        document.getElementById('state-transcribed').innerHTML = transcribedAll;
    }, 100);
}
|
||||||
|
|
||||||
|
// "Stop" button handler
function onStop() {
    stopRecording();
}
|
||||||
|
|
||||||
|
</script>
|
||||||
|
<script type="text/javascript" src="command.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -0,0 +1,98 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -eo pipefail
|
||||||
|
# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
|
||||||
|
# Idea by @semiformal-net
|
||||||
|
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
|
||||||
|
#
|
||||||
|
# TODO:
|
||||||
|
# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
|
||||||
|
# way to produce a continuous stream of audio chunks.
|
||||||
|
#
|
||||||
|
|
||||||
|
# defaults; overridden by the positional arguments below
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
fmt=aac # the audio format extension of the stream (TODO: auto detect)
step_s=30
model="base.en"

# arg 1: stream url (optional - falls back to the BBC World Service default)
if [ -z "$1" ]; then
    echo "Usage: $0 stream_url [step_s] [model]"
    echo ""
    echo "  Example:"
    echo "    $0 $url $step_s $model"
    echo ""
    echo "No url specified, using default: $url"
else
    url="$1"
fi

# arg 2: chunk length in seconds
if [ -n "$2" ]; then
    step_s="$2"
fi

# arg 3: model name (validated against the list below)
if [ -n "$3" ]; then
    model="$3"
fi
|
||||||
|
|
||||||
|
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )

# list available models
# NOTE(review): the loop variable deliberately reuses the global "model";
# the only caller exits immediately afterwards, so the clobbered value is
# never read - confirm before reusing this helper elsewhere
function list_models {
    printf "\n"
    printf " Available models:"
    for model in "${models[@]}"; do
        printf " $model"
    done
    printf "\n\n"
}

# reject unknown model names early, before spawning ffmpeg
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: $model\n"
    list_models

    exit 1
fi
|
||||||
|
|
||||||
|
running=1
|
||||||
|
|
||||||
|
trap "running=0" SIGINT SIGTERM
|
||||||
|
|
||||||
|
printf "[+] Transcribing stream with model '$model', step_s $step_s (press Ctrl+C to stop):\n\n"
|
||||||
|
|
||||||
|
# continuous stream in native fmt (this file will grow forever!)
|
||||||
|
ffmpeg -loglevel quiet -y -re -probesize 32 -i $url -c copy /tmp/whisper-live0.${fmt} &
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
printf "Error: ffmpeg failed to capture audio stream\n"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf "Buffering audio. Please wait...\n\n"
|
||||||
|
sleep $(($step_s))
|
||||||
|
|
||||||
|
# do not stop script on error
|
||||||
|
set +e
|
||||||
|
|
||||||
|
i=0
|
||||||
|
SECONDS=0
|
||||||
|
while [ $running -eq 1 ]; do
|
||||||
|
# extract the next piece from the main file above and transcode to wav. -ss sets start time and nudges it by -0.5s to catch missing words (??)
|
||||||
|
err=1
|
||||||
|
while [ $err -ne 0 ]; do
|
||||||
|
if [ $i -gt 0 ]; then
|
||||||
|
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s-1)).5 -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
|
||||||
|
else
|
||||||
|
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s)) -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
|
||||||
|
fi
|
||||||
|
err=$(cat /tmp/whisper-live.err | wc -l)
|
||||||
|
done
|
||||||
|
|
||||||
|
./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
|
||||||
|
|
||||||
|
while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
((i=i+1))
|
||||||
|
done
|
||||||
|
|
||||||
|
killall -v ffmpeg
|
||||||
|
killall -v main
|
@ -0,0 +1,132 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Small shell script to more easily automatically download and transcribe live stream VODs.
|
||||||
|
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
|
||||||
|
# Use `./transcribe-vod help` to print help info.
|
||||||
|
|
||||||
|
# MIT License
|
||||||
|
|
||||||
|
# Copyright (c) 2022 Daniils Petrovs
|
||||||
|
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
# of this software and associated documentation files (the "Software"), to deal
|
||||||
|
# in the Software without restriction, including without limitation the rights
|
||||||
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
# copies of the Software, and to permit persons to whom the Software is
|
||||||
|
# furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
# The above copyright notice and this permission notice shall be included in all
|
||||||
|
# copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
# SOFTWARE.
|
||||||
|
|
||||||
|
set -Eeuo pipefail
|
||||||
|
|
||||||
|
# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
|
||||||
|
MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
|
||||||
|
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
|
||||||
|
WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
|
||||||
|
|
||||||
|
msg() {
|
||||||
|
echo >&2 -e "${1-}"
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
msg "Cleaning up..."
|
||||||
|
rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
|
||||||
|
}
|
||||||
|
|
||||||
|
print_help() {
|
||||||
|
echo "Usage: ./transcribe-vod <video_url>"
|
||||||
|
echo "See configurable env variables in the script"
|
||||||
|
echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
|
||||||
|
echo "Requirements: ffmpeg yt-dlp whisper"
|
||||||
|
echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
|
||||||
|
echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
|
||||||
|
}
|
||||||
|
|
||||||
|
check_requirements() {
|
||||||
|
if ! command -v ffmpeg &>/dev/null; then
|
||||||
|
echo "ffmpeg is required (https://ffmpeg.org)."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! command -v yt-dlp &>/dev/null; then
|
||||||
|
echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
|
||||||
|
WHISPER_EXECUTABLE="./main"
|
||||||
|
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
|
||||||
|
echo "Whisper is required (https://github.com/ggerganov/whisper.cpp)."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
if [[ $# -lt 1 ]]; then
|
||||||
|
print_help
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$1" == "help" ]]; then
|
||||||
|
print_help
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
temp_dir="tmp"
|
||||||
|
source_url="$1"
|
||||||
|
|
||||||
|
check_requirements
|
||||||
|
|
||||||
|
msg "Downloading VOD..."
|
||||||
|
|
||||||
|
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
|
||||||
|
yt-dlp \
|
||||||
|
-f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
|
||||||
|
--embed-thumbnail \
|
||||||
|
--embed-chapters \
|
||||||
|
--xattrs \
|
||||||
|
"${source_url}" -o "${temp_dir}/vod.mp4"
|
||||||
|
|
||||||
|
msg "Extracting audio and resampling..."
|
||||||
|
|
||||||
|
ffmpeg -i "${temp_dir}/vod.mp4" \
|
||||||
|
-hide_banner \
|
||||||
|
-loglevel error \
|
||||||
|
-ar 16000 \
|
||||||
|
-ac 1 \
|
||||||
|
-c:a \
|
||||||
|
pcm_s16le -y "vod-resampled.wav"
|
||||||
|
|
||||||
|
msg "Transcribing to subtitle file..."
|
||||||
|
msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
|
||||||
|
|
||||||
|
$WHISPER_EXECUTABLE \
|
||||||
|
-m "${MODEL_PATH}" \
|
||||||
|
-l "${WHISPER_LANG}" \
|
||||||
|
-f "vod-resampled.wav" \
|
||||||
|
-t 8 \
|
||||||
|
-osrt \
|
||||||
|
--translate
|
||||||
|
|
||||||
|
msg "Embedding subtitle track..."
|
||||||
|
|
||||||
|
ffmpeg -i "${temp_dir}/vod.mp4" \
|
||||||
|
-hide_banner \
|
||||||
|
-loglevel error \
|
||||||
|
-i "vod-resampled.wav.srt" \
|
||||||
|
-c copy \
|
||||||
|
-c:s mov_text \
|
||||||
|
-y res.mp4
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
|
||||||
|
msg "Done! Your finished file is ready: res.mp4"
|
@ -0,0 +1,30 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# This is a helper script to deploy all WebAssembly examples to my node
|
||||||
|
# Run from the build directory:
|
||||||
|
#
|
||||||
|
# cd build-em
|
||||||
|
# ../extra/deploy-wasm.sh
|
||||||
|
#
|
||||||
|
|
||||||
|
# check if emcmake is available
|
||||||
|
if ! command -v emcmake &> /dev/null
|
||||||
|
then
|
||||||
|
echo "Error: emscripten environment is not set up"
|
||||||
|
exit
|
||||||
|
fi
|
||||||
|
|
||||||
|
emcmake cmake .. && make -j
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Error: build failed"
|
||||||
|
exit
|
||||||
|
fi
|
||||||
|
|
||||||
|
# copy all wasm files to the node
|
||||||
|
scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/ && scp bin/libwhisper.worker.js root@linode0:/var/www/html/whisper/
|
||||||
|
scp bin/stream.wasm/* root@linode0:/var/www/html/whisper/stream/ && scp bin/libstream.worker.js root@linode0:/var/www/html/whisper/stream/
|
||||||
|
scp bin/command.wasm/* root@linode0:/var/www/html/whisper/command/ && scp bin/libcommand.worker.js root@linode0:/var/www/html/whisper/command/
|
||||||
|
scp bin/talk.wasm/* root@linode0:/var/www/html/whisper/talk/ && scp bin/libtalk.worker.js root@linode0:/var/www/html/whisper/talk/
|
||||||
|
|
||||||
|
echo "Done"
|
||||||
|
exit
|
@ -0,0 +1,3 @@
|
|||||||
|
*.wav
|
||||||
|
*.ogg
|
||||||
|
*.wav.txt
|
@ -0,0 +1 @@
|
|||||||
|
My fellow Americans, this day has brought terrible news and great sadness to our country. At 9 o'clock this morning, Mission Control in Houston lost contact with our space shuttle, Columbia. A short time later, debris was seen falling from the skies above Texas. The Colombians lost. There are no survivors. On board was a crew of seven. Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark, Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon, a colonel in the Israeli Air Force. These men and women assumed great risk in the service to all humanity. In an age when spaceflight has come to seem almost routine, it is easy to overlook the dangers of travel by rocket and the difficulties of navigating the fierce outer atmosphere of the Earth. These astronauts knew the dangers, and they faced them willingly, knowing they had a high and noble purpose in life. Because of their courage and daring and idealism, we will miss them all the more. All Americans today are thinking as well of the families of these men and women who have been given this sudden shock and grief. You're not alone. Our entire nation grieves with you. And those you love will always have the respect and gratitude of this country. The cause in which they died will continue. Mankind is led into the darkness beyond our world by the inspiration of discovery and the longing to understand. Our journey into space will go on. In the skies today, we saw destruction and tragedy. Yet farther than we can see, there is comfort and hope. In the words of the prophet Isaiah, "Lift your eyes and look to the heavens. Who created all these? He who brings out the starry hosts one by one and calls them each by name." Because of His great power and mighty strength, not one of them is missing. The same Creator who names the stars also knows the names of the seven souls we mourn today. 
The crew of the shuttle Columbia did not return safely to Earth, yet we can pray that all are safely home. May God bless the grieving families. And may God continue to bless America. [Silence]
|
@ -0,0 +1 @@
|
|||||||
|
Henry F. Phillips from Wikipedia, the free encyclopedia at en.wikipedia.org. Henry F. Phillips from Wikipedia, the free encyclopedia. Henry F. Phillips 1890-1958, a U.S. businessman from Portland, Oregon, has the honor of having the Phillips head screw and screwdriver named after him. The importance of the cross head screw design lies in its self-centering property, useful on automated production lines that use powered screwdrivers. Phillips' major contribution was in driving the cross head concept forward to the point where it was adopted by screw makers and automobile companies. Although he received patents for the design in 1936, U.S. Patent #2,046,343, U.S. Patents #2,046,837 to #2,046,840, it was so widely copied that by 1949 Phillips lost his patent. The American Screw Company was responsible for devising a means of manufacturing the screw, and successfully patented and licensed their method. Other screw makers of the 1930s dismissed the Phillips concept since it calls for a relatively complex recessed socket shape in the head of the screw, as distinct from the simple milled slot of a slotted type screw. The Phillips Screw Company and the American Screw Company went on to devise the Pawsadrive screw, which differs from the Phillips in that it is designed to accommodate greater torque than the Phillips. An image accompanied this article, captioned "Phillips Screw Head." The following is an info box which accompanies this article. Info box, part of the series on screw drive types. Slotted, commonly erroneously flat head. Phillips, cross head. Pawsadrive, super drive. Torques. Hex, Allen. Robertson. Tri-wing. Torx set. Spanner head. Triple square, XZN. Others, poly drive, spline drive, double hex. Many images accompanied this info box. This page was last modified on the 9th of April, 2008, at 1704. All text is available under the terms of the GNU Free Documentation License. See copyrights for details. 
Wikipedia is a registered trademark of the Wikimedia Foundation Incorporated, a U.S. registered 501(c)(3) tax-deductible nonprofit charity. This sound file and all text in the article are licensed under the GNU Free Documentation License, available at www.gnu.org/copyleft/fdl.html.
|
@ -0,0 +1 @@
|
|||||||
|
This is the Micro Machine Man presenting the most midget miniature motorcade of Micro Machines. Each one has dramatic details, terrific trim, precision paint jobs, plus incredible Micro Machine Pocket Playsets. There's a police station, fire station, restaurant, service station, and more. Perfect pocket portables to take anyplace. And there are many miniature playsets to play with, and each one comes with its own special edition Micro Machine vehicle and fun, fantastic features that miraculously move. Raise the boat lift at the airport marina, man the gun turret at the army base, clean your car at the car wash, raise the toll bridge. And these playsets fit together to form a Micro Machine world. Micro Machine Pocket Playsets, so tremendously tiny, so perfectly precise, so dazzlingly detailed, you'll want to pocket them all. Micro Machines are Micro Machine Pocket Playsets sold separately from Galoob. The smaller they are, the better they are.
|
@ -0,0 +1 @@
|
|||||||
|
Hola, como están todos? Mi nombre es Julián Virrueta Mendoza y en este podcast les vengo a hablar sobre la contaminación del agua. Bueno, empezaré por decir que el ser humano no está midiendo las consecuencias de sus actos. No hay duda que uno de los mayores problemas a los que se enfrentan muchas poblaciones actualmente es la contaminación del agua. Principalmente porque como bien sabemos el agua prácticamente es fundamental para la vida, por lo que la contaminación puede ser algo muy negativo para el desarrollo tanto económico como social de los pueblos o de las poblaciones próximas en ese lugar contaminado. Los comienzos de la contaminación, como lo definen muchos expertos en la materia, la contaminación del agua es causada por las actividades humanas. Es un fenómeno ambiental de importancia, el cual se comienza a producir desde los primeros intentos de industrialización para transformarse luego en un problema tan habitual como generalizado. Generalmente la contaminación del agua se produce a través de la introducción directa o indirecta en los acuíferos o caos de agua, ríos, mares, lagos, océanos, etc. o de diversas sustancias que pueden ser consideradas como contaminantes. Pero existen dos formas principales de contaminación del agua. Una de ellas tiene que ver con la contaminación natural del agua que se corresponde con el ciclo natural de esta durante el que puede entrar en contacto con ciertos constituyentes contaminantes como sustancias minerales y orgánicas disueltas o en suspensión que se vierten en la corteza terrestre, la atmósfera y en las aguas. Pero todo esto se puede contradecir si el ser humano comía sus consecuencias, si no tirara basura a los lagos, a los ríos, no tirara botes de aceite, no contaminara. Bueno amigos, yo los invito a que no contaminen el agua y que sepan cuidar la naturaleza. Los saluda su buen amigo y compañero Julián Virreta. Nos vemos. ¡Claro!
|
@ -0,0 +1,125 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This scripts run the selected model agains a collection of audio files from the web.
|
||||||
|
# It downloads, converts and transcribes each file and then compares the result with the expected reference
|
||||||
|
# transcription. The comparison is performed using git's diff command and shows the differences at the character level.
|
||||||
|
# It can be used to quickly verify that the model is working as expected across a wide range of audio files.
|
||||||
|
# I.e. like an integration test. The verification is done by visual inspection of the diff output.
|
||||||
|
#
|
||||||
|
# The reference data can be for example generated using the original OpenAI Whisper implementation, or entered manually.
|
||||||
|
#
|
||||||
|
# Feel free to suggest extra audio files to add to the list.
|
||||||
|
# Make sure they are between 1-3 minutes long since we don't want to make the test too slow.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
#
|
||||||
|
# ./tests/run-tests.sh <model_name>
|
||||||
|
#
|
||||||
|
|
||||||
|
cd `dirname $0`
|
||||||
|
|
||||||
|
# Whisper models
|
||||||
|
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
|
||||||
|
|
||||||
|
# list available models
|
||||||
|
function list_models {
|
||||||
|
printf "\n"
|
||||||
|
printf " Available models:"
|
||||||
|
for model in "${models[@]}"; do
|
||||||
|
printf " $model"
|
||||||
|
done
|
||||||
|
printf "\n\n"
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ $# -eq 0 ]; then
|
||||||
|
printf "Usage: $0 [model]\n\n"
|
||||||
|
printf "No model specified. Aborting\n"
|
||||||
|
list_models
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
model=$1
|
||||||
|
main="../main"
|
||||||
|
|
||||||
|
if [ ! -f ../models/ggml-$model.bin ]; then
|
||||||
|
printf "Model $model not found. Aborting\n"
|
||||||
|
list_models
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $main ]; then
|
||||||
|
printf "Executable $main not found. Aborting\n"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# add various audio files for testing purposes here
|
||||||
|
# the order of the files is important so don't change the existing order
|
||||||
|
# when adding new files, make sure to add the expected "ref.txt" file with the correct transcript
|
||||||
|
urls_en=(
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg"
|
||||||
|
"https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg"
|
||||||
|
"https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav"
|
||||||
|
)
|
||||||
|
|
||||||
|
urls_es=(
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/c/c1/La_contaminacion_del_agua.ogg"
|
||||||
|
)
|
||||||
|
|
||||||
|
urls_it=(
|
||||||
|
)
|
||||||
|
|
||||||
|
urls_pt=(
|
||||||
|
)
|
||||||
|
|
||||||
|
urls_de=(
|
||||||
|
)
|
||||||
|
|
||||||
|
urls_jp=(
|
||||||
|
)
|
||||||
|
|
||||||
|
urls_ru=(
|
||||||
|
)
|
||||||
|
|
||||||
|
function run_lang() {
|
||||||
|
lang=$1
|
||||||
|
shift
|
||||||
|
urls=("$@")
|
||||||
|
|
||||||
|
i=0
|
||||||
|
for url in "${urls[@]}"; do
|
||||||
|
echo "- [$lang] Processing '$url' ..."
|
||||||
|
|
||||||
|
ext="${url##*.}"
|
||||||
|
fname_src="$lang-${i}.${ext}"
|
||||||
|
fname_dst="$lang-${i}-16khz.wav"
|
||||||
|
|
||||||
|
if [ ! -f $fname_src ]; then
|
||||||
|
wget --quiet --show-progress -O $fname_src $url
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $fname_dst ]; then
|
||||||
|
ffmpeg -loglevel -0 -y -i $fname_src -ar 16000 -ac 1 -c:a pcm_s16le $fname_dst
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Error: ffmpeg failed to convert $fname_src to $fname_dst"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
$main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null
|
||||||
|
|
||||||
|
git diff --no-index --word-diff=color --word-diff-regex=. $fname_dst.txt $lang-$i-ref.txt
|
||||||
|
|
||||||
|
i=$(($i+1))
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
run_lang "en" "${urls_en[@]}"
|
||||||
|
|
||||||
|
if [[ $model != *.en ]]; then
|
||||||
|
run_lang "es" "${urls_es[@]}"
|
||||||
|
run_lang "it" "${urls_it[@]}"
|
||||||
|
run_lang "pt" "${urls_pt[@]}"
|
||||||
|
run_lang "de" "${urls_de[@]}"
|
||||||
|
run_lang "jp" "${urls_jp[@]}"
|
||||||
|
run_lang "ru" "${urls_ru[@]}"
|
||||||
|
fi
|
Loading…
Reference in new issue