diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 01006c1..39fb6fe 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,6 +14,17 @@ if (WHISPER_SUPPORT_SDL2) message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}") endif() +# common + +set(TARGET common) + +add_library(${TARGET} STATIC + common.h + common.cpp + ) + +include(DefaultTargetOptions) + # examples include_directories(${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 57c3514..2ef895f 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -1,15 +1,13 @@ -#include +#include "napi.h" +#include "common.h" + +#include "whisper.h" + #include #include #include #include - -#include "napi.h" - -#define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" - -#include "whisper.h" +#include struct whisper_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); @@ -44,7 +42,7 @@ struct whisper_params { std::string model = "../../ggml-large.bin"; std::vector fname_inp = {}; - std::vector fname_outp = {}; + std::vector fname_out = {}; }; struct whisper_print_user_data { @@ -143,7 +141,6 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi } int run(whisper_params ¶ms, std::vector> &result) { - if (params.fname_inp.empty()) { fprintf(stderr, "error: no input files specified\n"); return 2; @@ -181,91 +178,14 @@ int run(whisper_params ¶ms, std::vector> &result) { for (int f = 0; f < (int) params.fname_inp.size(); ++f) { const auto fname_inp = params.fname_inp[f]; - const auto fname_outp = f < (int)params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f]; + const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; std::vector pcmf32; // mono-channel F32 PCM std::vector> pcmf32s; // stereo-channel F32 PCM - // WAV input - { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname_inp == "-") { - { - uint8_t buf[1024]; - while (true) - { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return 4; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); - } - else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str()); - return 5; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str()); - return 6; - } - - if (params.diarize && wav.channels != 2 && params.no_timestamps == false) { - fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str()); - return 6; - } - - if (wav.sampleRate != WHISPER_SAMPLE_RATE) { - fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000); - return 8; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str()); - return 9; - } - - const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); - - std::vector pcm16; - pcm16.resize(n*wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i])/32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; - } - } - - if (params.diarize) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; - pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; - } - } + if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) { + fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str()); + continue; } // print system information diff --git a/examples/command.wasm/CMakeLists.txt b/examples/command.wasm/CMakeLists.txt index 29ee6d9..3117e16 100644 --- a/examples/command.wasm/CMakeLists.txt +++ b/examples/command.wasm/CMakeLists.txt @@ -11,6 +11,7 @@ add_executable(${TARGET} include(DefaultTargetOptions) target_link_libraries(${TARGET} PRIVATE + common whisper ) diff --git a/examples/command.wasm/emscripten.cpp b/examples/command.wasm/emscripten.cpp index f2ba81e..1cfd063 100644 --- a/examples/command.wasm/emscripten.cpp +++ b/examples/command.wasm/emscripten.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "common.h" #include "whisper.h" #include @@ -27,24 +28,6 @@ std::string g_transcribed = ""; std::vector g_pcmf32; -static std::string trim(const std::string & s) { - std::regex e("^\\s+|\\s+$"); - return std::regex_replace(s, e, ""); -} - -static void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { - const float rc = 1.0f / (2.0f * M_PI * cutoff); - const float dt = 1.0f / sample_rate; - const float alpha = dt / (rc + dt); - - float y = data[0]; - - for (size_t i = 1; i < data.size(); i++) { - y = alpha * (y + data[i] - data[i - 1]); - data[i] = y; - } -} - // compute similarity between two strings using Levenshtein distance static float similarity(const std::string & s0, const std::string & s1) { const size_t len0 = s0.size() + 1; @@ -75,44 +58,6 @@ void command_set_status(const std::string & status) { g_status = status; } -bool command_vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { - const int n_samples = pcmf32.size(); - const int n_samples_last = (sample_rate * last_ms) / 1000; - - if (n_samples_last >= n_samples) { - // not enough samples - assume no speech - return false; - } - - if (freq_thold > 0.0f) { - high_pass_filter(pcmf32, freq_thold, sample_rate); - } - - float energy_all = 0.0f; - float energy_last = 0.0f; - - for (size_t i = 0; i < n_samples; i++) { - energy_all += fabsf(pcmf32[i]); - - if (i >= n_samples - n_samples_last) { - energy_last += fabsf(pcmf32[i]); - } - } - - energy_all /= n_samples; - energy_last /= n_samples_last; - - if (verbose) { - fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); - } - - if (energy_last > vad_thold*energy_all) { - return false; - } - - return true; -} - std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector & pcmf32, float & prob, int64_t & t_ms) { const auto t_start = std::chrono::high_resolution_clock::now(); @@ -155,7 +100,7 @@ void command_get_audio(int ms, int sample_rate, std::vector & audio) { const int64_t n_samples = (ms * sample_rate) / 1000; int64_t n_take = 0; - if (g_pcmf32.size() < n_samples) { + if (n_samples > (int) g_pcmf32.size()) { n_take = g_pcmf32.size(); } else { n_take = n_samples; @@ -187,7 +132,6 @@ void command_main(size_t index) { printf("command: using %d threads\n", wparams.n_threads); - bool is_running = true; bool have_prompt = false; bool ask_prompt = true; bool print_energy = false; @@ -233,7 +177,7 @@ void command_main(size_t index) { { command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); - if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) { + if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) { fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); command_set_status("Speech detected! Processing ..."); diff --git a/examples/command/CMakeLists.txt b/examples/command/CMakeLists.txt index 0980b8c..72d61d4 100644 --- a/examples/command/CMakeLists.txt +++ b/examples/command/CMakeLists.txt @@ -6,5 +6,5 @@ if (WHISPER_SUPPORT_SDL2) include(DefaultTargetOptions) target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) - target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) + target_link_libraries(${TARGET} PRIVATE common whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) endif () diff --git a/examples/command/command.cpp b/examples/command/command.cpp index 2bdaf87..c63722c 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -6,6 +6,7 @@ // ref: https://github.com/ggerganov/whisper.cpp/issues/171 // +#include "common.h" #include "whisper.h" #include @@ -357,62 +358,6 @@ void audio_async::get(int ms, std::vector & result) { /////////////////////////// -std::string trim(const std::string & s) { - std::regex e("^\\s+|\\s+$"); - return std::regex_replace(s, e, ""); -} - -void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { - const float rc = 1.0f / (2.0f * M_PI * cutoff); - const float dt = 1.0f / sample_rate; - const float alpha = dt / (rc + dt); - - float y = data[0]; - - for (size_t i = 1; i < data.size(); i++) { - y = alpha * (y + data[i] - data[i - 1]); - data[i] = y; - } -} - -bool vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { - const int n_samples = pcmf32.size(); - const int n_samples_last = (sample_rate * last_ms) / 1000; - - if (n_samples_last >= n_samples) { - // not enough samples - assume no speech - return false; - } - - if (freq_thold > 0.0f) { - high_pass_filter(pcmf32, freq_thold, sample_rate); - } - - float energy_all = 0.0f; - float energy_last = 0.0f; - - for (int i = 0; i < n_samples; i++) { - energy_all += fabsf(pcmf32[i]); - - if (i >= n_samples - n_samples_last) { - energy_last += fabsf(pcmf32[i]); - } - } - - energy_all /= n_samples; - energy_last /= n_samples_last; - - if (verbose) { - fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); - } - - if (energy_last > vad_thold*energy_all) { - return false; - } - - return true; -} - std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector & pcmf32, float & prob, int64_t & t_ms) { const auto t_start = std::chrono::high_resolution_clock::now(); @@ -502,7 +447,7 @@ std::vector read_allowed_commands(const std::string & fname) { std::string line; while (std::getline(ifs, line)) { - line = trim(line); + line = ::trim(line); if (line.empty()) { continue; } @@ -641,7 +586,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const audio.get(2000, pcmf32_cur); - if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) { + if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) { fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); const auto t_start = std::chrono::high_resolution_clock::now(); @@ -791,7 +736,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi { audio.get(2000, pcmf32_cur); - if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) { + if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) { fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); int64_t t_ms = 0; @@ -870,7 +815,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud { audio.get(2000, pcmf32_cur); - if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) { + if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) { fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); int64_t t_ms = 0; diff --git a/examples/common.cpp b/examples/common.cpp new file mode 100644 index 0000000..3da76b7 --- /dev/null +++ b/examples/common.cpp @@ -0,0 +1,158 @@ +#include "common.h" + +// third-party utilities +// use your favorite implementations +#define DR_WAV_IMPLEMENTATION +#include "dr_wav.h" + +#include +#include + +std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +std::string replace(const std::string & s, const std::string & from, const std::string & to) { + std::string result = s; + size_t pos = 0; + while ((pos = result.find(from, pos)) != std::string::npos) { + result.replace(pos, from.length(), to); + pos += to.length(); + } + return result; +} + +bool read_wav(const std::string & fname, std::vector& pcmf32, std::vector>& pcmf32s, bool stereo) { + drwav wav; + std::vector wav_data; // used for pipe input from stdin + + if (fname == "-") { + { + uint8_t buf[1024]; + while (true) + { + const size_t n = fread(buf, 1, sizeof(buf), stdin); + if (n == 0) { + break; + } + wav_data.insert(wav_data.end(), buf, buf + n); + } + } + + if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { + fprintf(stderr, "error: failed to open WAV file from stdin\n"); + return false; + } + + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); + } + else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { + fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); + return false; + } + + if (wav.channels != 1 && wav.channels != 2) { + fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str()); + return false; + } + + if (stereo && wav.channels != 2) { + fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str()); + return false; + } + + if (wav.sampleRate != COMMON_SAMPLE_RATE) { + fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000); + return false; + } + + if (wav.bitsPerSample != 16) { + fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str()); + return false; + } + + const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); + + std::vector pcm16; + pcm16.resize(n*wav.channels); + drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); + drwav_uninit(&wav); + + // convert to mono, float + pcmf32.resize(n); + if (wav.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i])/32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; + } + } + + if (stereo) { + // convert to stereo, float + pcmf32s.resize(2); + + pcmf32s[0].resize(n); + pcmf32s[1].resize(n); + for (uint64_t i = 0; i < n; i++) { + pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; + pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; + } + } + + return true; +} + +void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { + const float rc = 1.0f / (2.0f * M_PI * cutoff); + const float dt = 1.0f / sample_rate; + const float alpha = dt / (rc + dt); + + float y = data[0]; + + for (size_t i = 1; i < data.size(); i++) { + y = alpha * (y + data[i] - data[i - 1]); + data[i] = y; + } +} + +bool vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { + const int n_samples = pcmf32.size(); + const int n_samples_last = (sample_rate * last_ms) / 1000; + + if (n_samples_last >= n_samples) { + // not enough samples - assume no speech + return false; + } + + if (freq_thold > 0.0f) { + high_pass_filter(pcmf32, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + float energy_last = 0.0f; + + for (int i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + + if (i >= n_samples - n_samples_last) { + energy_last += fabsf(pcmf32[i]); + } + } + + energy_all /= n_samples; + energy_last /= n_samples_last; + + if (verbose) { + fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); + } + + if (energy_last > vad_thold*energy_all) { + return false; + } + + return true; +} diff --git a/examples/common.h b/examples/common.h new file mode 100644 index 0000000..04dd7cb --- /dev/null +++ b/examples/common.h @@ -0,0 +1,40 @@ +#pragma once + +// needs to match WHISPER_SAMPLE_RATE +#define COMMON_SAMPLE_RATE 16000 + +#include +#include + +std::string trim(const std::string & s); + +std::string replace( + const std::string & s, + const std::string & from, + const std::string & to); + +// Read WAV audio file and store the PCM data into pcmf32 +// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE +// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM +bool read_wav( + const std::string & fname, + std::vector & pcmf32, + std::vector> & pcmf32s, + bool stereo); + +// Apply a high-pass frequency filter to PCM audio +// Suppresses frequencies below cutoff Hz +void high_pass_filter( + std::vector & data, + float cutoff, + float sample_rate); + +// Basic voice activity detection (VAD) using audio energy adaptive threshold +bool vad_simple( + std::vector & pcmf32, + int sample_rate, + int last_ms, + float vad_thold, + float freq_thold, + bool verbose); + diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index c551100..1bb16f5 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp) include(DefaultTargetOptions) -target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index fbc9faf..2143070 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,9 +1,6 @@ -#include "whisper.h" +#include "common.h" -// third-party utilities -// use your favorite implementations -#define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" +#include "whisper.h" #include #include @@ -86,7 +83,7 @@ struct whisper_params { std::string model = "models/ggml-base.en.bin"; std::vector fname_inp = {}; - std::vector fname_outp = {}; + std::vector fname_out = {}; }; void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -126,7 +123,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; } else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; } else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; } - else if (arg == "-of" || arg == "--output-file") { params.fname_outp.emplace_back(argv[++i]); } + else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; } else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; } @@ -520,91 +517,14 @@ int main(int argc, char ** argv) { for (int f = 0; f < (int) params.fname_inp.size(); ++f) { const auto fname_inp = params.fname_inp[f]; - const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f]; + const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; - std::vector pcmf32; // mono-channel F32 PCM + std::vector pcmf32; // mono-channel F32 PCM std::vector> pcmf32s; // stereo-channel F32 PCM - // WAV input - { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname_inp == "-") { - { - uint8_t buf[1024]; - while (true) - { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return 4; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); - } - else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str()); - return 5; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str()); - return 6; - } - - if (params.diarize && wav.channels != 2 && params.no_timestamps == false) { - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str()); - return 6; - } - - if (wav.sampleRate != WHISPER_SAMPLE_RATE) { - fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000); - return 8; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str()); - return 9; - } - - const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); - - std::vector pcm16; - pcm16.resize(n*wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i])/32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; - } - } - - if (params.diarize) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; - pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; - } - } + if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) { + fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str()); + continue; } // print system information @@ -701,34 +621,33 @@ int main(int argc, char ** argv) { // output to text file if (params.output_txt) { - const auto fname_txt = fname_outp + ".txt"; + const auto fname_txt = fname_out + ".txt"; output_txt(ctx, fname_txt.c_str()); } // output to VTT file if (params.output_vtt) { - const auto fname_vtt = fname_outp + ".vtt"; + const auto fname_vtt = fname_out + ".vtt"; output_vtt(ctx, fname_vtt.c_str()); } // output to SRT file if (params.output_srt) { - const auto fname_srt = fname_outp + ".srt"; + const auto fname_srt = fname_out + ".srt"; output_srt(ctx, fname_srt.c_str(), params); } // output to WTS file if (params.output_wts) { - const auto fname_wts = fname_outp + ".wts"; + const auto fname_wts = fname_out + ".wts"; output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE); } - // output to CSV file + // output to CSV file if (params.output_csv) { - const auto fname_csv = fname_outp + ".csv"; + const auto fname_csv = fname_out + ".csv"; output_csv(ctx, fname_csv.c_str()); } - } } diff --git a/examples/stream/CMakeLists.txt b/examples/stream/CMakeLists.txt index aeba327..525c8ac 100644 --- a/examples/stream/CMakeLists.txt +++ b/examples/stream/CMakeLists.txt @@ -6,5 +6,5 @@ if (WHISPER_SUPPORT_SDL2) include(DefaultTargetOptions) target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) - target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) + target_link_libraries(${TARGET} PRIVATE common whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) endif () diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index b2a1824..6a83590 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -3,6 +3,7 @@ // A very quick-n-dirty implementation serving mainly as a proof of concept. // +#include "common.h" #include "whisper.h" #include @@ -365,57 +366,6 @@ void audio_async::get(int ms, std::vector & result) { /////////////////////////// -void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { - const float rc = 1.0f / (2.0f * M_PI * cutoff); - const float dt = 1.0f / sample_rate; - const float alpha = dt / (rc + dt); - - float y = data[0]; - - for (size_t i = 1; i < data.size(); i++) { - y = alpha * (y + data[i] - data[i - 1]); - data[i] = y; - } -} - -bool vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { - const int n_samples = pcmf32.size(); - const int n_samples_last = (sample_rate * last_ms) / 1000; - - if (n_samples_last >= n_samples) { - // not enough samples - assume no speech - return false; - } - - if (freq_thold > 0.0f) { - high_pass_filter(pcmf32, freq_thold, sample_rate); - } - - float energy_all = 0.0f; - float energy_last = 0.0f; - - for (int i = 0; i < n_samples; i++) { - energy_all += fabsf(pcmf32[i]); - - if (i >= n_samples - n_samples_last) { - energy_last += fabsf(pcmf32[i]); - } - } - - energy_all /= n_samples; - energy_last /= n_samples_last; - - if (verbose) { - fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); - } - - if (energy_last > vad_thold*energy_all) { - return false; - } - - return true; -} - int main(int argc, char ** argv) { whisper_params params; @@ -587,7 +537,7 @@ int main(int argc, char ** argv) { audio.get(2000, pcmf32_new); - if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) { + if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) { audio.get(params.length_ms, pcmf32); } else { std::this_thread::sleep_for(std::chrono::milliseconds(100)); diff --git a/examples/talk/CMakeLists.txt b/examples/talk/CMakeLists.txt index 6dbc2dc..f35200a 100644 --- a/examples/talk/CMakeLists.txt +++ b/examples/talk/CMakeLists.txt @@ -7,7 +7,7 @@ if (WHISPER_SUPPORT_SDL2) # TODO: this is temporary # need to export ggml symbols for MSVC, but too lazy .. - add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp) + add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../../ggml.c ../../whisper.cpp) include(DefaultTargetOptions) diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp index 55cd46a..716b9d0 100644 --- a/examples/talk/talk.cpp +++ b/examples/talk/talk.cpp @@ -1,6 +1,7 @@ // Talk with AI // +#include "common.h" #include "whisper.h" #include "gpt-2.h" @@ -353,72 +354,6 @@ void audio_async::get(int ms, std::vector & result) { /////////////////////////// -std::string trim(const std::string & s) { - std::regex e("^\\s+|\\s+$"); - return std::regex_replace(s, e, ""); -} - -std::string replace(const std::string & s, const std::string & from, const std::string & to) { - std::string result = s; - size_t pos = 0; - while ((pos = result.find(from, pos)) != std::string::npos) { - result.replace(pos, from.length(), to); - pos += to.length(); - } - return result; -} - -void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { - const float rc = 1.0f / (2.0f * M_PI * cutoff); - const float dt = 1.0f / sample_rate; - const float alpha = dt / (rc + dt); - - float y = data[0]; - - for (size_t i = 1; i < data.size(); i++) { - y = alpha * (y + data[i] - data[i - 1]); - data[i] = y; - } -} - -bool vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { - const int n_samples = pcmf32.size(); - const int n_samples_last = (sample_rate * last_ms) / 1000; - - if (n_samples_last >= n_samples) { - // not enough samples - assume no speech - return false; - } - - if (freq_thold > 0.0f) { - high_pass_filter(pcmf32, freq_thold, sample_rate); - } - - float energy_all = 0.0f; - float energy_last = 0.0f; - - for (int i = 0; i < n_samples; i++) { - energy_all += fabsf(pcmf32[i]); - - if (i >= n_samples - n_samples_last) { - energy_last += fabsf(pcmf32[i]); - } - } - - energy_all /= n_samples; - energy_last /= n_samples_last; - - if (verbose) { - fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); - } - - if (energy_last > vad_thold*energy_all) { - return false; - } - - return true; -} - std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector & pcmf32, float & prob, int64_t & t_ms) { const auto t_start = std::chrono::high_resolution_clock::now(); @@ -583,7 +518,7 @@ int main(int argc, char ** argv) { { audio.get(2000, pcmf32_cur); - if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) { + if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) { fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); audio.get(params.voice_ms, pcmf32_cur);