#include "common.h"

// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// Returns a copy of `s` with leading and trailing whitespace removed.
std::string trim(const std::string & s) {
    // compiled once: constructing a std::regex on every call is expensive
    static const std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
}

// Returns a copy of `s` in which every non-overlapping occurrence of `from`
// is replaced by `to`. Text inserted by a replacement is never re-scanned.
// An empty `from` matches nothing and `s` is returned unchanged.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;

    // find("") succeeds at every position, so without this guard the loop
    // below would never terminate
    if (from.empty()) {
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length(); // continue after the inserted text
    }

    return result;
}

// Loads a 16-bit PCM WAV file sampled at COMMON_SAMPLE_RATE.
//
//   fname   - path of the file to read, or "-" to read the whole of stdin
//   pcmf32  - receives the samples as mono floats (stereo input is averaged)
//   pcmf32s - when `stereo` is true, receives the two channels separately
//             ([0] = first interleaved channel, [1] = second)
//   stereo  - require 2-channel input and fill `pcmf32s`
//
// Returns false, after printing a message to stderr, when the input cannot be
// opened or is not mono/stereo, 16-bit, COMMON_SAMPLE_RATE audio.
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin

    if (fname == "-") {
        {
            uint8_t buf[1024];
            while (true) {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }

        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
    }

    // From here on the decoder is initialized: release it on every exit path,
    // including the validation failures below. Declared after `wav_data` so
    // the in-memory buffer outlives the decoder that reads from it.
    struct wav_closer {
        drwav * wav;
        ~wav_closer() { drwav_uninit(wav); }
    } closer{&wav};

    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
        return false;
    }

    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
        return false;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
        return false;
    }

    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
        return false;
    }

    // For files the frame count comes from the decoder. For stdin it is
    // derived from the number of bytes received; that count includes the
    // header bytes, so it is an upper bound and any frames that are not
    // actually decoded stay zero-initialized in `pcm16`.
    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);

    std::vector<int16_t> pcm16;
    pcm16.resize(n*wav.channels);
    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());

    // convert to mono, float
    pcmf32.resize(n);
    if (wav.channels == 1) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        for (uint64_t i = 0; i < n; i++) {
            // the two int16 samples are summed as int, so this cannot overflow
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }

    if (stereo) {
        // convert to stereo, float (de-interleave the two channels)
        pcmf32s.resize(2);

        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }

    return true;
}

// Filters `data` in place with the recurrence
//     y = alpha * (y + data[i] - data[i - 1]),  alpha = dt / (rc + dt)
// where rc = 1 / (2*pi*cutoff) and dt = 1 / sample_rate. data[0] is left
// untouched. Empty input is a no-op.
//
// NOTE(review): data[i - 1] has already been overwritten with the previous
// output by the time it is read (for i >= 2), so the update collapses to
// approximately `alpha * data[i]`. Kept as-is because callers' thresholds may
// be tuned against this behaviour - confirm before changing.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        return; // nothing to filter; also avoids reading data[0] out of bounds
    }

    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}

// Energy-based check on the tail of an audio buffer.
//
// Compares the mean absolute amplitude of the last `last_ms` milliseconds
// with the mean absolute amplitude of the whole buffer.
//
//   pcmf32      - samples; modified in place when `freq_thold` > 0
//   sample_rate - samples per second, used to convert `last_ms` to a count
//   last_ms     - length of the tail window in milliseconds
//   vad_thold   - the tail counts as active when its energy exceeds
//                 vad_thold * overall energy
//   freq_thold  - when > 0, high_pass_filter() is applied with this cutoff
//   verbose     - print the computed energies to stderr
//
// Returns false when the buffer is not longer than the tail window, or when
// the tail energy exceeds the threshold; returns true otherwise.
//
// NOTE(review): a `last_ms` of 0 makes the tail average 0/0; callers are
// presumably expected to pass a positive window - confirm.
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_samples = pcmf32.size();
    // 64-bit product: sample_rate * last_ms can exceed the range of int
    const int64_t n_samples_last = (int64_t(sample_rate) * last_ms) / 1000;

    if (n_samples_last >= n_samples) {
        // not enough samples - assume no speech
        return false;
    }

    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }

    float energy_all  = 0.0f;
    float energy_last = 0.0f;

    for (int i = 0; i < n_samples; i++) {
        energy_all += fabsf(pcmf32[i]);

        if (i >= n_samples - n_samples_last) {
            energy_last += fabsf(pcmf32[i]);
        }
    }

    energy_all  /= n_samples;
    energy_last /= n_samples_last;

    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }

    if (energy_last > vad_thold*energy_all) {
        return false;
    }

    return true;
}