diff --git a/stream.cpp b/stream.cpp index 86b09a0..da21098 100644 --- a/stream.cpp +++ b/stream.cpp @@ -17,6 +17,34 @@ #include #include #include +#include + +// vad settings +const double MIN_ENERGY = 0.9; +const double INCREASE_FACTOR = 0.0008; +const double FRAME_MARGIN = 5; + +// vad algorithm support variables +// TODO struct +size_t n_frame = 0; +double emin, emax; +double delta; +size_t margin_frame_counter; + +double vad(float *buffer, size_t packet_length, size_t n_frame, size_t *mf_counter, double *emin, double *emax, double *delta) +{ + + // calculate energy of current frame (RMSE) + double current_energy = 0; + for (int i = 0; i < packet_length; i++) + { + current_energy += pow((double)buffer[i], 2); + } + + current_energy = sqrt(current_energy / (double)packet_length); + printf("\ncurrent energy: %f\n", current_energy); + return current_energy; +} // 500 -> 00:05.000 // 6000 -> 01:00.000 @@ -260,8 +288,8 @@ int main(int argc, char ** argv) { while (SDL_GetQueuedAudioSize(g_dev_id_in) < n_samples*sizeof(float)) { SDL_Delay(1); } - const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float); + const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float); // take one second from previous iteration //const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new)); @@ -280,8 +308,10 @@ int main(int argc, char ** argv) { pcmf32_old = pcmf32; - // run the inference - { + double vad_val; + vad_val = vad(pcmf32.data(), pcmf32.size(), n_frame++, &margin_frame_counter, &emin, &emax, &delta); + if (vad_val > 0.2f) // if vad says it's a voice packet... + { // run the inference whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY); wparams.print_progress = false;