ADD VAD to save inference cycles

3 years ago · b211d2f9b1
parent 167324584b
commit b211d2f9b1
1 changed files with 33 additions and 3 deletions
--- a/stream.cpp
+++ b/stream.cpp
@@ -17,6 +17,34 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <math.h>
+
+// VAD (voice activity detection) settings
+const double MIN_ENERGY = 0.9;  // NOTE(review): not referenced anywhere in the visible code
+const double INCREASE_FACTOR = 0.0008;  // NOTE(review): not referenced anywhere in the visible code
+const double FRAME_MARGIN = 5;  // NOTE(review): not referenced anywhere in the visible code
+
+// VAD algorithm support variables (file-scope state handed to vad() by main)
+// TODO: group these into a struct
+size_t n_frame = 0;  // post-incremented by main on every vad() call
+double emin, emax;  // passed by address to vad(), which does not use them yet
+double delta;  // passed by address to vad(), which does not use it yet
+size_t margin_frame_counter;  // passed by address to vad(), which does not use it yet
+
+double vad(float *buffer, size_t packet_length, size_t n_frame, size_t *mf_counter, double *emin, double *emax, double *delta)
+{
+    // Returns the root mean square (RMS) of buffer[0 .. packet_length) as the frame energy.
+    // n_frame, mf_counter, emin, emax and delta are accepted but not used in this body.
+    double current_energy = 0;
+    for (int i = 0; i < packet_length; i++)  // NOTE(review): int index is compared against a size_t length
+    {
+        current_energy += pow((double)buffer[i], 2);  // accumulate the sum of squared samples
+    }
+
+    current_energy = sqrt(current_energy / (double)packet_length);  // NOTE(review): packet_length == 0 divides 0 by 0 here (NaN under IEEE-754)
+    printf("\ncurrent energy: %f\n", current_energy);  // written to stdout on every call
+    return current_energy;
+}

 //  500 -> 00:05.000
 // 6000 -> 01:00.000
@@ -241,8 +269,8 @@ int main(int argc, char ** argv) {
        while (SDL_GetQueuedAudioSize(g_dev_id_in) < 3*WHISPER_SAMPLE_RATE*sizeof(float)) {
            SDL_Delay(1);
        }
-        const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);

+        const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);
        // take one second from previous iteration
        // TODO: better strategy
        const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
@@ -259,8 +287,10 @@ int main(int argc, char ** argv) {

        pcmf32_old = pcmf32;

-        // run the inference
-        {
+        double vad_val;
+        vad_val = vad(pcmf32.data(), pcmf32.size(), n_frame++, &margin_frame_counter, &emin, &emax, &delta);
+        if (vad_val > 0.2f) // if vad says it's a voice packet...
+        {  // run the inference
            whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);

            wparams.print_progress       = false;