From c30bffc8a5cbc956bf4445bffb2759e5cc666374 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 7 Nov 2022 20:14:52 +0200
Subject: [PATCH] ref #22 : add "duration" option

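Can be used to process only part of a recording: the new "duration"
option (in milliseconds) limits how much audio is processed, starting
from the time offset. A value of 0 (the default) means no limit.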
---
 examples/main/main.cpp |  5 +++++
 whisper.cpp            | 11 +++++++----
 whisper.h              |  3 ++-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 5907e0b..0bac6da 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -53,6 +53,7 @@ struct whisper_params {
     int32_t n_processors = 1;
     int32_t offset_t_ms  = 0;
     int32_t offset_n     = 0;
+    int32_t duration_ms  = 0;
     int32_t max_context  = -1;
     int32_t max_len      = 0;
 
@@ -95,6 +96,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
             params.offset_t_ms = std::stoi(argv[++i]);
         } else if (arg == "-on" || arg == "--offset-n") {
             params.offset_n = std::stoi(argv[++i]);
+        } else if (arg == "-d" || arg == "--duration") {
+            params.duration_ms = std::stoi(argv[++i]);
         } else if (arg == "-mc" || arg == "--max-context") {
             params.max_context = std::stoi(argv[++i]);
         } else if (arg == "-ml" || arg == "--max-len") {
@@ -154,6 +157,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
     fprintf(stderr, "  -p N,     --processors N   number of processors to use during computation (default: %d)\n", params.n_processors);
     fprintf(stderr, "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n", params.offset_t_ms);
     fprintf(stderr, "  -on N,    --offset-n N     segment index offset (default: %d)\n", params.offset_n);
+    fprintf(stderr, "  -d  N,    --duration N     duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
     fprintf(stderr, "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n");
     fprintf(stderr, "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n", params.max_len);
     fprintf(stderr, "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n", params.word_thold);
@@ -532,6 +536,7 @@ int main(int argc, char ** argv) {
             wparams.n_threads            = params.n_threads;
             wparams.n_max_text_ctx       = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
             wparams.offset_ms            = params.offset_t_ms;
+            wparams.duration_ms          = params.duration_ms;
 
             wparams.token_timestamps     = params.output_wts || params.max_len > 0;
             wparams.thold_pt             = params.word_thold;
diff --git a/whisper.cpp b/whisper.cpp
index 02ab5cb..7078863 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2339,6 +2339,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                     /*.n_threads            =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
                     /*.n_max_text_ctx       =*/ 16384,
                     /*.offset_ms            =*/ 0,
+                    /*.duration_ms          =*/ 0,
 
                     /*.translate            =*/ false,
                     /*.no_context           =*/ false,
@@ -2376,6 +2377,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                     /*.n_threads            =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
                     /*.n_max_text_ctx       =*/ 16384,
                     /*.offset_ms            =*/ 0,
+                    /*.duration_ms          =*/ 0,
 
                     /*.translate            =*/ false,
                     /*.no_context           =*/ false,
@@ -2496,11 +2498,12 @@ int whisper_full(
     }
 
     const int seek_start = params.offset_ms/10;
+    const int seek_end = params.duration_ms == 0 ? whisper_n_len(ctx) : std::min(whisper_n_len(ctx), seek_start + params.duration_ms/10); // absolute end position in 10 ms units, never past the end of the spectrogram
 
     // if length of spectrogram is less than 1s (100 samples), then return
     // basically don't process anything that is less than 1s
     // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
-    if (whisper_n_len(ctx) < 100 + seek_start) {
+    if (seek_end < 100 + seek_start) {
         return 0;
     }
 
@@ -2533,7 +2536,7 @@ int whisper_full(
     // main loop
     int seek = seek_start;
     while (true) {
-        int progress_cur = (100*seek)/whisper_n_len(ctx);
+        const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
         while (progress_cur >= progress_prev + progress_step) {
             progress_prev += progress_step;
             if (params.print_progress) {
@@ -2541,7 +2544,7 @@ int whisper_full(
             }
         }
 
-        if (seek + 100 >= whisper_n_len(ctx)) {
+        if (seek + 100 >= seek_end) {
             break;
         }
 
@@ -2622,7 +2625,7 @@ int whisper_full(
                 // end of text token
                 if (token.id == whisper_token_eot(ctx)) {
                     if (result_len == 0) {
-                        if (seek + seek_delta + 100 >= whisper_n_len(ctx)) {
+                        if (seek + seek_delta + 100 >= seek_end) {
                             result_len = i + 1;
                         } else {
                             // TODO: figure out how to resolve this
diff --git a/whisper.h b/whisper.h
index 57ea5db..4c112f4 100644
--- a/whisper.h
+++ b/whisper.h
@@ -186,7 +186,8 @@ extern "C" {
 
         int n_threads;
         int n_max_text_ctx;
-        int offset_ms;
+        int offset_ms;      // start offset in ms
+        int duration_ms;    // audio duration to process in ms, counted from offset_ms (0 = no limit)
 
         bool translate;
         bool no_context;