Merge pull request #1 from rimoapp/awakia/token_split

Enable splitting the output by token
3 years ago · 340378cc40
parent 124c718c73 1c2d35faf2
commit 340378cc40
1 changed file with 56 additions and 1 deletion
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -62,6 +62,7 @@ struct whisper_params {

    float word_thold = 0.01f;

+    bool tokens        = false;
    bool speed_up      = false;
    bool translate     = false;
    bool diarize       = false;
@ -102,6 +103,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-mc"   || arg == "--max-context")   { params.max_context   = std::stoi(argv[++i]); }
        else if (arg == "-ml"   || arg == "--max-len")       { params.max_len       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")    { params.word_thold    = std::stof(argv[++i]); }
+        else if (arg == "-tk"   || arg == "--tokens")        { params.tokens        = true; }
        else if (arg == "-su"   || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-di"   || arg == "--diarize")       { params.diarize       = true; }
@ -139,6 +141,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "  -mc N,    --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
    fprintf(stderr, "  -ml N,    --max-len N     [%-7d] maximum segment length in characters\n",           params.max_len);
    fprintf(stderr, "  -wt N,    --word-thold N  [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -tk,      --tokens        [%-7s] outputs token level timestamps\n",                 params.tokens ? "true" : "false");
    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
    fprintf(stderr, "  -di,      --diarize       [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@ -155,6 +158,28 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "\n");
 }

// Checks whether the NUL-terminated byte string `str` is structurally valid UTF-8.
//
// Each lead byte must be followed by the right number of 10bbbbbb continuation
// bytes, and the UTF-16 surrogate range U+D800..U+DFFF is rejected. A sequence
// that is cut off by the terminating NUL is reported as invalid, which is what
// lets a caller detect a multi-byte character split across several chunks.
//
// This is a structural check only: overlong encodings and the lead bytes
// 0xF5..0xF7 are not rejected.
//
// @param str  NUL-terminated byte string; must not be null.
// @return     true if every byte sequence in `str` is well-formed as described above.
bool utf8_check_is_valid(const char* str) {
    for (int i = 0; str[i]; i++) {
        const unsigned char c = static_cast<unsigned char>(str[i]);
        int n = 0; // number of continuation bytes still expected after the bytes consumed so far

        if (c <= 0x7f) {
            n = 0; // 0bbbbbbb
        } else if ((c & 0xE0) == 0xC0) {
            n = 1; // 110bbbbb
        } else if (c == 0xED) {
            // 0xED leads a 3-byte sequence covering U+D000..U+DFFF. The second byte
            // decides whether it lands in the surrogate range, so inspect it here.
            const unsigned char second = static_cast<unsigned char>(str[++i]);
            if ((second & 0xC0) != 0x80) return false; // not a continuation byte (also catches the NUL terminator)
            if ((second & 0xA0) == 0xA0) return false; // 0xA0..0xBF => U+D800..U+DFFF
            n = 1; // the third byte of the sequence still has to be a continuation byte
        } else if ((c & 0xF0) == 0xE0) {
            n = 2; // 1110bbbb
        } else if ((c & 0xF8) == 0xF0) {
            n = 3; // 11110bbb
        } else {
            // Stray continuation byte, or a 5/6-byte lead byte that UTF-8 no longer permits.
            return false;
        }

        // The next n bytes must all match 10bbbbbb. A NUL terminator fails this test,
        // so the loop never reads past the end of the string.
        for (int j = 0; j < n; j++) {
            if ((static_cast<unsigned char>(str[++i]) & 0xC0) != 0x80) {
                return false;
            }
        }
    }
    return true;
}
+
 struct whisper_print_user_data {
    const whisper_params * params;

@ -245,6 +270,36 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
                    printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
                }
                printf("\n");
+            } else if (params.tokens) {
+                bool continued = false;
+                int64_t token_t0;
+                std::string utf_text;
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
+                    }
+
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
+                    if (continued) {
+                      utf_text += text;
+                    } else {
+                      utf_text = text;
+                      token_t0 = token.t0;
+                    }
+                    if (utf8_check_is_valid(utf_text.c_str())) {
+                      continued = false;
+                    } else {
+                      continued = true;
+                      continue;
+                    }
+
+                    printf("[%s --> %s]  %s%s\n", to_timestamp(token_t0).c_str(), to_timestamp(token.t1).c_str(), speaker.c_str(), utf_text.c_str());
+                }
+                printf("\n");
            } else {
                const char * text = whisper_full_get_segment_text(ctx, i);

@ -593,7 +648,7 @@ int main(int argc, char ** argv) {
            wparams.offset_ms        = params.offset_t_ms;
            wparams.duration_ms      = params.duration_ms;

-            wparams.token_timestamps = params.output_wts || params.max_len > 0;
+            wparams.token_timestamps = params.tokens || params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;