|
|
|
@ -62,6 +62,7 @@ struct whisper_params {
|
|
|
|
|
|
|
|
|
|
float word_thold = 0.01f;
|
|
|
|
|
|
|
|
|
|
bool tokens = false;
|
|
|
|
|
bool speed_up = false;
|
|
|
|
|
bool translate = false;
|
|
|
|
|
bool diarize = false;
|
|
|
|
@ -102,6 +103,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
|
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
|
|
|
|
else if (arg == "-tk" || arg == "--tokens") { params.tokens = true; }
|
|
|
|
|
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
|
|
|
|
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
|
|
|
|
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
|
|
|
@ -139,6 +141,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
|
|
|
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
|
|
|
|
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
|
|
|
|
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
|
|
|
|
fprintf(stderr, " -tk, --tokens [%-7s] outputs token level timestamps\n", params.tokens ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
@ -155,6 +158,28 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool utf8_check_is_valid(const char* str) {
|
|
|
|
|
for (int i = 0; str[i]; i++) {
|
|
|
|
|
int n = 0;
|
|
|
|
|
unsigned char c = str[i];
|
|
|
|
|
if (0x00 <= c && c <= 0x7f) n = 0; // 0bbbbbbb
|
|
|
|
|
else if ((c & 0xE0) == 0xC0) n = 1; // 110bbbbb
|
|
|
|
|
else if (c == 0xED) {
|
|
|
|
|
if (!str[++i]) return false;
|
|
|
|
|
if (((unsigned char)(str[i]) & 0xA0) == 0xA0) return false; //U+d800 to U+dfff
|
|
|
|
|
}
|
|
|
|
|
else if ((c & 0xF0) == 0xE0) n = 2; // 1110bbbb
|
|
|
|
|
else if ((c & 0xF8) == 0xF0) n = 3; // 11110bbb
|
|
|
|
|
//else if (($c & 0xFC) == 0xF8) n = 4; // 111110bb //byte 5, unnecessary in 4 byte UTF-8
|
|
|
|
|
//else if (($c & 0xFE) == 0xFC) n = 5; // 1111110b //byte 6, unnecessary in 4 byte UTF-8
|
|
|
|
|
else return false;
|
|
|
|
|
for (int j = 0; j < n; j++) // n bytes matching 10bbbbbb follow ?
|
|
|
|
|
if ((str[++i] & 0xC0) != 0x80)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct whisper_print_user_data {
|
|
|
|
|
const whisper_params * params;
|
|
|
|
|
|
|
|
|
@ -245,6 +270,36 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
|
|
|
|
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
|
|
|
|
}
|
|
|
|
|
printf("\n");
|
|
|
|
|
} else if (params.tokens) {
|
|
|
|
|
bool continued = false;
|
|
|
|
|
int64_t token_t0;
|
|
|
|
|
std::string utf_text;
|
|
|
|
|
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
|
|
|
|
if (params.print_special == false) {
|
|
|
|
|
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
|
|
|
|
if (id >= whisper_token_eot(ctx)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const char * text = whisper_full_get_token_text(ctx, i, j);
|
|
|
|
|
const whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
|
|
|
|
if (continued) {
|
|
|
|
|
utf_text += text;
|
|
|
|
|
} else {
|
|
|
|
|
utf_text = text;
|
|
|
|
|
token_t0 = token.t0;
|
|
|
|
|
}
|
|
|
|
|
if (utf8_check_is_valid(utf_text.c_str())) {
|
|
|
|
|
continued = false;
|
|
|
|
|
} else {
|
|
|
|
|
continued = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
printf("[%s --> %s] %s%s\n", to_timestamp(token_t0).c_str(), to_timestamp(token.t1).c_str(), speaker.c_str(), utf_text.c_str());
|
|
|
|
|
}
|
|
|
|
|
printf("\n");
|
|
|
|
|
} else {
|
|
|
|
|
const char * text = whisper_full_get_segment_text(ctx, i);
|
|
|
|
|
|
|
|
|
@ -593,7 +648,7 @@ int main(int argc, char ** argv) {
|
|
|
|
|
wparams.offset_ms = params.offset_t_ms;
|
|
|
|
|
wparams.duration_ms = params.duration_ms;
|
|
|
|
|
|
|
|
|
|
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
|
|
|
|
wparams.token_timestamps = params.tokens || params.output_wts || params.max_len > 0;
|
|
|
|
|
wparams.thold_pt = params.word_thold;
|
|
|
|
|
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
|
|
|
|
|
|
|
|
|