|
|
|
@ -17,6 +17,13 @@
|
|
|
|
|
#include <fstream>
|
|
|
|
|
#include <mutex>
|
|
|
|
|
|
|
|
|
|
// Terminal color map: 10 ANSI 256-color foreground escape sequences, ordered
// from the lowest bucket to the highest. Entry i is intended for values that
// fall in the bucket starting at i/10, i.e. 0.0, 0.1, ..., 0.9.
// Lowest is red, middle is yellow, highest is green.
// Print "\033[0m" after the colored text to restore the default color.
const std::vector<std::string> k_colors = {
    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
|
|
|
|
|
|
|
|
|
|
// 500 -> 00:05.000
|
|
|
|
|
// 6000 -> 01:00.000
|
|
|
|
|
std::string to_timestamp(int64_t t) {
|
|
|
|
@ -47,6 +54,7 @@ struct whisper_params {
|
|
|
|
|
bool speed_up = false;
|
|
|
|
|
bool translate = false;
|
|
|
|
|
bool print_special = false;
|
|
|
|
|
bool print_colors = false;
|
|
|
|
|
bool no_context = true;
|
|
|
|
|
bool no_timestamps = false;
|
|
|
|
|
|
|
|
|
@ -77,6 +85,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
|
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
|
|
|
|
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
|
|
|
|
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
|
|
|
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
|
|
|
|
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
|
|
|
|
|
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
|
|
|
|
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
|
|
|
@ -109,6 +118,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
|
|
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
|
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
|
|
|
@ -656,7 +666,19 @@ int main(int argc, char ** argv) {
|
|
|
|
|
if (params.fname_out.length() > 0) {
|
|
|
|
|
fout << text;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (params.print_colors) {
|
|
|
|
|
const float p = whisper_full_get_token_p (ctx, i);
|
|
|
|
|
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
|
|
|
|
|
|
|
|
|
|
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
|
|
|
|
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
|
|
|
|
|
|
|
|
|
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), k_colors[col].c_str(), text, "\033[0m");
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
|
|
|
|
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
|
|
|
|
|
|
|
|
|