|
|
|
@ -59,8 +59,12 @@ struct whisper_params {
|
|
|
|
|
int32_t duration_ms = 0;
|
|
|
|
|
int32_t max_context = -1;
|
|
|
|
|
int32_t max_len = 0;
|
|
|
|
|
int32_t best_of = 5;
|
|
|
|
|
int32_t beam_size = -1;
|
|
|
|
|
|
|
|
|
|
float word_thold = 0.01f;
|
|
|
|
|
float word_thold = 0.01f;
|
|
|
|
|
float entropy_thold = 2.4f;
|
|
|
|
|
float logprob_thold = -1.0f;
|
|
|
|
|
|
|
|
|
|
bool speed_up = false;
|
|
|
|
|
bool translate = false;
|
|
|
|
@ -104,7 +108,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
|
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
|
|
|
|
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
|
|
|
|
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
|
|
|
|
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
|
|
|
|
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
|
|
|
|
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
|
|
|
|
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
|
|
|
@ -136,31 +144,35 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
|
|
|
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
fprintf(stderr, "options:\n");
|
|
|
|
|
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
|
|
|
|
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
|
|
|
|
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
|
|
|
|
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
|
|
|
|
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
|
|
|
|
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
|
|
|
|
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
|
|
|
|
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
|
|
|
|
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
|
|
|
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
|
|
|
|
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
|
|
|
|
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
|
|
|
|
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
|
|
|
|
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
|
|
|
|
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
|
|
|
|
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
|
|
|
|
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
|
|
|
|
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
|
|
|
|
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
|
|
|
|
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
|
|
|
|
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
|
|
|
|
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
|
|
|
|
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
|
|
|
|
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
|
|
|
|
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
|
|
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
|
|
|
|
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
|
|
|
|
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -235,7 +247,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
|
|
|
|
const char * text = whisper_full_get_token_text(ctx, i, j);
|
|
|
|
|
const float p = whisper_full_get_token_p (ctx, i, j);
|
|
|
|
|
|
|
|
|
|
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
|
|
|
|
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
|
|
|
|
|
|
|
|
|
|
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
|
|
|
|
}
|
|
|
|
@ -331,20 +343,19 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
|
|
|
|
|
const int n_segments = whisper_full_n_segments(ctx);
|
|
|
|
|
for (int i = 0; i < n_segments; ++i) {
|
|
|
|
|
const char * text = whisper_full_get_segment_text(ctx, i);
|
|
|
|
|
if (text[0] == ' ')
|
|
|
|
|
text = text + sizeof(char); //whisper_full_get_segment_text() returns a string with leading space, point to the next character.
|
|
|
|
|
if (text[0] == ' ') {
|
|
|
|
|
text = text + sizeof(char); //whisper_full_get_segment_text() returns a string with leading space, point to the next character.
|
|
|
|
|
}
|
|
|
|
|
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
|
|
|
|
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
|
|
|
|
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
|
|
|
|
|
fout << 10 * t0 << ", "
|
|
|
|
|
<< 10 * t1 << ", \""
|
|
|
|
|
<< text << "\"\n";
|
|
|
|
|
|
|
|
|
|
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
|
|
|
|
|
fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text << "\"\n";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// karaoke video generation
|
|
|
|
|
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
|
|
|
|
// TODO: font parameter adjustments
|
|
|
|
@ -620,6 +631,8 @@ int main(int argc, char ** argv) {
|
|
|
|
|
{
|
|
|
|
|
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
|
|
|
|
|
|
|
|
|
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
|
|
|
|
|
|
|
|
|
wparams.print_realtime = false;
|
|
|
|
|
wparams.print_progress = params.print_progress;
|
|
|
|
|
wparams.print_timestamps = !params.no_timestamps;
|
|
|
|
@ -633,12 +646,18 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
|
|
|
|
wparams.thold_pt = params.word_thold;
|
|
|
|
|
wparams.entropy_thold = params.entropy_thold;
|
|
|
|
|
wparams.logprob_thold = params.logprob_thold;
|
|
|
|
|
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
|
|
|
|
|
|
|
|
|
wparams.speed_up = params.speed_up;
|
|
|
|
|
|
|
|
|
|
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
|
|
|
|
|
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
|
|
|
|
|
wparams.greedy.best_of = params.best_of;
|
|
|
|
|
wparams.beam_search.beam_size = params.beam_size;
|
|
|
|
|
wparams.temperature_inc = -1;
|
|
|
|
|
|
|
|
|
|
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
|
|
|
|
|
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
|
|
|
|
|
|
|
|
|
|
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
|
|
|
|
|
|
|
|
|