|
|
|
@ -84,6 +84,7 @@ struct whisper_params {
|
|
|
|
|
std::string model = "models/ggml-base.en.bin";
|
|
|
|
|
|
|
|
|
|
std::vector<std::string> fname_inp = {};
|
|
|
|
|
std::vector<std::string> fname_outp = {};
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
|
@ -121,6 +122,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
|
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
|
|
|
|
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
|
|
|
|
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
|
|
|
|
else if (arg == "-of" || arg == "--output-file") {
    // This option consumes the next argv entry as its value. Check that one
    // exists before advancing i, so a trailing "-of" does not read past the
    // supplied arguments.
    if (i + 1 >= argc) {
        fprintf(stderr, "error: missing value for argument: %s\n", argv[i]);
        // NOTE(review): assumes a false return signals a parse failure to the caller - confirm.
        return false;
    }
    params.fname_outp.emplace_back(argv[++i]);
}
|
|
|
|
|
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
|
|
|
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
|
|
|
|
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
|
|
|
@ -144,35 +146,36 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
|
|
|
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
fprintf(stderr, "options:\n");
|
|
|
|
|
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
|
|
|
|
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
|
|
|
|
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
|
|
|
|
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
|
|
|
|
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
|
|
|
|
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
|
|
|
|
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
|
|
|
|
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
|
|
|
|
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
|
|
|
|
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
|
|
|
|
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
|
|
|
|
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
|
|
|
|
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
|
|
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
|
|
|
|
// Print the flag's actual value, like every other boolean option in this listing
// (the ternary was previously inverted, so the flag's value was shown negated).
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
|
|
|
|
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
|
|
|
|
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
|
|
|
|
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
|
|
|
|
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
|
|
|
|
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
|
|
|
|
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
|
|
|
|
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
|
|
|
|
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
|
|
|
|
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
|
|
|
|
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
|
|
|
|
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
|
|
|
|
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
|
|
|
|
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
|
|
|
|
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
|
|
|
|
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
|
|
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
|
|
|
|
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
|
|
|
|
// Print the flag's actual value, like every other boolean option in this listing
// (the ternary was previously inverted, so the flag's value was shown negated).
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
|
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
|
|
|
|
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
|
|
|
|
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -514,6 +517,7 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
|
|
|
|
const auto fname_inp = params.fname_inp[f];
|
|
|
|
|
// Pick the output base name for input f: the matching -of value when one was
// supplied and is non-empty, otherwise the input path itself.
// The size is cast to int so the comparison with the signed loop index f is not
// a mixed signed/unsigned comparison (same cast as on the loop bound).
const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
|
|
|
|
|
|
|
|
|
|
std::vector<float> pcmf32; // mono-channel F32 PCM
|
|
|
|
|
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
|
|
|
@ -692,31 +696,31 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
// output to text file
|
|
|
|
|
if (params.output_txt) {
|
|
|
|
|
const auto fname_txt = fname_inp + ".txt";
|
|
|
|
|
const auto fname_txt = fname_outp + ".txt";
|
|
|
|
|
output_txt(ctx, fname_txt.c_str());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// output to VTT file
|
|
|
|
|
if (params.output_vtt) {
|
|
|
|
|
const auto fname_vtt = fname_inp + ".vtt";
|
|
|
|
|
const auto fname_vtt = fname_outp + ".vtt";
|
|
|
|
|
output_vtt(ctx, fname_vtt.c_str());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// output to SRT file
|
|
|
|
|
if (params.output_srt) {
|
|
|
|
|
const auto fname_srt = fname_inp + ".srt";
|
|
|
|
|
const auto fname_srt = fname_outp + ".srt";
|
|
|
|
|
output_srt(ctx, fname_srt.c_str(), params);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// output to WTS file
|
|
|
|
|
if (params.output_wts) {
|
|
|
|
|
const auto fname_wts = fname_inp + ".wts";
|
|
|
|
|
const auto fname_wts = fname_outp + ".wts";
|
|
|
|
|
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// output to CSV file
|
|
|
|
|
if (params.output_csv) {
|
|
|
|
|
const auto fname_csv = fname_inp + ".csv";
|
|
|
|
|
const auto fname_csv = fname_outp + ".csv";
|
|
|
|
|
output_csv(ctx, fname_csv.c_str());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|