|
|
@ -36,7 +36,8 @@ struct whisper_params {
|
|
|
|
|
|
|
|
|
|
|
|
std::string language = "en";
|
|
|
|
std::string language = "en";
|
|
|
|
std::string model = "models/ggml-base.en.bin";
|
|
|
|
std::string model = "models/ggml-base.en.bin";
|
|
|
|
std::string fname_inp = "samples/jfk.wav";
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::string> fname_inp = {};
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
|
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
@ -45,6 +46,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
|
|
std::string arg = argv[i];
|
|
|
|
std::string arg = argv[i];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (arg[0] != '-') {
|
|
|
|
|
|
|
|
params.fname_inp.push_back(arg);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (arg == "-s" || arg == "--seed") {
|
|
|
|
if (arg == "-s" || arg == "--seed") {
|
|
|
|
params.seed = std::stoi(argv[++i]);
|
|
|
|
params.seed = std::stoi(argv[++i]);
|
|
|
|
} else if (arg == "-t" || arg == "--threads") {
|
|
|
|
} else if (arg == "-t" || arg == "--threads") {
|
|
|
@ -67,7 +73,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
} else if (arg == "-m" || arg == "--model") {
|
|
|
|
} else if (arg == "-m" || arg == "--model") {
|
|
|
|
params.model = argv[++i];
|
|
|
|
params.model = argv[++i];
|
|
|
|
} else if (arg == "-f" || arg == "--file") {
|
|
|
|
} else if (arg == "-f" || arg == "--file") {
|
|
|
|
params.fname_inp = argv[++i];
|
|
|
|
params.fname_inp.push_back(argv[++i]);
|
|
|
|
} else if (arg == "-h" || arg == "--help") {
|
|
|
|
} else if (arg == "-h" || arg == "--help") {
|
|
|
|
whisper_print_usage(argc, argv, params);
|
|
|
|
whisper_print_usage(argc, argv, params);
|
|
|
|
exit(0);
|
|
|
|
exit(0);
|
|
|
@ -83,7 +89,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
|
|
|
|
|
|
|
|
|
|
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
|
|
|
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
|
|
|
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
fprintf(stderr, "options:\n");
|
|
|
|
fprintf(stderr, "options:\n");
|
|
|
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
|
|
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
|
|
@ -95,7 +101,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
|
|
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
|
|
|
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
|
|
|
|
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
|
|
|
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
|
|
|
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path (default: %s)\n", params.fname_inp.c_str());
|
|
|
|
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -110,32 +116,41 @@ int main(int argc, char ** argv) {
|
|
|
|
params.seed = time(NULL);
|
|
|
|
params.seed = time(NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (params.fname_inp.empty()) {
|
|
|
|
|
|
|
|
fprintf(stderr, "error: no input files specified\n");
|
|
|
|
|
|
|
|
whisper_print_usage(argc, argv, params);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// whisper init
|
|
|
|
// whisper init
|
|
|
|
|
|
|
|
|
|
|
|
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
|
|
|
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
|
|
|
|
|
|
|
const auto fname_inp = params.fname_inp[f];
|
|
|
|
|
|
|
|
|
|
|
|
// WAV input
|
|
|
|
// WAV input
|
|
|
|
std::vector<float> pcmf32;
|
|
|
|
std::vector<float> pcmf32;
|
|
|
|
{
|
|
|
|
{
|
|
|
|
drwav wav;
|
|
|
|
drwav wav;
|
|
|
|
if (!drwav_init_file(&wav, params.fname_inp.c_str(), NULL)) {
|
|
|
|
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
|
|
|
|
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], params.fname_inp.c_str());
|
|
|
|
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
|
|
|
|
whisper_print_usage(argc, argv, {});
|
|
|
|
whisper_print_usage(argc, argv, {});
|
|
|
|
return 2;
|
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (wav.channels != 1 && wav.channels != 2) {
|
|
|
|
if (wav.channels != 1 && wav.channels != 2) {
|
|
|
|
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], params.fname_inp.c_str());
|
|
|
|
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
|
|
|
|
return 3;
|
|
|
|
return 3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
|
|
|
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
|
|
|
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], params.fname_inp.c_str());
|
|
|
|
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
|
|
|
|
return 4;
|
|
|
|
return 4;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (wav.bitsPerSample != 16) {
|
|
|
|
if (wav.bitsPerSample != 16) {
|
|
|
|
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], params.fname_inp.c_str());
|
|
|
|
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
|
|
|
|
return 5;
|
|
|
|
return 5;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -169,8 +184,8 @@ int main(int argc, char ** argv) {
|
|
|
|
printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
|
|
|
printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf("%s: processing %d samples (%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
|
|
|
printf("%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
|
|
|
__func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
|
|
|
|
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
|
|
|
|
params.language.c_str(),
|
|
|
|
params.language.c_str(),
|
|
|
|
params.translate ? "translate" : "transcribe",
|
|
|
|
params.translate ? "translate" : "transcribe",
|
|
|
|
params.no_timestamps ? 0 : 1);
|
|
|
|
params.no_timestamps ? 0 : 1);
|
|
|
@ -214,6 +229,7 @@ int main(int argc, char ** argv) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
whisper_print_timings(ctx);
|
|
|
|
whisper_print_timings(ctx);
|
|
|
|
whisper_free(ctx);
|
|
|
|
whisper_free(ctx);
|
|
|
|