output json with confidence

pull/179/head
Johan Ju 3 years ago
parent 37422ed733
commit b215f692dc

@ -112,6 +112,7 @@ options:
-otxt, --output-txt output result in a text file -otxt, --output-txt output result in a text file
-ovtt, --output-vtt output result in a vtt file -ovtt, --output-vtt output result in a vtt file
-osrt, --output-srt output result in a srt file -osrt, --output-srt output result in a srt file
-ojson, --output-json output result with confidence in a json file
-owts, --output-words output script for generating karaoke video -owts, --output-words output script for generating karaoke video
-ps, --print_special print special tokens -ps, --print_special print special tokens
-pc, --print_colors print colors -pc, --print_colors print colors
@ -314,14 +315,14 @@ to highlight words with high or low confidence:
## Controlling the length of the generated text segments (experimental) ## Controlling the length of the generated text segments (experimental)
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`: For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
```java ```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16 ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
whisper_model_load: loading model from './models/ggml-base.en.bin' whisper_model_load: loading model from './models/ggml-base.en.bin'
... ...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ... main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -345,11 +346,11 @@ The `--max-len` argument can be used to obtain word-level timestamps. Simply use
whisper_model_load: loading model from './models/ggml-base.en.bin' whisper_model_load: loading model from './models/ggml-base.en.bin'
... ...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ... main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
[00:00:00.000 --> 00:00:00.320] [00:00:00.000 --> 00:00:00.320]
[00:00:00.320 --> 00:00:00.370] And [00:00:00.320 --> 00:00:00.370] And
[00:00:00.370 --> 00:00:00.690] so [00:00:00.370 --> 00:00:00.690] so
[00:00:00.690 --> 00:00:00.850] my [00:00:00.690 --> 00:00:00.850] my

@ -65,6 +65,7 @@ struct whisper_params {
bool output_txt = false; bool output_txt = false;
bool output_vtt = false; bool output_vtt = false;
bool output_srt = false; bool output_srt = false;
bool output_json = false;
bool output_wts = false; bool output_wts = false;
bool print_special_tokens = false; bool print_special_tokens = false;
bool print_colors = false; bool print_colors = false;
@ -126,6 +127,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
params.output_srt = true; params.output_srt = true;
} else if (arg == "-owts" || arg == "--output-words") { } else if (arg == "-owts" || arg == "--output-words") {
params.output_wts = true; params.output_wts = true;
} else if (arg == "-ojson" || arg == "--output-json") {
params.output_json = true;
} else if (arg == "-ps" || arg == "--print_special") { } else if (arg == "-ps" || arg == "--print_special") {
params.print_special_tokens = true; params.print_special_tokens = true;
} else if (arg == "-pc" || arg == "--print_colors") { } else if (arg == "-pc" || arg == "--print_colors") {
@ -170,6 +173,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, " -otxt, --output-txt output result in a text file\n"); fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n"); fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
fprintf(stderr, " -osrt, --output-srt output result in a srt file\n"); fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
fprintf(stderr, " -ojson, --output-json output result with confidence in a json file\n");
fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n"); fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
fprintf(stderr, " -ps, --print_special print special tokens\n"); fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -pc, --print_colors print colors\n"); fprintf(stderr, " -pc, --print_colors print colors\n");
@ -307,6 +311,67 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
fout << text << "\n\n"; fout << text << "\n\n";
} }
return true;
}
// Returns a copy of `text` that is safe to place between double quotes in a
// JSON document: quotes, backslashes and control characters (< 0x20) are
// escaped; every other byte is copied through unchanged.
static std::string output_json_escape(const char * text) {
    std::string out;
    if (text == nullptr) {
        return out;
    }

    for (const char * p = text; *p != '\0'; ++p) {
        // compare as unsigned so bytes >= 0x80 are not mistaken for control characters
        const unsigned char c = static_cast<unsigned char>(*p);
        switch (c) {
            case '"':  out += "\\\""; break;
            case '\\': out += "\\\\"; break;
            case '\b': out += "\\b";  break;
            case '\f': out += "\\f";  break;
            case '\n': out += "\\n";  break;
            case '\r': out += "\\r";  break;
            case '\t': out += "\\t";  break;
            default:
                if (c < 0x20) {
                    char buf[8];
                    snprintf(buf, sizeof(buf), "\\u%04x", (unsigned) c);
                    out += buf;
                } else {
                    out += static_cast<char>(c);
                }
                break;
        }
    }

    return out;
}

// Writes the transcription held in `ctx` to `fname` as a JSON array.
//
// Each segment becomes one object:
//   {"start": <t0>, "end": <t1>, "text": [[<conf>, "<token text>"], ...]}
// where t0/t1 are the values returned by whisper_full_get_segment_t0/t1 and
// <conf> is the token probability cubed and scaled by k_colors.size().
// Tokens whose id is >= whisper_token_eot(ctx) are left out.
//
// `params` is not consulted; it is kept so existing callers keep compiling.
//
// Returns false if the file could not be opened for writing, true otherwise.
bool output_json(struct whisper_context * ctx, const char * fname, whisper_params params) {
    (void) params;

    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    fout << "[\n";

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        fout << "{\"start\":" << t0 << ", \"end\": " << t1;
        fout << ", \"text\": [";

        // Tracks whether an entry was already written for this segment, so the
        // separator stays correct even when special tokens are skipped
        // (including at the end of the segment) or the segment has no tokens.
        bool wrote_token = false;

        const int n_token = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_token; ++j) {
            const whisper_token id = whisper_full_get_token_id(ctx, i, j);
            if (id >= whisper_token_eot(ctx)) {
                continue;
            }

            const std::string esc = output_json_escape(whisper_full_get_token_text(ctx, i, j));

            const float p    = whisper_full_get_token_p(ctx, i, j);
            const int   conf = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));

            if (wrote_token) {
                fout << ",";
            }
            fout << "[" << conf << ", \"" << esc << "\"]";
            wrote_token = true;
        }

        fout << "]}";
        if (i != n_segments - 1) {
            fout << ",";
        }
        fout << "\n";
    }

    fout << "]";

    return true;
}
@ -603,11 +668,18 @@ int main(int argc, char ** argv) {
output_srt(ctx, fname_srt.c_str(), params); output_srt(ctx, fname_srt.c_str(), params);
} }
// output to JSON file
if (params.output_json) {
const auto fname_json = fname_inp + ".json";
output_json(ctx, fname_json.c_str(), params);
}
// output to WTS file // output to WTS file
if (params.output_wts) { if (params.output_wts) {
const auto fname_wts = fname_inp + ".wts"; const auto fname_wts = fname_inp + ".wts";
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE); output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
} }
} }
} }

Loading…
Cancel
Save