diff --git a/README.md b/README.md index 5a20073..28e57a6 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ options: -otxt, --output-txt output result in a text file -ovtt, --output-vtt output result in a vtt file -osrt, --output-srt output result in a srt file + -ojson, --output-json output result with confidence in a json file -owts, --output-words output script for generating karaoke video -ps, --print_special print special tokens -pc, --print_colors print colors @@ -314,14 +315,14 @@ to highlight words with high or low confidence: ## Controlling the length of the generated text segments (experimental) -For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`: +For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`: ```java ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16 whisper_model_load: loading model from './models/ggml-base.en.bin' ... -system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | +system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ... @@ -345,11 +346,11 @@ The `--max-len` argument can be used to obtain word-level timestamps. Simply use whisper_model_load: loading model from './models/ggml-base.en.bin' ... -system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | +system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ... 
-[00:00:00.000 --> 00:00:00.320]
+[00:00:00.000 --> 00:00:00.320]
 [00:00:00.320 --> 00:00:00.370] And
 [00:00:00.370 --> 00:00:00.690] so
 [00:00:00.690 --> 00:00:00.850] my
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a1b9825..6f00285 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -65,6 +65,7 @@ struct whisper_params {
     bool output_txt = false;
     bool output_vtt = false;
     bool output_srt = false;
+    bool output_json = false;
     bool output_wts = false;
     bool print_special_tokens = false;
     bool print_colors = false;
@@ -126,6 +127,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
             params.output_srt = true;
         } else if (arg == "-owts" || arg == "--output-words") {
             params.output_wts = true;
+        } else if (arg == "-ojson" || arg == "--output-json") {
+            params.output_json = true;
         } else if (arg == "-ps" || arg == "--print_special") {
             params.print_special_tokens = true;
         } else if (arg == "-pc" || arg == "--print_colors") {
@@ -170,6 +173,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
     fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
     fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
     fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
+    fprintf(stderr, " -ojson, --output-json output result with confidence in a json file\n");
     fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
     fprintf(stderr, " -ps, --print_special print special tokens\n");
     fprintf(stderr, " -pc, --print_colors print colors\n");
@@ -307,6 +311,93 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
         fout << text << "\n\n";
     }
 
+    return true;
+}
+
+// Escape `text` for use inside a JSON string literal: quotes, backslashes and
+// control characters (< 0x20) are escaped as required by RFC 8259, every other
+// byte is copied through unchanged (token text is not validated as UTF-8).
+static std::string json_escape(const char * text) {
+    std::string out;
+    for (const char * p = text; *p != '\0'; ++p) {
+        const unsigned char c = (unsigned char) *p;
+        switch (c) {
+            case '"':  out += "\\\""; break;
+            case '\\': out += "\\\\"; break;
+            case '\n': out += "\\n";  break;
+            case '\r': out += "\\r";  break;
+            case '\t': out += "\\t";  break;
+            default:
+                if (c < 0x20) {
+                    char buf[8];
+                    snprintf(buf, sizeof(buf), "\\u%04x", (unsigned) c);
+                    out += buf;
+                } else {
+                    out += (char) c;
+                }
+                break;
+        }
+    }
+    return out;
+}
+
+// Write the transcription to `fname` as a JSON array holding one object per
+// segment: {"start": t0, "end": t1, "text": [[conf, "token"], ...]}.
+// t0/t1 are the values reported by whisper_full_get_segment_t0/t1 and conf is
+// the cubed token probability scaled to a bucket in [0, k_colors.size() - 1].
+// Returns false when the file cannot be opened and true otherwise.
+bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
+    (void) params; // kept so the call matches the other output_* helpers; not read here
+
+    std::ofstream fout(fname);
+    if (!fout.is_open()) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
+        return false;
+    }
+
+    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
+
+    fout << "[\n";
+
+    const whisper_token token_eot = whisper_token_eot(ctx);
+    const int n_colors = (int) k_colors.size();
+
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+        fout << "{\"start\":" << t0 << ", \"end\": " << t1;
+        fout << ", \"text\": [";
+
+        // The separator is written before every entry except the first, so
+        // skipped tokens can never leave a dangling comma behind.
+        bool first = true;
+        const int n_token = whisper_full_n_tokens(ctx, i);
+        for (int j = 0; j < n_token; ++j) {
+            // ids at or above the EOT id are left out of the output
+            if (whisper_full_get_token_id(ctx, i, j) >= token_eot) {
+                continue;
+            }
+
+            // clamp to the last bucket so that p == 1 does not yield k_colors.size()
+            const float p = whisper_full_get_token_p(ctx, i, j);
+            const int conf = std::max(0, std::min(n_colors - 1, (int) (std::pow(p, 3)*float(n_colors))));
+
+            if (!first) {
+                fout << ",";
+            }
+            first = false;
+            fout << "[" << conf << ", \"" << json_escape(whisper_full_get_token_text(ctx, i, j)) << "\"]";
+        }
+
+        fout << "]}";
+        if (i != n_segments - 1) {
+            fout << ",";
+        }
+        fout << "\n";
+    }
+    fout << "]";
+
     return true;
 }
 
@@ -603,11 +694,17 @@ int main(int argc, char ** argv) {
                 output_srt(ctx, fname_srt.c_str(), params);
             }
 
+            // output to JSON file
+            if (params.output_json) {
+                const auto fname_json = fname_inp + ".json";
+                output_json(ctx, fname_json.c_str(), params);
+            }
+
             // output to WTS file
             if (params.output_wts) {
                 const auto fname_wts = fname_inp + ".wts";
                 output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
             }
         }
     }
 