|
|
|
@ -321,12 +321,11 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// word-level timestamps (experimental)
|
|
|
|
|
// TODO: probably still has bugs, needs refactoring, etc.
|
|
|
|
|
// TODO: auto threshold
|
|
|
|
|
// TODO: make ffmpeg output optional
|
|
|
|
|
// TODO: extra pass to detect unused speech and assign it to tokens
|
|
|
|
|
// TODO: font parameter adjustments
|
|
|
|
|
// TODO: move to whisper.h/whisper.cpp and add a parameter to select the max line length of subtitles
|
|
|
|
|
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
|
|
|
|
|
if (params.output_wts) {
|
|
|
|
|
std::vector<float> pcm_avg(pcmf32.size(), 0);
|
|
|
|
|
|
|
|
|
|
// average the fabs of the signal
|
|
|
|
@ -421,7 +420,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
tokens[j].ptsum = token.ptsum;
|
|
|
|
|
|
|
|
|
|
tokens[j].text = whisper_token_to_str(ctx, token.id);
|
|
|
|
|
//tokens[j].vlen = tokens[j].pt;
|
|
|
|
|
tokens[j].vlen = voice_length(tokens[j].text);
|
|
|
|
|
|
|
|
|
|
if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
|
|
|
|
@ -439,6 +437,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
|
|
|
|
|
t_last = t1;
|
|
|
|
|
|
|
|
|
|
// find intervals of tokens with unknown timestamps
|
|
|
|
|
// fill the timestamps by proportionally splitting the interval based on the token voice lengths
|
|
|
|
|
{
|
|
|
|
|
int p0 = 0;
|
|
|
|
|
int p1 = 0;
|
|
|
|
|
while (true) {
|
|
|
|
@ -460,10 +461,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
|
|
|
|
|
const double dt = tokens[p1].t1 - tokens[p0].t0;
|
|
|
|
|
|
|
|
|
|
// split the time proportionally to the voice length
|
|
|
|
|
for (int j = p0 + 1; j <= p1; j++) {
|
|
|
|
|
const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
|
|
|
|
|
//const double ct = tokens[j - 1].t0 + (dt*(j - p0))/(p1 - p0 + 1);
|
|
|
|
|
//const double ct = tokens[p0].t0 + (dt*(j - p0))/(p1 - p0 + 1);
|
|
|
|
|
|
|
|
|
|
tokens[j - 1].t1 = ct;
|
|
|
|
|
tokens[j ].t0 = ct;
|
|
|
|
@ -476,7 +476,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// fix up (just in case)
|
|
|
|
|
for (int j = 0; j < n - 1; j++) {
|
|
|
|
|
if (tokens[j].t1 < 0) {
|
|
|
|
|
tokens[j + 1].t0 = tokens[j].t1;
|
|
|
|
@ -494,6 +496,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// VAD
|
|
|
|
|
// expand or contract tokens based on voice activity
|
|
|
|
|
{
|
|
|
|
|
const int hw = WHISPER_SAMPLE_RATE/8;
|
|
|
|
|
|
|
|
|
@ -565,6 +568,8 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// fixed token expand (optional)
|
|
|
|
|
{
|
|
|
|
|
const int t_expand = 0;
|
|
|
|
|
|
|
|
|
|
for (int j = 0; j < n; j++) {
|
|
|
|
@ -575,7 +580,10 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
tokens[j].t1 = tokens[j].t1 + t_expand;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// debug info
|
|
|
|
|
// TODO: toggle via parameter
|
|
|
|
|
for (int j = 0; j < n; ++j) {
|
|
|
|
|
const auto & token = tokens[j];
|
|
|
|
|
const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
|
|
|
|
@ -591,6 +599,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
//fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO: turn these into parameters
|
|
|
|
|
static const int line_wrap = 60;
|
|
|
|
|
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
|
|
|
|
|
|
|
|
@ -686,7 +695,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
|
|
|
fout.close();
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|