@ -36,6 +36,7 @@ std::string to_timestamp(int64_t t, bool comma = false) {
return std : : string ( buf ) ;
}
// helper function to replace substrings
void replace_all ( std : : string & s , const std : : string & search , const std : : string & replace ) {
for ( size_t pos = 0 ; ; pos + = replace . length ( ) ) {
pos = s . find ( search , pos ) ;
@ -45,31 +46,6 @@ void replace_all(std::string & s, const std::string & search, const std::string
}
}
// a cost-function that is high for text that takes longer to pronounce
//
// Walks `text` one character at a time and adds a fixed weight per character:
//   ' '            -> 0.01
//   ','            -> 2.00
//   '.', '!', '?'  -> 3.00
//   '0' .. '9'     -> 3.00
//   anything else  -> 1.00
//
// text: the string to score
// returns: the accumulated weight (0.0f for an empty string)
float voice_length(const std::string & text) {
    float res = 0.0f;

    for (const char c : text) {
        // every literal below must be a single character: a padded literal such
        // as ' , ' is a multi-character constant of type int and never equals a char
        if (c == ' ') {
            res += 0.01f;
        } else if (c == ',') {
            res += 2.00f;
        } else if (c == '.' || c == '!' || c == '?') {
            res += 3.00f;
        } else if (c >= '0' && c <= '9') {
            res += 3.00f;
        } else {
            res += 1.00f;
        }
    }

    return res;
}
// command-line parameters
struct whisper_params {
int32_t seed = - 1 ; // RNG seed, not used currently
@ -78,6 +54,7 @@ struct whisper_params {
int32_t offset_t_ms = 0 ;
int32_t offset_n = 0 ;
int32_t max_context = - 1 ;
int32_t max_len = 0 ;
float word_thold = 0.01f ;
@ -120,6 +97,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
params . offset_n = std : : stoi ( argv [ + + i ] ) ;
} else if ( arg = = " -mc " | | arg = = " --max-context " ) {
params . max_context = std : : stoi ( argv [ + + i ] ) ;
} else if ( arg = = " -ml " | | arg = = " --max-len " ) {
params . max_len = std : : stoi ( argv [ + + i ] ) ;
} else if ( arg = = " -wt " | | arg = = " --word-thold " ) {
params . word_thold = std : : stof ( argv [ + + i ] ) ;
} else if ( arg = = " -v " | | arg = = " --verbose " ) {
@ -176,13 +155,14 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf ( stderr , " -ot N, --offset-t N time offset in milliseconds (default: %d) \n " , params . offset_t_ms ) ;
fprintf ( stderr , " -on N, --offset-n N segment index offset (default: %d) \n " , params . offset_n ) ;
fprintf ( stderr , " -mc N, --max-context N maximum number of text context tokens to store (default: max) \n " ) ;
fprintf ( stderr , " -ml N, --max-len N maximum segment length in characters (default: %d) \n " , params . max_len ) ;
fprintf ( stderr , " -wt N, --word-thold N word timestamp probability threshold (default: %f) \n " , params . word_thold ) ;
fprintf ( stderr , " -v, --verbose verbose output \n " ) ;
fprintf ( stderr , " --translate translate from source language to english \n " ) ;
fprintf ( stderr , " -otxt, --output-txt output result in a text file \n " ) ;
fprintf ( stderr , " -ovtt, --output-vtt output result in a vtt file \n " ) ;
fprintf ( stderr , " -osrt, --output-srt output result in a srt file \n " ) ;
fprintf ( stderr , " -owts, --output-words output word-level timestamps to a text file \n " ) ;
fprintf ( stderr , " -owts, --output-words output script for generating karaoke video \n " ) ;
fprintf ( stderr , " -ps, --print_special print special tokens \n " ) ;
fprintf ( stderr , " -pc, --print_colors print colors \n " ) ;
fprintf ( stderr , " -nt, --no_timestamps do not print timestamps \n " ) ;
@ -192,65 +172,67 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf ( stderr , " \n " ) ;
}
// Segment callback: prints transcription segments to stdout with printf.
//
// NOTE(review): this span looks like a flattened diff. Two revisions appear
// interleaved, so there are two signatures and several statements occur twice:
//   - one without n_new, which prints only segment index n_segments - 1
//   - one with n_new, which loops over segment indices s0 .. n_segments - 1
// Confirm which lines are live before changing any logic here.
//
// ctx       - context handed to the whisper_full_* / whisper_token_* accessors
// n_new     - (second signature only) subtracted from n_segments to get the
//             first segment index to print
// user_data - cast to whisper_params *; its no_timestamps, print_colors and
//             print_special_tokens fields select the output format
void whisper_print_segment_callback ( struct whisper_context * ctx , void * user_data ) {
void whisper_print_segment_callback ( struct whisper_context * ctx , int n_new , void * user_data ) {
const whisper_params & params = * ( whisper_params * ) user_data ;
const int n_segments = whisper_full_n_segments ( ctx ) ;
// print the last segment
const int i = n_segments - 1 ;
if ( i = = 0 ) {
// print the last n_new segments
const int s0 = n_segments - n_new ;
// an extra line is printed first when output starts at segment index 0
if ( s0 = = 0 ) {
printf ( " \n " ) ;
}
// no_timestamps: text only, followed by a flush of stdout
if ( params . no_timestamps ) {
if ( params . print_colors ) {
for ( int j = 0 ; j < whisper_full_n_tokens ( ctx , i ) ; + + j ) {
// unless print_special_tokens is set, skip tokens whose id is >= whisper_token_eot(ctx)
if ( params . print_special_tokens = = false ) {
const whisper_token id = whisper_full_get_token_id ( ctx , i , j ) ;
if ( id > = whisper_token_eot ( ctx ) ) {
continue ;
for ( int i = s0 ; i < n_segments ; i + + ) {
if ( params . no_timestamps ) {
if ( params . print_colors ) {
for ( int j = 0 ; j < whisper_full_n_tokens ( ctx , i ) ; + + j ) {
if ( params . print_special_tokens = = false ) {
const whisper_token id = whisper_full_get_token_id ( ctx , i , j ) ;
if ( id > = whisper_token_eot ( ctx ) ) {
continue ;
}
}
}
const char * text = whisper_full_get_token_text ( ctx , i , j ) ;
const float p = whisper_full_get_token_p ( ctx , i , j ) ;
const char * text = whisper_full_get_token_text ( ctx , i , j ) ;
const float p = whisper_full_get_token_p ( ctx , i , j ) ;
// col indexes k_colors and is derived from p cubed, scaled by k_colors.size()
// NOTE(review): the upper clamp is k_colors.size(), not size() - 1; if
// pow(p, 3) * size() reaches size(), col would be one past the last
// element. Confirm against the definition of k_colors.
const int col = std : : max ( 0 , std : : min ( ( int ) k_colors . size ( ) , ( int ) ( std : : pow ( p , 3 ) * float ( k_colors . size ( ) ) ) ) ) ;
const int col = std : : max ( 0 , std : : min ( ( int ) k_colors . size ( ) , ( int ) ( std : : pow ( p , 3 ) * float ( k_colors . size ( ) ) ) ) ) ;
// prints the color entry, the token text, then an escape string
// (presumably the terminal color reset; verify)
printf ( " %s%s%s " , k_colors [ col ] . c_str ( ) , text , " \033 [0m " ) ;
printf ( " %s%s%s " , k_colors [ col ] . c_str ( ) , text , " \033 [0m " ) ;
}
} else {
// without colors the whole segment text is printed in one call
const char * text = whisper_full_get_segment_text ( ctx , i ) ;
printf ( " %s " , text ) ;
}
fflush ( stdout ) ;
} else {
const char * text = whisper_full_get_segment_text ( ctx , i ) ;
printf ( " %s " , text ) ;
}
fflush ( stdout ) ;
} else {
// timestamps enabled: t0 / t1 come from the segment accessors and are
// formatted with to_timestamp() inside a "[t0 --> t1]" prefix
const int64_t t0 = whisper_full_get_segment_t0 ( ctx , i ) ;
const int64_t t1 = whisper_full_get_segment_t1 ( ctx , i ) ;
if ( params . print_colors ) {
printf ( " [%s --> %s] " , to_timestamp ( t0 ) . c_str ( ) , to_timestamp ( t1 ) . c_str ( ) ) ;
for ( int j = 0 ; j < whisper_full_n_tokens ( ctx , i ) ; + + j ) {
if ( params . print_special_tokens = = false ) {
const whisper_token id = whisper_full_get_token_id ( ctx , i , j ) ;
if ( id > = whisper_token_eot ( ctx ) ) {
continue ;
const int64_t t0 = whisper_full_get_segment_t0 ( ctx , i ) ;
const int64_t t1 = whisper_full_get_segment_t1 ( ctx , i ) ;
if ( params . print_colors ) {
printf ( " [%s --> %s] " , to_timestamp ( t0 ) . c_str ( ) , to_timestamp ( t1 ) . c_str ( ) ) ;
for ( int j = 0 ; j < whisper_full_n_tokens ( ctx , i ) ; + + j ) {
if ( params . print_special_tokens = = false ) {
const whisper_token id = whisper_full_get_token_id ( ctx , i , j ) ;
if ( id > = whisper_token_eot ( ctx ) ) {
continue ;
}
}
}
const char * text = whisper_full_get_token_text ( ctx , i , j ) ;
const float p = whisper_full_get_token_p ( ctx , i , j ) ;
const char * text = whisper_full_get_token_text ( ctx , i , j ) ;
const float p = whisper_full_get_token_p ( ctx , i , j ) ;
// same probability-to-color mapping as in the no-timestamps path
// (the same NOTE(review) about the upper clamp applies)
const int col = std : : max ( 0 , std : : min ( ( int ) k_colors . size ( ) , ( int ) ( std : : pow ( p , 3 ) * float ( k_colors . size ( ) ) ) ) ) ;
const int col = std : : max ( 0 , std : : min ( ( int ) k_colors . size ( ) , ( int ) ( std : : pow ( p , 3 ) * float ( k_colors . size ( ) ) ) ) ) ;
printf ( " %s%s%s " , k_colors [ col ] . c_str ( ) , text , " \033 [0m " ) ;
}
// the colored token run is ended with a separate newline printf
printf ( " \n " ) ;
} else {
const char * text = whisper_full_get_segment_text ( ctx , i ) ;
printf ( " %s%s%s " , k_colors [ col ] . c_str ( ) , text , " \033 [0m " ) ;
}
printf ( " \n " ) ;
} else {
// plain output: timestamp prefix and the whole segment text in one printf
const char * text = whisper_full_get_segment_text ( ctx , i ) ;
printf ( " [%s --> %s] %s \n " , to_timestamp ( t0 ) . c_str ( ) , to_timestamp ( t1 ) . c_str ( ) , text ) ;
printf ( " [%s --> %s] %s \n " , to_timestamp ( t0 ) . c_str ( ) , to_timestamp ( t1 ) . c_str ( ) , text ) ;
}
}
}
}
@ -320,297 +302,41 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
return true ;
}
// word-level timestamps (experimental)
// TODO: make ffmpeg output optional
// TODO: extra pass to detect unused speech and assign to tokens
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
// TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
bool output_wts ( struct whisper_context * ctx , const char * fname , const char * fname_inp , const whisper_params & params , const std : : vector < float > & pcmf32 ) {
std : : vector < float > pcm_avg ( pcmf32 . size ( ) , 0 ) ;
// average the fabs of the signal
{
const int hw = 32 ;
for ( int i = 0 ; i < pcmf32 . size ( ) ; i + + ) {
float sum = 0 ;
for ( int j = - hw ; j < = hw ; j + + ) {
if ( i + j > = 0 & & i + j < pcmf32 . size ( ) ) {
sum + = fabs ( pcmf32 [ i + j ] ) ;
}
}
pcm_avg [ i ] = sum / ( 2 * hw + 1 ) ;
}
}
struct token_info {
int64_t t0 = - 1 ;
int64_t t1 = - 1 ;
int64_t tt0 = - 1 ;
int64_t tt1 = - 1 ;
whisper_token id ;
whisper_token tid ;
float p = 0.0f ;
float pt = 0.0f ;
float ptsum = 0.0f ;
std : : string text ;
float vlen = 0.0f ; // voice length of this token
} ;
int64_t t_beg = 0 ;
int64_t t_last = 0 ;
whisper_token tid_last = 0 ;
bool output_wts ( struct whisper_context * ctx , const char * fname , const char * fname_inp , const whisper_params & params , float t_sec ) {
std : : ofstream fout ( fname ) ;
fprintf ( stderr , " %s: saving output to '%s' \n " , __func__ , fname ) ;
// TODO: become parameter
static const char * font = " /System/Library/Fonts/Supplemental/Courier New Bold.ttf " ;
fout < < " !/bin/bash " < < " \n " ;
fout < < " \n " ;
fout < < " ffmpeg -i " < < fname_inp < < " -f lavfi -i color=size=1200x120:duration= " < < float ( pcmf32 . size ( ) + 1000 ) / WHISPER_SAMPLE_RATE < < " :rate=25:color=black -vf \" " ;
bool is_first = true ;
fout < < " ffmpeg -i " < < fname_inp < < " -f lavfi -i color=size=1200x120:duration= " < < t_sec < < " :rate=25:color=black -vf \" " ;
for ( int i = 0 ; i < whisper_full_n_segments ( ctx ) ; i + + ) {
const int64_t t0 = whisper_full_get_segment_t0 ( ctx , i ) ;
const int64_t t1 = whisper_full_get_segment_t1 ( ctx , i ) ;
const char * text = whisper_full_get_segment_text ( ctx , i ) ;
const int s0 = std : : max ( 0 , ( int ) ( t0 * WHISPER_SAMPLE_RATE / 100 ) ) ;
const int s1 = std : : min ( ( int ) pcmf32 . size ( ) , ( int ) ( t1 * WHISPER_SAMPLE_RATE / 100 ) ) ;
const int n = whisper_full_n_tokens ( ctx , i ) ;
std : : vector < token_info > tokens ( n ) ;
if ( n < = 1 ) {
continue ;
}
std : : vector < whisper_token_data > tokens ( n ) ;
for ( int j = 0 ; j < n ; + + j ) {
struct whisper_token_data token = whisper_full_get_token_data ( ctx , i , j ) ;
if ( j = = 0 ) {
if ( token . id = = whisper_token_beg ( ctx ) ) {
tokens [ j ] . t0 = t0 ;
tokens [ j ] . t1 = t0 ;
tokens [ j + 1 ] . t0 = t0 ;
t_beg = t0 ;
t_last = t0 ;
tid_last = whisper_token_beg ( ctx ) ;
} else {
tokens [ j ] . t0 = t_last ;
}
}
const int64_t tt = t_beg + 2 * ( token . tid - whisper_token_beg ( ctx ) ) ;
tokens [ j ] . id = token . id ;
tokens [ j ] . tid = token . tid ;
tokens [ j ] . p = token . p ;
tokens [ j ] . pt = token . pt ;
tokens [ j ] . ptsum = token . ptsum ;
tokens [ j ] . text = whisper_token_to_str ( ctx , token . id ) ;
tokens [ j ] . vlen = voice_length ( tokens [ j ] . text ) ;
if ( token . pt > params . word_thold & & token . ptsum > 0.01 & & token . tid > tid_last & & tt < = t1 ) {
if ( j > 0 ) {
tokens [ j - 1 ] . t1 = tt ;
}
tokens [ j ] . t0 = tt ;
tid_last = token . tid ;
}
tokens [ j ] = whisper_full_get_token_data ( ctx , i , j ) ;
}
tokens [ n - 2 ] . t1 = t1 ;
tokens [ n - 1 ] . t0 = t1 ;
tokens [ n - 1 ] . t1 = t1 ;
t_last = t1 ;
// find intervals of tokens with unknown timestamps
// fill the timestamps by proportionally splitting the interval based on the token voice lengths
{
int p0 = 0 ;
int p1 = 0 ;
while ( true ) {
while ( p1 < n & & tokens [ p1 ] . t1 < 0 ) {
p1 + + ;
}
if ( p1 > = n ) {
p1 - - ;
}
if ( p1 > p0 ) {
double psum = 0.0 ;
for ( int j = p0 ; j < = p1 ; j + + ) {
psum + = tokens [ j ] . vlen ;
}
//printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
const double dt = tokens [ p1 ] . t1 - tokens [ p0 ] . t0 ;
// split the time proportionally to the voice length
for ( int j = p0 + 1 ; j < = p1 ; j + + ) {
const double ct = tokens [ j - 1 ] . t0 + dt * tokens [ j - 1 ] . vlen / psum ;
tokens [ j - 1 ] . t1 = ct ;
tokens [ j ] . t0 = ct ;
}
}
p1 + + ;
p0 = p1 ;
if ( p1 > = n ) {
break ;
}
}
}
// fix up (just in case)
for ( int j = 0 ; j < n - 1 ; j + + ) {
if ( tokens [ j ] . t1 < 0 ) {
tokens [ j + 1 ] . t0 = tokens [ j ] . t1 ;
}
if ( j > 0 ) {
if ( tokens [ j - 1 ] . t1 > tokens [ j ] . t0 ) {
tokens [ j ] . t0 = tokens [ j - 1 ] . t1 ;
tokens [ j ] . t1 = std : : max ( tokens [ j ] . t0 , tokens [ j ] . t1 ) ;
}
}
tokens [ j ] . tt0 = tokens [ j ] . t0 ;
tokens [ j ] . tt1 = tokens [ j ] . t1 ;
}
// VAD
// expand or contract tokens based on voice activity
{
const int hw = WHISPER_SAMPLE_RATE / 8 ;
for ( int j = 0 ; j < n ; j + + ) {
if ( tokens [ j ] . id > = whisper_token_eot ( ctx ) ) {
continue ;
}
const int64_t t0 = tokens [ j ] . t0 ;
const int64_t t1 = tokens [ j ] . t1 ;
int s0 = std : : max ( 0 , ( int ) ( t0 * WHISPER_SAMPLE_RATE / 100 ) ) ;
int s1 = std : : min ( ( int ) pcmf32 . size ( ) - 1 , ( int ) ( t1 * WHISPER_SAMPLE_RATE / 100 ) ) ;
const int ss0 = std : : max ( 0 , ( int ) ( t0 * WHISPER_SAMPLE_RATE / 100 ) - hw ) ;
const int ss1 = std : : min ( ( int ) pcmf32 . size ( ) - 1 , ( int ) ( t1 * WHISPER_SAMPLE_RATE / 100 ) + hw ) ;
const int n = ss1 - ss0 ;
float sum = 0.0f ;
for ( int k = ss0 ; k < ss1 ; k + + ) {
sum + = pcm_avg [ k ] ;
}
const float thold = 0.5 * sum / n ;
{
int k = s0 ;
if ( pcm_avg [ k ] > thold & & j > 0 ) {
while ( k > 0 & & pcm_avg [ k ] > thold ) {
k - - ;
}
tokens [ j ] . t0 = ( int64_t ) ( 100 * k / WHISPER_SAMPLE_RATE ) ;
if ( tokens [ j ] . t0 < tokens [ j - 1 ] . t1 ) {
tokens [ j ] . t0 = tokens [ j - 1 ] . t1 ;
} else {
s0 = k ;
}
} else {
while ( pcm_avg [ k ] < thold & & k < s1 ) {
k + + ;
}
s0 = k ;
tokens [ j ] . t0 = 100 * k / WHISPER_SAMPLE_RATE ;
}
}
{
int k = s1 ;
if ( pcm_avg [ k ] > thold ) {
while ( k < ( int ) pcmf32 . size ( ) - 1 & & pcm_avg [ k ] > thold ) {
k + + ;
}
tokens [ j ] . t1 = 100 * k / WHISPER_SAMPLE_RATE ;
if ( j < n - 1 & & tokens [ j ] . t1 > tokens [ j + 1 ] . t0 ) {
tokens [ j ] . t1 = tokens [ j + 1 ] . t0 ;
} else {
s1 = k ;
}
} else {
while ( pcm_avg [ k ] < thold & & k > s0 ) {
k - - ;
}
s1 = k ;
tokens [ j ] . t1 = 100 * k / WHISPER_SAMPLE_RATE ;
}
}
}
}
// fixed token expand (optional)
{
const int t_expand = 0 ;
for ( int j = 0 ; j < n ; j + + ) {
if ( j > 0 ) {
tokens [ j ] . t0 = std : : max ( 0 , ( int ) ( tokens [ j ] . t0 - t_expand ) ) ;
}
if ( j < n - 1 ) {
tokens [ j ] . t1 = tokens [ j ] . t1 + t_expand ;
}
}
}
// debug info
// TODO: toggle via parameter
for ( int j = 0 ; j < n ; + + j ) {
const auto & token = tokens [ j ] ;
const auto tt = token . pt > params . word_thold & & token . ptsum > 0.01 ? whisper_token_to_str ( ctx , token . tid ) : " [?] " ;
printf ( " %s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s' \n " , __func__ ,
tt , token . p , token . pt , token . ptsum , token . vlen , ( int ) token . t0 , ( int ) token . t1 , token . text . c_str ( ) ) ;
if ( tokens [ j ] . id > = whisper_token_eot ( ctx ) ) {
continue ;
}
//printf("[%s --> %s] %s\n", to_timestamp(token.t0).c_str(), to_timestamp(token.t1).c_str(), whisper_token_to_str(ctx, token.id));
//fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
}
// TODO: become parameters
static const int line_wrap = 60 ;
static const char * font = " /System/Library/Fonts/Supplemental/Courier New Bold.ttf " ;
if ( ! is_first ) {
if ( i > 0 ) {
fout < < " , " ;
}
// background text
fout < < " drawtext=fontfile=' " < < font < < " ':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t, " < < t0 / 100.0 < < " , " < < t0 / 100.0 < < " )' " ;
is_first = fals e;
bool is_first = true ;
for ( int j = 0 ; j < n ; + + j ) {
const auto & token = tokens [ j ] ;
@ -654,17 +380,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
}
ncnt + = txt . size ( ) ;
if ( ncnt > line_wrap ) {
if ( k < j ) {
txt_bg = " > " ;
txt_fg = " > " ;
txt_ul = " \\ \\ " ;
ncnt = 0 ;
} else {
break ;
}
}
}
: : replace_all ( txt_bg , " ' " , " ’ " ) ;
@ -673,8 +388,11 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
: : replace_all ( txt_fg , " \" " , " \\ \" " ) ;
}
// background text
fout < < " ,drawtext=fontfile=' " < < font < < " ':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text=' " < < txt_bg < < " ':enable='between(t, " < < token . tt0 / 100.0 < < " , " < < token . tt1 / 100.0 < < " )' " ;
if ( is_first ) {
// background text
fout < < " ,drawtext=fontfile=' " < < font < < " ':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text=' " < < txt_bg < < " ':enable='between(t, " < < t0 / 100.0 < < " , " < < t1 / 100.0 < < " )' " ;
is_first = false ;
}
// foreground text
fout < < " ,drawtext=fontfile=' " < < font < < " ':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text=' " < < txt_fg < < " ':enable='between(t, " < < token . t0 / 100.0 < < " , " < < token . t1 / 100.0 < < " )' " ;
@ -815,6 +533,10 @@ int main(int argc, char ** argv) {
wparams . n_max_text_ctx = params . max_context > = 0 ? params . max_context : wparams . n_max_text_ctx ;
wparams . offset_ms = params . offset_t_ms ;
wparams . token_timestamps = params . output_wts | | params . max_len > 0 ;
wparams . thold_pt = params . word_thold ;
wparams . max_len = params . output_wts & & params . max_len = = 0 ? 60 : params . max_len ;
// this callback is called on each new segment
if ( ! wparams . print_realtime ) {
wparams . new_segment_callback = whisper_print_segment_callback ;
@ -852,7 +574,7 @@ int main(int argc, char ** argv) {
// output to WTS file
if ( params . output_wts ) {
const auto fname_wts = fname_inp + " .wts " ;
output_wts ( ctx , fname_wts . c_str ( ) , fname_inp . c_str ( ) , params , pcmf32 ) ;
output_wts ( ctx , fname_wts . c_str ( ) , fname_inp . c_str ( ) , params , float ( pcmf32 . size ( ) + 1000 ) / WHISPER_SAMPLE_RATE ) ;
}
}
}