@ -467,12 +467,18 @@ struct whisper_decoder {
} ;
struct whisper_context {
int64_t t_load_us = 0 ;
int64_t t_mel_us = 0 ;
int64_t t_sample_us = 0 ;
int64_t t_encode_us = 0 ;
int64_t t_decode_us = 0 ;
int64_t t_start_us = 0 ;
int64_t t_real_load_us = 0 ;
int64_t t_process_load_us = 0 ;
int64_t t_real_mel_us = 0 ;
int64_t t_process_mel_us = 0 ;
int64_t t_real_sample_us = 0 ;
int64_t t_process_sample_us = 0 ;
int64_t t_real_encode_us = 0 ;
int64_t t_process_encode_us = 0 ;
int64_t t_real_decode_us = 0 ;
int64_t t_process_decode_us = 0 ;
int64_t t_real_start_us = 0 ;
int64_t t_process_start_us = 0 ;
ggml_type wtype ; // weight type (FP32 or FP16)
@ -597,9 +603,11 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
static bool whisper_model_load ( struct whisper_model_loader * loader , whisper_context & wctx ) {
fprintf ( stderr , " %s: loading model \n " , __func__ ) ;
const int64_t t_start_us = ggml_time_us ( ) ;
const int64_t t_real_start_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_us = ggml_process_time_us ( ) ;
wctx . t_start_us = t_start_us ;
wctx . t_real_start_us = t_real_start_us ;
wctx . t_process_start_us = t_process_start_us ;
auto & model = wctx . model ;
auto & vocab = wctx . vocab ;
@ -1208,7 +1216,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
wctx . rng = std : : mt19937 ( 0 ) ;
wctx . t_load_us = ggml_time_us ( ) - t_start_us ;
wctx . t_real_load_us = ggml_real_time_us ( ) - t_real_start_us ;
wctx . t_process_load_us = ggml_process_time_us ( ) - t_process_start_us ;
return true ;
}
@ -1226,7 +1235,8 @@ static bool whisper_encode(
whisper_context & wctx ,
const int mel_offset ,
const int n_threads ) {
const int64_t t_start_us = ggml_time_us ( ) ;
const int64_t t_real_start_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_us = ggml_process_time_us ( ) ;
const auto & model = wctx . model ;
const auto & mel_inp = wctx . mel ;
@ -1619,7 +1629,8 @@ static bool whisper_encode(
ggml_free ( ctx0 ) ;
wctx . t_encode_us + = ggml_time_us ( ) - t_start_us ;
wctx . t_real_encode_us + = ggml_real_time_us ( ) - t_real_start_us ;
wctx . t_process_encode_us + = ggml_process_time_us ( ) - t_process_start_us ;
return true ;
}
@ -1641,7 +1652,8 @@ static bool whisper_decode(
const int n_tokens ,
const int n_past ,
const int n_threads ) {
const int64_t t_start_us = ggml_time_us ( ) ;
const int64_t t_real_start_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_us = ggml_process_time_us ( ) ;
const auto & model = wctx . model ;
const auto & hparams = model . hparams ;
@ -1992,7 +2004,8 @@ static bool whisper_decode(
ggml_free ( ctx0 ) ;
wctx . t_decode_us + = ggml_time_us ( ) - t_start_us ;
wctx . t_real_decode_us + = ggml_real_time_us ( ) - t_real_start_us ;
wctx . t_process_decode_us + = ggml_process_time_us ( ) - t_process_start_us ;
return true ;
}
@ -2107,7 +2120,8 @@ static bool log_mel_spectrogram(
const whisper_filters & filters ,
const bool speed_up ,
whisper_mel & mel ) {
const int64_t t_start_us = ggml_time_us ( ) ;
const int64_t t_real_start_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_us = ggml_process_time_us ( ) ;
// Hanning window
std : : vector < float > hann ;
@ -2216,7 +2230,8 @@ static bool log_mel_spectrogram(
mel . data [ i ] = ( mel . data [ i ] + 4.0 ) / 4.0 ;
}
// Accumulate all three mel timers. The real/process variants previously used
// plain assignment, so only the most recent call was recorded while t_mel_us
// kept a running sum; every other phase timer in this file uses +=.
wctx.t_mel_us         += ggml_time_us()         - t_start_us;
wctx.t_real_mel_us    += ggml_real_time_us()    - t_real_start_us;
wctx.t_process_mel_us += ggml_process_time_us() - t_process_start_us;
return true ;
}
@ -2642,21 +2657,29 @@ whisper_token whisper_token_transcribe(void) {
}
void whisper_print_timings ( struct whisper_context * ctx ) {
const int64_t t_end_us = ggml_real_time_us ( ) ;
const int64_t t_real_end_us = ggml_real_time_us ( ) ;
const int64_t t_process_end_us = ggml_process_time_us ( ) ;
fprintf ( stderr , " \n " ) ;
fprintf ( stderr , " %s: load time = %8.2f ms \n " , __func__ , ctx - > t_load_us / 1000.0f ) ;
fprintf ( stderr , " %s: mel time = %8.2f ms \n " , __func__ , ctx - > t_mel_us / 1000.0f ) ;
fprintf ( stderr , " %s: sample time = %8.2f ms \n " , __func__ , ctx - > t_sample_us / 1000.0f ) ;
fprintf ( stderr , " %s: encode time = %8.2f ms / %.2f ms per layer \n " , __func__ , ctx - > t_encode_us / 1000.0f , ctx - > t_encode_us / 1000.0f / ctx - > model . hparams . n_audio_layer ) ;
fprintf ( stderr , " %s: decode time = %8.2f ms / %.2f ms per layer \n " , __func__ , ctx - > t_decode_us / 1000.0f , ctx - > t_decode_us / 1000.0f / ctx - > model . hparams . n_text_layer ) ;
fprintf ( stderr , " %s: total time = %8.2f ms \n " , __func__ , ( t_end_us - ctx - > t_start_us ) / 1000.0f ) ;
fprintf ( stderr , " %s: load time = %8.2f ms (real %8.2f ms) \n " , __func__ , ctx - > t_process_load_us / 1000.0f , ctx - > t_real_load_us / 1000.0f ) ;
fprintf ( stderr , " %s: mel time = %8.2f ms (real %8.2f ms) \n " , __func__ , ctx - > t_process_mel_us / 1000.0f , ctx - > t_real_mel_us / 1000.0f ) ;
fprintf ( stderr , " %s: sample time = %8.2f ms (real %8.2f ms) \n " , __func__ , ctx - > t_process_sample_us / 1000.0f , ctx - > t_real_sample_us / 1000.0f ) ;
fprintf ( stderr , " %s: encode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer) \n " , __func__ ,
ctx - > t_process_encode_us / 1000.0f , ctx - > t_process_encode_us / 1000.0f / ctx - > model . hparams . n_audio_layer ,
ctx - > t_real_encode_us / 1000.0f , ctx - > t_real_encode_us / 1000.0f / ctx - > model . hparams . n_audio_layer ) ;
fprintf ( stderr , " %s: decode time = %8.2f ms / %.2f ms per layer (real %8.2f ms / %.2f ms per layer) \n " , __func__ ,
ctx - > t_process_decode_us / 1000.0f , ctx - > t_process_decode_us / 1000.0f / ctx - > model . hparams . n_text_layer ,
ctx - > t_real_decode_us / 1000.0f , ctx - > t_real_decode_us / 1000.0f / ctx - > model . hparams . n_text_layer ) ;
fprintf ( stderr , " %s: total time = %8.2f ms (real %8.2f ms) \n " , __func__ , ( t_process_end_us - ctx - > t_process_start_us ) / 1000.f , ( t_real_end_us - ctx - > t_real_start_us ) / 1000.0f ) ;
}
void whisper_reset_timings ( struct whisper_context * ctx ) {
ctx - > t_sample_us = 0 ;
ctx - > t_encode_us = 0 ;
ctx - > t_decode_us = 0 ;
ctx - > t_real_sample_us = 0 ;
ctx - > t_process_sample_us = 0 ;
ctx - > t_real_encode_us = 0 ;
ctx - > t_process_encode_us = 0 ;
ctx - > t_real_decode_us = 0 ;
ctx - > t_process_decode_us = 0 ;
}
const char * whisper_print_system_info ( void ) {
@ -3455,7 +3478,8 @@ int whisper_full(
}
{
const int64_t t_start_sample_us = ggml_time_us ( ) ;
const int64_t t_real_start_sample_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_sample_us = ggml_process_time_us ( ) ;
whisper_process_logits ( * ctx , params , ctx - > decoders [ 0 ] , t_cur ) ;
@ -3474,12 +3498,14 @@ int whisper_full(
memcpy ( decoder . logprobs . data ( ) , ctx - > decoders [ 0 ] . logprobs . data ( ) , decoder . logprobs . size ( ) * sizeof ( decoder . logprobs [ 0 ] ) ) ;
}
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
ctx - > t_real_sample_us + = ggml_real_time_us ( ) - t_real_start_sample_us ;
ctx - > t_process_sample_us + = ggml_process_time_us ( ) - t_process_start_sample_us ;
}
}
for ( int i = 0 , n_max = whisper_n_text_ctx ( ctx ) / 2 - 4 ; i < n_max ; + + i ) {
const int64_t t_start_sample_us = ggml_time_us ( ) ;
const int64_t t_real_start_sample_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_sample_us = ggml_process_time_us ( ) ;
// store the KV caches of all decoders when doing beam-search
if ( params . strategy = = whisper_sampling_strategy : : WHISPER_SAMPLING_BEAM_SEARCH ) {
@ -3672,7 +3698,8 @@ int whisper_full(
}
}
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
ctx - > t_real_sample_us + = ggml_real_time_us ( ) - t_real_start_sample_us ;
ctx - > t_process_sample_us + = ggml_process_time_us ( ) - t_process_start_sample_us ;
// obtain logits for the next token
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
@ -3693,13 +3720,15 @@ int whisper_full(
}
{
const int64_t t_start_sample_us = ggml_time_us ( ) ;
const int64_t t_real_start_sample_us = ggml_real_time_us ( ) ;
const int64_t t_process_start_sample_us = ggml_process_time_us ( ) ;
whisper_process_logits ( * ctx , params , decoder , t_cur ) ;
+ + decoder . kv_self . n ;
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
ctx - > t_real_sample_us + = ggml_real_time_us ( ) - t_real_start_sample_us ;
ctx - > t_process_sample_us + = ggml_process_time_us ( ) - t_process_start_sample_us ;
}
}
}
@ -3992,10 +4021,14 @@ int whisper_full_parallel(
}
}
ctx - > t_mel_us + = ctxs [ i ] . t_mel_us ;
ctx - > t_sample_us + = ctxs [ i ] . t_sample_us ;
ctx - > t_encode_us + = ctxs [ i ] . t_encode_us ;
ctx - > t_decode_us + = ctxs [ i ] . t_decode_us ;
ctx - > t_real_mel_us + = ctxs [ i ] . t_real_mel_us ;
ctx - > t_process_mel_us + = ctxs [ i ] . t_process_mel_us ;
ctx - > t_real_sample_us + = ctxs [ i ] . t_real_sample_us ;
ctx - > t_process_sample_us + = ctxs [ i ] . t_process_sample_us ;
ctx - > t_real_encode_us + = ctxs [ i ] . t_real_encode_us ;
ctx - > t_process_encode_us + = ctxs [ i ] . t_process_encode_us ;
ctx - > t_real_decode_us + = ctxs [ i ] . t_real_decode_us ;
ctx - > t_process_decode_us + = ctxs [ i ] . t_process_decode_us ;
kv_cache_free ( ctx - > kv_cross ) ;
@ -4005,10 +4038,14 @@ int whisper_full_parallel(
}
// average the timings
ctx - > t_mel_us / = n_processors ;
ctx - > t_sample_us / = n_processors ;
ctx - > t_encode_us / = n_processors ;
ctx - > t_decode_us / = n_processors ;
ctx - > t_real_mel_us / = n_processors ;
ctx - > t_process_mel_us / = n_processors ;
ctx - > t_real_sample_us / = n_processors ;
ctx - > t_process_sample_us / = n_processors ;
ctx - > t_real_encode_us / = n_processors ;
ctx - > t_process_encode_us / = n_processors ;
ctx - > t_real_decode_us / = n_processors ;
ctx - > t_process_decode_us / = n_processors ;
// print information about the audio boundaries
fprintf ( stderr , " \n " ) ;