@@ -282,54 +282,83 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 static int64_t timer_freq;
+static HANDLE current_process_handle;
 void ggml_time_init(void) {
     LARGE_INTEGER frequency;
     QueryPerformanceFrequency(&frequency);
     timer_freq = frequency.QuadPart;
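+    // GetCurrentProcess() returns a pseudo-handle for the calling process, so it never needs CloseHandle()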
+    current_process_handle = GetCurrentProcess();
 }
-int64_t ggml_time_ms(void) {
+int64_t ggml_time_real_ms(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
     return (t.QuadPart * 1000) / timer_freq;
 }
-int64_t ggml_time_us(void) {
+int64_t ggml_time_real_us(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
     return (t.QuadPart * 1000000) / timer_freq;
 }
+// Query only user time for process times, as CLOCK_PROCESS_CPUTIME_ID does not include kernel time on Unix systems.
+int64_t ggml_time_proc_ms(void) {
+    FILETIME user_time;
+    GetProcessTimes(current_process_handle, NULL, NULL, NULL, &user_time);
+    ULARGE_INTEGER t;
+    t.u.LowPart  = user_time.dwLowDateTime;
+    t.u.HighPart = user_time.dwHighDateTime;
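+    // FILETIME is expressed in 100-nanosecond ticks: 10,000 ticks per millisecond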
+    return t.QuadPart / 10000;
+}
+int64_t ggml_time_proc_us(void) {
+    FILETIME user_time;
+    GetProcessTimes(current_process_handle, NULL, NULL, NULL, &user_time);
+    ULARGE_INTEGER t;
+    t.u.LowPart  = user_time.dwLowDateTime;
+    t.u.HighPart = user_time.dwHighDateTime;
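+    // 100-nanosecond ticks: 10 ticks per microsecond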
+    return t.QuadPart / 10;
+}
 #else
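+// On POSIX systems, CLOCK_MONOTONIC measures elapsed (real) time, while
+// CLOCK_PROCESS_CPUTIME_ID measures CPU time consumed by the process.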
 void ggml_time_init(void) {}
-int64_t ggml_time_ms(void) {
+int64_t ggml_time_real_ms(void) {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
 }
-int64_t ggml_time_us(void) {
+int64_t ggml_time_real_us(void) {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
 }
-#endif
-int64_t ggml_cycles(void) {
-    return clock();
+int64_t ggml_time_proc_ms(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
 }
-int64_t ggml_cycles_per_ms(void) {
-    return CLOCKS_PER_SEC/1000;
+int64_t ggml_time_proc_us(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
 }
+#endif
 #ifdef GGML_PERF
-#define ggml_perf_time_ms()       ggml_time_ms()
-#define ggml_perf_time_us()       ggml_time_us()
-#define ggml_perf_cycles()        ggml_cycles()
-#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
+#define GGML_PERF_TIME_REAL_US() ggml_time_real_us()
+#define GGML_PERF_TIME_PROC_US() ggml_time_proc_us()
 #else
-#define ggml_perf_time_ms()       0
-#define ggml_perf_time_us()       0
-#define ggml_perf_cycles()        0
-#define ggml_perf_cycles_per_ms() 0
+#define GGML_PERF_TIME_REAL_US() 0
+#define GGML_PERF_TIME_PROC_US() 0
 #endif
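+// Minimal usage sketch for the perf macros (when GGML_PERF is not defined they
+// expand to 0, so the deltas below are simply 0 and optimize away):
+//
+//     const int64_t t_proc0 = GGML_PERF_TIME_PROC_US();
+//     const int64_t t_real0 = GGML_PERF_TIME_REAL_US();
+//     // ... timed work ...
+//     const int64_t proc_us = GGML_PERF_TIME_PROC_US() - t_proc0; // CPU time used
+//     const int64_t real_us = GGML_PERF_TIME_REAL_US() - t_real0; // wall-clock time elapsed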
 //
@@ -1477,7 +1506,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     if (is_first_call) {
         // initialize GELU, EXP and F32 tables
         {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+            const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start);

             ggml_fp16_t ii;
             for (int i = 0; i < (1 << 16); ++i) {
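+                // all 2^16 bit patterns are reinterpreted as fp16 values to fill the tables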
@@ -1488,14 +1517,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
             }

-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+            const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end);

             GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }

         // initialize g_state
         {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+            const uint64_t t_start = GGML_PERF_TIME_REAL_US(); UNUSED(t_start);

             g_state = (struct ggml_state) {
                 /*.contexts =*/ { { 0 } },
@@ -1505,7 +1534,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 g_state.contexts[i].used = false;
             }

-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+            const uint64_t t_end = GGML_PERF_TIME_REAL_US(); UNUSED(t_end);

             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
@@ -1657,8 +1686,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.opt          =*/ { NULL },
         /*.n_tasks      =*/ 0,
         /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
+        /*.perf_time_proc_us =*/ 0,
+        /*.perf_time_real_us =*/ 0,
         /*.data         =*/ data == NULL ? (void *)(result + 1) : data,
         /*.pad          =*/ { 0 },
     };
@@ -4307,7 +4336,7 @@ static void ggml_compute_forward_mul_mat_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int ne00 = src0->ne[0];
@@ -4403,7 +4432,7 @@ static void ggml_compute_forward_mul_mat_f32(
             }
         }

-        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, ne1, ne2, ne3);

         return;
     }
@@ -4533,7 +4562,7 @@ static void ggml_compute_forward_mul_mat_f32(
         }
     }

-    //int64_t t1 = ggml_perf_time_us();
+    //int64_t t1 = GGML_PERF_TIME_PROC_US();
    //static int64_t acc = 0;
    //acc += t1 - t0;
    //if (t1 - t0 > 10) {
@@ -4552,7 +4581,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int ne00 = src0->ne[0];
@@ -4686,7 +4715,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             }
         }

-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (GGML_PERF_TIME_PROC_US() - t0)/1000.0, ne0, ne1, ne2, ne3);

         return;
     }
@@ -4840,7 +4869,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
        }
    }

-    //int64_t t1 = ggml_time_us();
+    //int64_t t1 = GGML_PERF_TIME_REAL_US();
    //static int64_t acc = 0;
    //acc += t1 - t0;
    //if (t1 - t0 > 10) {
@@ -5308,7 +5337,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int ne00 = src0->ne[0];
@@ -5428,7 +5457,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int ne00 = src0->ne[0];
@@ -5574,7 +5603,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int ne00 = src0->ne[0];
@@ -5694,7 +5723,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int ne00 = src0->ne[0];
@@ -5838,7 +5867,7 @@ static void ggml_compute_forward_flash_attn_f32(
         const struct ggml_tensor * v,
         const bool masked,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int neq0 = q->ne[0];
@@ -6047,7 +6076,7 @@ static void ggml_compute_forward_flash_attn_f16(
         const struct ggml_tensor * v,
         const bool masked,
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int neq0 = q->ne[0];
@@ -6322,7 +6351,7 @@ static void ggml_compute_forward_flash_ff_f16(
         const struct ggml_tensor * c0, // F16 proj_w
         const struct ggml_tensor * c1, // F32 proj_b
               struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
+    int64_t t0 = GGML_PERF_TIME_PROC_US();
     UNUSED(t0);

     const int nea0 = a->ne[0];
@@ -7001,8 +7030,8 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
         /*.grads      =*/ { NULL },
         /*.leafs      =*/ { NULL },
         /*.perf_runs  =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
+        /*.perf_time_proc_us =*/ 0,
+        /*.perf_time_real_us =*/ 0,
     };

     ggml_build_forward_impl(&result, tensor, false);
@@ -7411,8 +7440,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         }
     }

-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
+    const int64_t perf_time_proc_start_us = GGML_PERF_TIME_PROC_US();
+    const int64_t perf_time_real_start_us = GGML_PERF_TIME_REAL_US();
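+    // comparing process CPU time against wall-clock time shows how well the
+    // graph computation parallelized across threads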

     for (int i = 0; i < cgraph->n_nodes; i++) {
         GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
@@ -7424,8 +7453,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
        //    continue;
        //}

-        const int64_t perf_node_start_cycles  = ggml_perf_cycles();
-        const int64_t perf_node_start_time_us = ggml_perf_time_us();
+        const int64_t perf_node_time_proc_start_us = GGML_PERF_TIME_PROC_US();
+        const int64_t perf_node_time_real_start_us = GGML_PERF_TIME_REAL_US();

         // INIT
         struct ggml_compute_params params = {
@@ -7550,12 +7579,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         // performance stats (node)
         {
-            int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_node_start_cycles;
-            int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
+            int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_node_time_proc_start_us;
+            int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_node_time_real_start_us;

             node->perf_runs++;
-            node->perf_cycles  += perf_cycles_cur;
-            node->perf_time_us += perf_time_us_cur;
+            node->perf_time_proc_us += perf_cur_time_proc_us;
+            node->perf_time_real_us += perf_cur_time_real_us;
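+            // totals are averaged over perf_runs when reported by ggml_graph_print()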
         }
     }
@@ -7575,19 +7604,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     // performance stats (graph)
     {
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
+        int64_t perf_cur_time_proc_us = GGML_PERF_TIME_PROC_US() - perf_time_proc_start_us;
+        int64_t perf_cur_time_real_us = GGML_PERF_TIME_REAL_US() - perf_time_real_start_us;

         cgraph->perf_runs++;
-        cgraph->perf_cycles  += perf_cycles_cur;
-        cgraph->perf_time_us += perf_time_us_cur;
+        cgraph->perf_time_proc_us += perf_cur_time_proc_us;
+        cgraph->perf_time_real_us += perf_cur_time_real_us;

         GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
                 __func__, cgraph->perf_runs,
-                (double) perf_cycles_cur      / (double) ggml_cycles_per_ms(),
-                (double) cgraph->perf_cycles  / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
-                (double) perf_time_us_cur     / 1000.0,
-                (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
+                (double) perf_cur_time_proc_us     / 1000.0,
+                (double) cgraph->perf_time_proc_us / 1000.0 / (double) cgraph->perf_runs,
+                (double) perf_cur_time_real_us     / 1000.0,
+                (double) cgraph->perf_time_real_us / 1000.0 / (double) cgraph->perf_runs);
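+        // each "x / y" pair prints the current run followed by the per-run average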
     }
 }
@@ -7613,16 +7642,16 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];

-        perf_total_per_op_us[node->op] += node->perf_time_us;
+        perf_total_per_op_us[node->op] += node->perf_time_real_us;

         GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
-                (double) node->perf_time_us / 1000.0,
-                (double) node->perf_time_us / 1000.0 / node->perf_runs);
+                (double) node->perf_time_proc_us / 1000.0,
+                (double) node->perf_time_proc_us / 1000.0 / (double) node->perf_runs,
+                (double) node->perf_time_real_us / 1000.0,
+                (double) node->perf_time_real_us / 1000.0 / node->perf_runs);
     }

     GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
@@ -7901,10 +7930,10 @@ static enum ggml_opt_result ggml_opt_adam(
                     ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
         }

-        const int64_t t_start_wall = ggml_time_us();
-        const int64_t t_start_cpu  = ggml_cycles();
-        UNUSED(t_start_wall);
-        UNUSED(t_start_cpu);
+        const int64_t t_real_start_us    = GGML_PERF_TIME_REAL_US();
+        const int64_t t_process_start_us = GGML_PERF_TIME_PROC_US();
+        UNUSED(t_real_start_us);
+        UNUSED(t_process_start_us);

        {
            // update the gradient
@@ -7984,13 +8013,13 @@ static enum ggml_opt_result ggml_opt_adam(
         fx_prev = fx;

         {
-            const int64_t t_end_cpu = ggml_cycles();
-            GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
-            UNUSED(t_end_cpu);
+            const int64_t t_process_end_us = GGML_PERF_TIME_PROC_US();
+            GGML_PRINT_DEBUG("time iter: %5.3f s\n", (t_process_end_us - t_process_start_us)/1e6);
+            UNUSED(t_process_end_us);

-            const int64_t t_end_wall = ggml_time_us();
-            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
-            UNUSED(t_end_wall);
+            const int64_t t_real_end_us = GGML_PERF_TIME_REAL_US();
+            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_real_end_us - t_real_start_us)/1e6);
+            UNUSED(t_real_end_us);
         }
     }