|
|
@ -287,49 +287,41 @@ void ggml_time_init(void) {
|
|
|
|
QueryPerformanceFrequency(&frequency);
|
|
|
|
QueryPerformanceFrequency(&frequency);
|
|
|
|
timer_freq = frequency.QuadPart;
|
|
|
|
timer_freq = frequency.QuadPart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int64_t ggml_time_ms(void) {
|
|
|
|
int64_t ggml_real_time_ms(void) {
|
|
|
|
LARGE_INTEGER t;
|
|
|
|
LARGE_INTEGER t;
|
|
|
|
QueryPerformanceCounter(&t);
|
|
|
|
QueryPerformanceCounter(&t);
|
|
|
|
return (t.QuadPart * 1000) / timer_freq;
|
|
|
|
return (t.QuadPart * 1000) / timer_freq;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int64_t ggml_time_us(void) {
|
|
|
|
int64_t ggml_real_time_us(void) {
|
|
|
|
LARGE_INTEGER t;
|
|
|
|
LARGE_INTEGER t;
|
|
|
|
QueryPerformanceCounter(&t);
|
|
|
|
QueryPerformanceCounter(&t);
|
|
|
|
return (t.QuadPart * 1000000) / timer_freq;
|
|
|
|
return (t.QuadPart * 1000000) / timer_freq;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
void ggml_time_init(void) {}
|
|
|
|
void ggml_time_init(void) {}
|
|
|
|
int64_t ggml_time_ms(void) {
|
|
|
|
int64_t ggml_real_time_ms(void) {
|
|
|
|
struct timespec ts;
|
|
|
|
struct timespec ts;
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
|
|
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
|
|
|
|
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int64_t ggml_time_us(void) {
|
|
|
|
int64_t ggml_real_time_us(void) {
|
|
|
|
struct timespec ts;
|
|
|
|
struct timespec ts;
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
|
|
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
|
|
|
|
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int64_t ggml_cycles(void) {
|
|
|
|
int64_t ggml_process_time_ms(void) {
|
|
|
|
return clock();
|
|
|
|
struct timespec ts;
|
|
|
|
|
|
|
|
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
|
|
|
|
|
|
|
|
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int64_t ggml_cycles_per_ms(void) {
|
|
|
|
int64_t ggml_process_time_us(void) {
|
|
|
|
return CLOCKS_PER_SEC/1000;
|
|
|
|
struct timespec ts;
|
|
|
|
|
|
|
|
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
|
|
|
|
|
|
|
|
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef GGML_PERF
|
|
|
|
|
|
|
|
#define ggml_perf_time_ms() ggml_time_ms()
|
|
|
|
|
|
|
|
#define ggml_perf_time_us() ggml_time_us()
|
|
|
|
|
|
|
|
#define ggml_perf_cycles() ggml_cycles()
|
|
|
|
|
|
|
|
#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
#define ggml_perf_time_ms() 0
|
|
|
|
|
|
|
|
#define ggml_perf_time_us() 0
|
|
|
|
|
|
|
|
#define ggml_perf_cycles() 0
|
|
|
|
|
|
|
|
#define ggml_perf_cycles_per_ms() 0
|
|
|
|
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
//
|
|
|
@ -1477,7 +1469,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
|
if (is_first_call) {
|
|
|
|
if (is_first_call) {
|
|
|
|
// initialize GELU, EXP and F32 tables
|
|
|
|
// initialize GELU, EXP and F32 tables
|
|
|
|
{
|
|
|
|
{
|
|
|
|
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
|
|
|
const uint64_t t_start = ggml_real_time_us(); UNUSED(t_start);
|
|
|
|
|
|
|
|
|
|
|
|
ggml_fp16_t ii;
|
|
|
|
ggml_fp16_t ii;
|
|
|
|
for (int i = 0; i < (1 << 16); ++i) {
|
|
|
|
for (int i = 0; i < (1 << 16); ++i) {
|
|
|
@ -1488,14 +1480,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
|
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
|
|
|
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
|
|
|
const uint64_t t_end = ggml_real_time_us(); UNUSED(t_end);
|
|
|
|
|
|
|
|
|
|
|
|
GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
|
|
GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// initialize g_state
|
|
|
|
// initialize g_state
|
|
|
|
{
|
|
|
|
{
|
|
|
|
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
|
|
|
const uint64_t t_start = ggml_real_time_us(); UNUSED(t_start);
|
|
|
|
|
|
|
|
|
|
|
|
g_state = (struct ggml_state) {
|
|
|
|
g_state = (struct ggml_state) {
|
|
|
|
/*.contexts =*/ { { 0 } },
|
|
|
|
/*.contexts =*/ { { 0 } },
|
|
|
@ -1505,7 +1497,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
|
g_state.contexts[i].used = false;
|
|
|
|
g_state.contexts[i].used = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
|
|
|
const uint64_t t_end = ggml_real_time_us(); UNUSED(t_end);
|
|
|
|
|
|
|
|
|
|
|
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
|
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -1657,8 +1649,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|
|
|
/*.opt =*/ { NULL },
|
|
|
|
/*.opt =*/ { NULL },
|
|
|
|
/*.n_tasks =*/ 0,
|
|
|
|
/*.n_tasks =*/ 0,
|
|
|
|
/*.perf_runs =*/ 0,
|
|
|
|
/*.perf_runs =*/ 0,
|
|
|
|
/*.perf_cycles =*/ 0,
|
|
|
|
/*.perf_process_time_us =*/ 0,
|
|
|
|
/*.perf_time_us =*/ 0,
|
|
|
|
/*.perf_real_time_us =*/ 0,
|
|
|
|
/*.data =*/ data == NULL ? (void *)(result + 1) : data,
|
|
|
|
/*.data =*/ data == NULL ? (void *)(result + 1) : data,
|
|
|
|
/*.pad =*/ { 0 },
|
|
|
|
/*.pad =*/ { 0 },
|
|
|
|
};
|
|
|
|
};
|
|
|
@ -4307,7 +4299,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
@ -4401,7 +4393,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
|
|
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -4531,7 +4523,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//int64_t t1 = ggml_perf_time_us();
|
|
|
|
//int64_t t1 = ggml_process_time_us();
|
|
|
|
//static int64_t acc = 0;
|
|
|
|
//static int64_t acc = 0;
|
|
|
|
//acc += t1 - t0;
|
|
|
|
//acc += t1 - t0;
|
|
|
|
//if (t1 - t0 > 10) {
|
|
|
|
//if (t1 - t0 > 10) {
|
|
|
@ -4550,7 +4542,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
@ -4682,7 +4674,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
|
|
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_process_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -4836,7 +4828,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//int64_t t1 = ggml_time_us();
|
|
|
|
//int64_t t1 = ggml_real_time_us();
|
|
|
|
//static int64_t acc = 0;
|
|
|
|
//static int64_t acc = 0;
|
|
|
|
//acc += t1 - t0;
|
|
|
|
//acc += t1 - t0;
|
|
|
|
//if (t1 - t0 > 10) {
|
|
|
|
//if (t1 - t0 > 10) {
|
|
|
@ -5304,7 +5296,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
|
|
|
|
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
@ -5424,7 +5416,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
|
|
|
|
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
@ -5570,7 +5562,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
|
|
|
|
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
@ -5690,7 +5682,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
|
|
|
|
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
|
const int ne00 = src0->ne[0];
|
|
|
@ -5834,7 +5826,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
|
|
const struct ggml_tensor * v,
|
|
|
|
const struct ggml_tensor * v,
|
|
|
|
const bool masked,
|
|
|
|
const bool masked,
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int neq0 = q->ne[0];
|
|
|
|
const int neq0 = q->ne[0];
|
|
|
@ -6043,7 +6035,7 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
|
|
const struct ggml_tensor * v,
|
|
|
|
const struct ggml_tensor * v,
|
|
|
|
const bool masked,
|
|
|
|
const bool masked,
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int neq0 = q->ne[0];
|
|
|
|
const int neq0 = q->ne[0];
|
|
|
@ -6318,7 +6310,7 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
|
|
const struct ggml_tensor * c0, // F16 proj_w
|
|
|
|
const struct ggml_tensor * c0, // F16 proj_w
|
|
|
|
const struct ggml_tensor * c1, // F32 proj_b
|
|
|
|
const struct ggml_tensor * c1, // F32 proj_b
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
int64_t t0 = ggml_perf_time_us();
|
|
|
|
int64_t t0 = ggml_process_time_us();
|
|
|
|
UNUSED(t0);
|
|
|
|
UNUSED(t0);
|
|
|
|
|
|
|
|
|
|
|
|
const int nea0 = a->ne[0];
|
|
|
|
const int nea0 = a->ne[0];
|
|
|
@ -6997,8 +6989,8 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|
|
|
/*.grads =*/ { NULL },
|
|
|
|
/*.grads =*/ { NULL },
|
|
|
|
/*.leafs =*/ { NULL },
|
|
|
|
/*.leafs =*/ { NULL },
|
|
|
|
/*.perf_runs =*/ 0,
|
|
|
|
/*.perf_runs =*/ 0,
|
|
|
|
/*.perf_cycles =*/ 0,
|
|
|
|
/*.perf_process_time_us =*/ 0,
|
|
|
|
/*.perf_time_us =*/ 0,
|
|
|
|
/*.perf_real_time_us =*/ 0,
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
ggml_build_forward_impl(&result, tensor, false);
|
|
|
|
ggml_build_forward_impl(&result, tensor, false);
|
|
|
@ -7404,8 +7396,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
|
|
|
const int64_t perf_process_time_start_us = ggml_process_time_us();
|
|
|
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
|
|
|
const int64_t perf_real_time_start_us = ggml_real_time_us();
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
|
|
|
|
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
|
|
|
@ -7417,8 +7409,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
|
|
// continue;
|
|
|
|
// continue;
|
|
|
|
//}
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
|
|
const int64_t perf_node_start_cycles = ggml_perf_cycles();
|
|
|
|
const int64_t perf_node_process_time_start_us = ggml_process_time_us();
|
|
|
|
const int64_t perf_node_start_time_us = ggml_perf_time_us();
|
|
|
|
const int64_t perf_node_real_time_start_us = ggml_real_time_us();
|
|
|
|
|
|
|
|
|
|
|
|
// INIT
|
|
|
|
// INIT
|
|
|
|
struct ggml_compute_params params = {
|
|
|
|
struct ggml_compute_params params = {
|
|
|
@ -7543,12 +7535,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
|
|
|
|
|
|
|
|
|
|
// performance stats (node)
|
|
|
|
// performance stats (node)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
|
|
|
|
int64_t perf_cur_process_time_us = ggml_process_time_us() - perf_node_process_time_start_us;
|
|
|
|
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
|
|
|
|
int64_t perf_cur_real_time_us = ggml_real_time_us() - perf_node_real_time_start_us;
|
|
|
|
|
|
|
|
|
|
|
|
node->perf_runs++;
|
|
|
|
node->perf_runs++;
|
|
|
|
node->perf_cycles += perf_cycles_cur;
|
|
|
|
node->perf_process_time_us += perf_cur_process_time_us;
|
|
|
|
node->perf_time_us += perf_time_us_cur;
|
|
|
|
node->perf_real_time_us += perf_cur_real_time_us;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -7568,19 +7560,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
|
|
|
|
|
|
|
|
|
|
// performance stats (graph)
|
|
|
|
// performance stats (graph)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
|
|
|
int64_t perf_cur_process_time_us = ggml_process_time_us() - perf_process_time_start_us;
|
|
|
|
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
|
|
|
|
int64_t perf_cur_real_time_us = ggml_real_time_us() - perf_real_time_start_us;
|
|
|
|
|
|
|
|
|
|
|
|
cgraph->perf_runs++;
|
|
|
|
cgraph->perf_runs++;
|
|
|
|
cgraph->perf_cycles += perf_cycles_cur;
|
|
|
|
cgraph->perf_process_time_us += perf_cur_process_time_us;
|
|
|
|
cgraph->perf_time_us += perf_time_us_cur;
|
|
|
|
cgraph->perf_real_time_us += perf_cur_real_time_us;
|
|
|
|
|
|
|
|
|
|
|
|
GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
|
|
|
|
GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
|
|
|
|
__func__, cgraph->perf_runs,
|
|
|
|
__func__, cgraph->perf_runs,
|
|
|
|
(double) perf_cycles_cur / (double) ggml_cycles_per_ms(),
|
|
|
|
(double) perf_cur_process_time_us / 1000.0,
|
|
|
|
(double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
|
|
|
|
(double) cgraph->perf_process_time_us / 1000.0 / (double) cgraph->perf_runs,
|
|
|
|
(double) perf_time_us_cur / 1000.0,
|
|
|
|
(double) perf_cur_real_time_us / 1000.0,
|
|
|
|
(double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
|
|
|
|
(double) cgraph->perf_real_time_us / 1000.0 / (double) cgraph->perf_runs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -7606,16 +7598,16 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
struct ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
struct ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
|
|
|
|
|
|
|
|
perf_total_per_op_us[node->op] += node->perf_time_us;
|
|
|
|
perf_total_per_op_us[node->op] += node->perf_real_time_us;
|
|
|
|
|
|
|
|
|
|
|
|
GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
|
|
|
GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
|
|
|
i,
|
|
|
|
i,
|
|
|
|
node->ne[0], node->ne[1], node->ne[2],
|
|
|
|
node->ne[0], node->ne[1], node->ne[2],
|
|
|
|
GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
|
|
|
GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
|
|
|
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
|
|
|
(double) node->perf_process_time_us / 1000.0,
|
|
|
|
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
|
|
|
(double) node->perf_process_time_us / 1000.0 / (double) node->perf_runs,
|
|
|
|
(double) node->perf_time_us / 1000.0,
|
|
|
|
(double) node->perf_real_time_us / 1000.0,
|
|
|
|
(double) node->perf_time_us / 1000.0 / node->perf_runs);
|
|
|
|
(double) node->perf_real_time_us / 1000.0 / node->perf_runs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
|
|
|
|
GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
|
|
|
@ -7894,10 +7886,10 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
|
|
ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
|
|
|
|
ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const int64_t t_start_wall = ggml_time_us();
|
|
|
|
const int64_t t_real_start_us = ggml_real_time_us();
|
|
|
|
const int64_t t_start_cpu = ggml_cycles();
|
|
|
|
const int64_t t_process_start_us = ggml_process_time_us();
|
|
|
|
UNUSED(t_start_wall);
|
|
|
|
UNUSED(t_real_start_us);
|
|
|
|
UNUSED(t_start_cpu);
|
|
|
|
UNUSED(t_process_start_us);
|
|
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// update the gradient
|
|
|
|
// update the gradient
|
|
|
@ -7977,13 +7969,13 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
|
|
fx_prev = fx;
|
|
|
|
fx_prev = fx;
|
|
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
{
|
|
|
|
const int64_t t_end_cpu = ggml_cycles();
|
|
|
|
const int64_t t_process_end_us = ggml_process_time_us();
|
|
|
|
GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
|
|
|
|
GGML_PRINT_DEBUG("time iter: %5.3f s\n", (t_process_end_us - t_process_start_us)/1e6);
|
|
|
|
UNUSED(t_end_cpu);
|
|
|
|
UNUSED(t_process_end_us);
|
|
|
|
|
|
|
|
|
|
|
|
const int64_t t_end_wall = ggml_time_us();
|
|
|
|
const int64_t t_real_end_us = ggml_real_time_us();
|
|
|
|
GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
|
|
|
|
GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_real_end_us - t_real_start_us)/1e6);
|
|
|
|
UNUSED(t_end_wall);
|
|
|
|
UNUSED(t_real_end_us);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|