ggml : support for scratch ring-buffer

pull/431/head
Georgi Gerganov 2 years ago
parent 60eff46b0a
commit 0eea547ab3
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

@@ -1258,7 +1258,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
// //
struct ggml_object { struct ggml_object {
size_t offset; size_t offs;
size_t size; size_t size;
struct ggml_object * next; struct ggml_object * next;
@@ -1348,7 +1348,7 @@ inline static void ggml_critical_section_end(void) {
void ggml_print_object(const struct ggml_object * obj) { void ggml_print_object(const struct ggml_object * obj) {
GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n", GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
obj->offset, obj->size, (const void *) obj->next); obj->offs, obj->size, (const void *) obj->next);
} }
void ggml_print_objects(const struct ggml_context * ctx) { void ggml_print_objects(const struct ggml_context * ctx) {
@@ -1550,7 +1550,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.n_objects =*/ 0, /*.n_objects =*/ 0,
/*.objects_begin =*/ NULL, /*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL, /*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, 0, NULL }, /*.scratch =*/ { 0, 0, NULL, },
}; };
ggml_assert_aligned(ctx->mem_buffer); ggml_assert_aligned(ctx->mem_buffer);
@@ -1573,7 +1573,7 @@ void ggml_free(struct ggml_context * ctx) {
g_state.contexts[i].used = false; g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offset + ctx->objects_end->size); __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
if (ctx->mem_buffer_owned) { if (ctx->mem_buffer_owned) {
free(ctx->mem_buffer); free(ctx->mem_buffer);
@@ -1592,7 +1592,7 @@ void ggml_free(struct ggml_context * ctx) {
} }
size_t ggml_used_mem(const struct ggml_context * ctx) { size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end->offset + ctx->objects_end->size; return ctx->objects_end->offs + ctx->objects_end->size;
} }
void ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { void ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -1610,7 +1610,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
// always insert objects at the end of the context's memory pool // always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end; struct ggml_object * obj_cur = ctx->objects_end;
const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offset; const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offs;
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size; const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offset + cur_size; const size_t cur_end = cur_offset + cur_size;
@@ -1623,17 +1623,14 @@ struct ggml_tensor * ggml_new_tensor_impl(
} }
// align to GGML_MEM_ALIGN // align to GGML_MEM_ALIGN
size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN; size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
} }
size_needed += sizeof(struct ggml_tensor);
char * const mem_buffer = ctx->mem_buffer; char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
size_t scratch_size = ctx->scratch.k & 1 ? ctx->scratch.size1 : ctx->scratch.size0; if (ctx->scratch.data == NULL) {
void * scratch_data = ctx->scratch.k & 1 ? ctx->scratch.data1 : ctx->scratch.data0; size_needed += sizeof(struct ggml_tensor);
if (scratch_data == NULL) {
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
@@ -1642,26 +1639,37 @@ struct ggml_tensor * ggml_new_tensor_impl(
} }
*obj_new = (struct ggml_object) { *obj_new = (struct ggml_object) {
.offset = cur_end + GGML_OBJECT_SIZE, .offs = cur_end + GGML_OBJECT_SIZE,
.size = size_needed, .size = size_needed,
.next = NULL, .next = NULL,
}; };
} else { } else if (data == NULL) {
if (size_needed > scratch_size) { if (size_needed > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory\n", __func__); GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
assert(false); assert(false);
return NULL; return NULL;
} }
data = scratch_data; if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
ctx->scratch.offs = 0;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
*obj_new = (struct ggml_object) { *obj_new = (struct ggml_object) {
.offset = cur_end + GGML_OBJECT_SIZE, .offs = cur_end + GGML_OBJECT_SIZE,
.size = sizeof(struct ggml_tensor), .size = sizeof(struct ggml_tensor),
.next = NULL, .next = NULL,
}; };
ctx->scratch.k++; ctx->scratch.offs += size_needed;
} }
if (obj_cur != NULL) { if (obj_cur != NULL) {
@@ -1675,7 +1683,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
//GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end); //GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end);
struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offset); struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
ggml_assert_aligned(result); ggml_assert_aligned(result);

@@ -303,13 +303,9 @@ struct ggml_cgraph {
// scratch buffer // scratch buffer
struct ggml_scratch { struct ggml_scratch {
int k; size_t offs;
size_t size;
size_t size0; void * data;
void * data0;
size_t size1;
void * data1;
}; };
struct ggml_init_params { struct ggml_init_params {

@@ -218,10 +218,10 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
static const size_t MB = 1024*1024; static const size_t MB = 1024*1024;
static const std::map<e_model, size_t> MEM_REQ_SCRATCH = { static const std::map<e_model, size_t> MEM_REQ_SCRATCH = {
{ MODEL_TINY, 32ull*MB }, { MODEL_TINY, 132ull*MB },
{ MODEL_BASE, 44ull*MB }, { MODEL_BASE, 144ull*MB },
{ MODEL_SMALL, 64ull*MB }, { MODEL_SMALL, 164ull*MB },
{ MODEL_MEDIUM, 84ull*MB }, { MODEL_MEDIUM, 184ull*MB },
{ MODEL_LARGE, 110ull*MB }, { MODEL_LARGE, 110ull*MB },
}; };
@@ -1346,7 +1346,7 @@ static bool whisper_encode(
struct ggml_tensor * cur; struct ggml_tensor * cur;
ggml_set_scratch(ctx0, { 0, wctx.buf_scratch.size()/2, wctx.buf_scratch.data(), wctx.buf_scratch.size()/2, wctx.buf_scratch.data() + wctx.buf_scratch.size()/2 }); ggml_set_scratch(ctx0, { 0, wctx.buf_scratch.size(), wctx.buf_scratch.data(), });
// convolution + gelu // convolution + gelu
{ {
@@ -1370,7 +1370,7 @@ static bool whisper_encode(
cur = ggml_gelu(ctx0, cur); cur = ggml_gelu(ctx0, cur);
} }
ggml_set_scratch(ctx0, { 0, 0, nullptr, 0, nullptr }); ggml_set_scratch(ctx0, { 0, 0, nullptr, });
// =================================================================== // ===================================================================
// NOTE: experimenting with partial evaluation of the encoder (ignore) // NOTE: experimenting with partial evaluation of the encoder (ignore)
@@ -1411,6 +1411,8 @@ static bool whisper_encode(
struct ggml_context * ctxL = ggml_init(paramsL); struct ggml_context * ctxL = ggml_init(paramsL);
ggml_set_scratch(ctxL, { 0, wctx.buf_scratch.size(), wctx.buf_scratch.data(), });
// norm // norm
{ {
cur = ggml_norm(ctxL, inpL); cur = ggml_norm(ctxL, inpL);
@@ -1423,6 +1425,8 @@ static bool whisper_encode(
ggml_repeat(ctxL, layer.attn_ln_0_b, cur)); ggml_repeat(ctxL, layer.attn_ln_0_b, cur));
} }
ggml_set_scratch(ctxL, { 0, 0, nullptr, });
// self-attention // self-attention
{ {
struct ggml_tensor * Qcur = ggml_mul_mat(ctxL, struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,

Loading…
Cancel
Save