|
|
|
@ -1,5 +1,174 @@
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// GGML Tensor Library
|
|
|
|
|
//
|
|
|
|
|
// This documentation is still a work in progress.
|
|
|
|
|
// If you wish some specific topics to be covered, feel free to drop a comment:
|
|
|
|
|
//
|
|
|
|
|
// https://github.com/ggerganov/whisper.cpp/issues/40
|
|
|
|
|
//
|
|
|
|
|
// ## Overview
|
|
|
|
|
//
|
|
|
|
|
// This library implements:
|
|
|
|
|
//
|
|
|
|
|
// - a set of tensor operations
|
|
|
|
|
// - automatic differentiation
|
|
|
|
|
// - basic optimization algorithms
|
|
|
|
|
//
|
|
|
|
|
// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
|
|
|
|
|
// but is not limited to, the following:
|
|
|
|
|
//
|
|
|
|
|
// - linear regression
|
|
|
|
|
// - support vector machines
|
|
|
|
|
// - neural networks
|
|
|
|
|
//
|
|
|
|
|
// The library allows the user to define a certain function using the available tensor operations. This function
|
|
|
|
|
// definition is represented internally via a computation graph. Each tensor operation in the function definition
|
|
|
|
|
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
|
|
|
|
|
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
|
|
|
|
|
// using one of the available optimization algorithms.
|
|
|
|
|
//
|
|
|
|
|
// For example, here we define the function: f(x) = a*x^2 + b
|
|
|
|
|
//
|
|
|
|
|
// {
|
|
|
|
|
// struct ggml_init_params params = {
|
|
|
|
|
// .mem_size = 16*1024*1024,
|
|
|
|
|
// .mem_buffer = NULL,
|
|
|
|
|
// };
|
|
|
|
|
//
|
|
|
|
|
// // memory allocation happens here
|
|
|
|
|
// struct ggml_context * ctx = ggml_init(params);
|
|
|
|
|
//
|
|
|
|
|
// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
|
|
|
|
//
|
|
|
|
|
// ggml_set_param(ctx, x); // x is an input variable
|
|
|
|
|
//
|
|
|
|
|
// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
|
|
|
|
// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
|
|
|
|
// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
|
|
|
|
|
// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
|
|
|
|
|
//
|
|
|
|
|
// ...
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// Notice that the function definition above does not involve any actual computation. The computation is performed only
|
|
|
|
|
// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
|
|
|
|
|
//
|
|
|
|
|
// {
|
|
|
|
|
// ...
|
|
|
|
|
//
|
|
|
|
|
// struct ggml_cgraph gf = ggml_build_forward(f);
|
|
|
|
|
//
|
|
|
|
|
// // set the input variable and parameter values
|
|
|
|
|
// ggml_set_f32(x, 2.0f);
|
|
|
|
|
// ggml_set_f32(a, 3.0f);
|
|
|
|
|
// ggml_set_f32(b, 4.0f);
|
|
|
|
|
//
|
|
|
|
|
// ggml_graph_compute(ctx, &gf);
|
|
|
|
|
//
|
|
|
|
|
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
|
|
|
|
//
|
|
|
|
|
// ...
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// The actual computation is performed in the ggml_graph_compute() function.
|
|
|
|
|
//
|
|
|
|
|
// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
|
|
|
|
|
// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
|
|
|
|
|
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory buffer
|
|
|
|
|
// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
|
|
|
|
|
// actually needed.
|
|
|
|
|
//
|
|
|
|
|
// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
|
|
|
|
|
// differentiation and optimization algorithms.
|
|
|
|
|
//
|
|
|
|
|
// The described approach allows the user to define the function graph once and then compute its forward or backward graphs
|
|
|
|
|
// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
|
|
|
|
|
// the user can avoid the memory allocation overhead at runtime.
|
|
|
|
|
//
|
|
|
|
|
// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
|
|
|
|
|
// citizens, but in theory the library can be extended to support FP8 and integer data types.
|
|
|
|
|
//
|
|
|
|
|
// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
|
|
|
|
|
// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
|
|
|
|
|
// clear that the library needs to support more complex operations. The way to support these operations is not clear
|
|
|
|
|
// yet, but a few examples are demonstrated in the following operations:
|
|
|
|
|
//
|
|
|
|
|
// - ggml_permute()
|
|
|
|
|
// - ggml_conv_1d_1s()
|
|
|
|
|
// - ggml_conv_1d_2s()
|
|
|
|
|
//
|
|
|
|
|
// For each tensor operator, the library implements a forward and backward computation function. The forward function
|
|
|
|
|
// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
|
|
|
|
|
// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
|
|
|
|
|
// calculus class, or watch the following video:
|
|
|
|
|
//
|
|
|
|
|
// What is Automatic Differentiation?
|
|
|
|
|
// https://www.youtube.com/watch?v=wG_nF1awSSY
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// ## Tensor data (struct ggml_tensor)
|
|
|
|
|
//
|
|
|
|
|
// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
|
|
|
|
|
// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
|
|
|
|
|
// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
|
|
|
|
|
//
|
|
|
|
|
// {
|
|
|
|
|
// struct ggml_tensor * c = ggml_add(ctx, a, b);
|
|
|
|
|
//
|
|
|
|
|
// assert(c->src[0] == a);
|
|
|
|
|
// assert(c->src[1] == b);
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
|
|
|
|
|
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
|
|
|
|
|
// storing tensors that are not contiguous in memory, which is useful for operations such as transposition and
|
|
|
|
|
// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
|
|
|
|
|
// contiguous in memory.
|
|
|
|
|
//
|
|
|
|
|
// The data of the tensor is accessed via the "data" pointer. For example:
|
|
|
|
|
//
|
|
|
|
|
// {
|
|
|
|
|
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
|
|
|
|
|
//
|
|
|
|
|
// // a[1, 2] = 1.0f;
|
|
|
|
|
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
|
|
|
|
//
|
|
|
|
|
// // a[1, 0] = 2.0f;
|
|
|
|
|
// *(float *) ((char *) a->data + 0*a->nb[1] + 1*a->nb[0]) = 2.0f;
|
|
|
|
|
//
|
|
|
|
|
// ...
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
|
|
|
|
|
//
|
|
|
|
|
// ## The matrix multiplication operator (ggml_mul_mat)
|
|
|
|
|
//
|
|
|
|
|
// TODO
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// ## Multi-threading
|
|
|
|
|
//
|
|
|
|
|
// TODO
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// ## Overview of ggml.c
|
|
|
|
|
//
|
|
|
|
|
// TODO
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// ## SIMD optimizations
|
|
|
|
|
//
|
|
|
|
|
// TODO
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// ## Debugging ggml
|
|
|
|
|
//
|
|
|
|
|
// TODO
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
|
extern "C" {
|
|
|
|
|
#endif
|
|
|
|
@ -21,7 +190,8 @@ typedef __fp16 ggml_fp16_t;
|
|
|
|
|
typedef uint16_t ggml_fp16_t;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
float ggml_fp16_to_fp32(ggml_fp16_t x);
|
|
|
|
|
// convert FP16 <-> FP32
|
|
|
|
|
float ggml_fp16_to_fp32(ggml_fp16_t x);
|
|
|
|
|
ggml_fp16_t ggml_fp32_to_fp16(float x);
|
|
|
|
|
|
|
|
|
|
struct ggml_object;
|
|
|
|
@ -36,6 +206,7 @@ enum ggml_type {
|
|
|
|
|
GGML_TYPE_COUNT,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// available tensor operations:
|
|
|
|
|
enum ggml_op {
|
|
|
|
|
GGML_OP_NONE = 0,
|
|
|
|
|
|
|
|
|
@ -136,7 +307,7 @@ struct ggml_init_params {
|
|
|
|
|
void * mem_buffer; // if NULL, memory will be allocated internally
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void ggml_time_init(void);
|
|
|
|
|
void ggml_time_init(void); // call this once at the beginning of the program
|
|
|
|
|
int64_t ggml_time_ms(void);
|
|
|
|
|
int64_t ggml_time_us(void);
|
|
|
|
|
int64_t ggml_cycles(void);
|
|
|
|
|