whisper.cpp/ggml.h

#pragma once

// the standard headers are pulled in before the linkage block is opened:
// C++ only allows a standard header to be included outside of any declaration,
// and an extern "C" { ... } block is itself a declaration
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

// give every declaration that follows C linkage when compiled as C++
#ifdef  __cplusplus
extern "C" {
#endif

// fixed capacity limits

// length of the ne[] and nb[] arrays in struct ggml_tensor
#define GGML_MAX_DIMS     4
// length of the nodes[], grads[] and leafs[] arrays in struct ggml_cgraph
#define GGML_MAX_NODES    4096
// not referenced anywhere else in this header - presumably consumed by ggml.c (TODO confirm)
#define GGML_MAX_PARAMS   16
// not referenced anywhere else in this header - presumably the maximum number of live contexts (TODO confirm)
#define GGML_MAX_CONTEXTS 16
// length of the opt[] array in struct ggml_tensor
#define GGML_MAX_OPT      4

// 16-bit float storage type; the underlying type depends on the target
#ifdef __ARM_NEON
// we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t;
#else
// no built-in type is used here: the value lives in a 16-bit unsigned integer
// (presumably the raw half-precision bit pattern - TODO confirm in ggml.c)
typedef uint16_t ggml_fp16_t;
#endif

// conversions between ggml_fp16_t and float, in both directions
float ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x);

// opaque types: only forward-declared in this header and handled through pointers
struct ggml_object;
struct ggml_context;

// element types of a tensor (see ggml_tensor.type)
// the types are numbered consecutively from 0
enum ggml_type {
    GGML_TYPE_I8  = 0,
    GGML_TYPE_I16 = 1,
    GGML_TYPE_I32 = 2,
    GGML_TYPE_F16 = 3,
    GGML_TYPE_F32 = 4,

    // not a type: follows the last type, so its value is the number of types
    GGML_TYPE_COUNT,
};

// operation identifiers (see ggml_tensor.op)
// apart from GGML_OP_NONE, the names mirror the ggml_* operation functions
// declared later in this header (GGML_OP_VIEW matches the ggml_view_* family by name)
// the enumerators are numbered implicitly, so GGML_OP_COUNT has to stay last
enum ggml_op {
    GGML_OP_NONE = 0,

    // names in this group match the functions declared under
    // "operations on tensors with backpropagation"
    GGML_OP_DUP,
    GGML_OP_ADD,
    GGML_OP_SUB,
    GGML_OP_MUL,
    GGML_OP_DIV,
    GGML_OP_SQR,
    GGML_OP_SQRT,
    GGML_OP_SUM,
    GGML_OP_MEAN,
    GGML_OP_REPEAT,
    GGML_OP_ABS,
    GGML_OP_SGN,
    GGML_OP_NEG,
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
    GGML_OP_NORM, // normalize

    GGML_OP_MUL_MAT,

    // names from here on match the functions declared under
    // "operations on tensors without backpropagation"
    GGML_OP_SCALE,
    GGML_OP_CPY,
    GGML_OP_RESHAPE,
    GGML_OP_VIEW,
    GGML_OP_PERMUTE,
    GGML_OP_TRANSPOSE,
    GGML_OP_GET_ROWS,
    GGML_OP_DIAG_MASK_INF,
    GGML_OP_SOFT_MAX,
    GGML_OP_ROPE,
    GGML_OP_CONV_1D_1S,
    GGML_OP_CONV_1D_2S,

    GGML_OP_FLASH_ATTN,
    GGML_OP_FLASH_FF,

    // not an operation: follows the last operation, so its value is the number of enumerators above
    GGML_OP_COUNT,
};

// n-dimensional tensor
struct ggml_tensor {
    // element type
    enum ggml_type type;

    // number of dimensions; ne[] and nb[] below have room for GGML_MAX_DIMS of them
    int    n_dims;
    int    ne[GGML_MAX_DIMS]; // number of elements
    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
                              // nb[0] = sizeof(type)
                              // nb[1] = nb[0]   * ne[0] + padding
                              // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    // NOTE(review): presumably the operation that produces this tensor, with
    //               src0/src1/opt below as its inputs - confirm in ggml.c
    enum ggml_op op;

    // NOTE(review): presumably set through ggml_set_param() - confirm in ggml.c
    bool is_param;

    // links to other tensors
    // NOTE(review): grad presumably holds the gradient used by the backward graph,
    //               opt[] presumably carries extra operands for ops that need more
    //               than src0/src1 - confirm in ggml.c
    struct ggml_tensor * grad;
    struct ggml_tensor * src0;
    struct ggml_tensor * src1;
    struct ggml_tensor * opt[GGML_MAX_OPT];

    // thread scheduling
    int n_tasks;

    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;

    // pointer to the tensor's data (also exposed through ggml_get_data())
    void * data;
    // NOTE(review): the purpose of this trailing padding is not visible in this
    //               header - presumably it keeps the struct size at a required
    //               multiple; confirm in ggml.c before changing the layout
    char padding[8];
};

// computation graph
//
// the three pointer tables below have GGML_MAX_NODES entries each, so this
// struct is large; note that ggml_build_forward() and ggml_build_backward()
// return it by value
struct ggml_cgraph {
    // NOTE(review): presumably the number of entries in use in nodes[]/grads[]
    //               and in leafs[] respectively - confirm in ggml.c
    int n_nodes;
    int n_leafs;
    // NOTE(review): presumably the thread count used by ggml_graph_compute() - confirm
    int n_threads;

    // NOTE(review): presumably a scratch buffer and its size, used while computing
    //               the graph - confirm in ggml.c
    size_t work_size;
    struct ggml_tensor * work;

    // fixed-capacity tables of tensor pointers
    struct ggml_tensor * nodes[GGML_MAX_NODES];
    struct ggml_tensor * grads[GGML_MAX_NODES];
    struct ggml_tensor * leafs[GGML_MAX_NODES];

    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;
};

// parameters for ggml_init(), passed by value
struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
};

// timing helpers
// NOTE(review): ggml_time_init() presumably has to run once before the other
//               time functions are used - confirm in ggml.c
void ggml_time_init(void);
// time readings; per the names the units are milliseconds and microseconds
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
// cycle counter reading and its rate per millisecond (per the names - source of the counter not visible here)
int64_t ggml_cycles(void);
int64_t ggml_cycles_per_ms(void);

// debug printing of a single object / of a context's objects
// both take their argument as const; the output destination is not visible in this header
void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);

// size queries on a tensor; none of them modifies its argument
// per the names: element count and byte size of the tensor
// NOTE(review): the element count is returned as int while the byte size is size_t
int    ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes   (const struct ggml_tensor * tensor);

// per the names: size of one element, looked up by type or taken from a tensor
// (presumably in bytes, matching nb[0] = sizeof(type) in struct ggml_tensor - TODO confirm)
size_t ggml_type_size   (enum ggml_type type);
size_t ggml_element_size(const struct ggml_tensor * tensor);

// context lifecycle
// ggml_init() takes its parameters by value; see struct ggml_init_params for
// the memory pool fields (mem_buffer == NULL means internal allocation)
// NOTE(review): whether ggml_init() can return NULL is not visible here - check ggml.c
struct ggml_context * ggml_init(struct ggml_init_params params);
// releases a context obtained from ggml_init() (presumably - TODO confirm what happens to a user-provided mem_buffer)
void ggml_free(struct ggml_context * ctx);

// per the name: amount of the context's memory currently in use
size_t ggml_used_mem(const struct ggml_context * ctx);

// tensor creation
//
// generic form: element type, number of dimensions and a pointer to the sizes
// NOTE(review): presumably n_dims entries of ne are read and n_dims must not
//               exceed GGML_MAX_DIMS - confirm in ggml.c
struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
        const int *ne);

// fixed-rank variants: the sizes are passed as separate arguments ne0..ne3
struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0);

struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1);

struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1,
        int    ne2);

struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1,
        int    ne2,
        int    ne3);

// new tensor created from a single int32 / float value
// (presumably a one-element tensor holding that value - TODO confirm)
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

// new tensor derived from an existing one; src is not modified (const)
// NOTE(review): per the names, dup presumably gets its own data while view
//               shares the data of src - confirm in ggml.c
struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

// whole-tensor setters; each returns a tensor pointer
// (presumably the tensor that was passed in - TODO confirm)
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

// single-element access by index i
// NOTE(review): the two *_1d setters take the tensor as const even though they
//               store a value - presumably they write through tensor->data;
//               kept as-is because the definitions in ggml.c must match
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

// data pointer of a tensor, untyped or as float *
// both return a non-const pointer from a const tensor
 void * ggml_get_data    (const struct ggml_tensor * tensor);
float * ggml_get_data_f32(const struct ggml_tensor * tensor);

//
// operations on tensors with backpropagation
//

// every function in this group takes the context plus its operand tensor(s)
// and returns a tensor pointer; what each one computes is indicated only by
// its name in this header - see ggml.c for the definitions

// one operand
struct ggml_tensor * ggml_dup(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// two operands
struct ggml_tensor * ggml_add(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_mul(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_div(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// one operand
struct ggml_tensor * ggml_sqr(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_sqrt(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// return scalar
// NOTE(review): the return type is still a tensor pointer, so "scalar"
//               presumably means a tensor holding a single element
// TODO: compute sum along rows
struct ggml_tensor * ggml_sum(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// mean along rows
struct ggml_tensor * ggml_mean(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b
// (so the result may or may not be a itself - callers should not assume either)
struct ggml_tensor * ggml_repeat(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// single-operand functions: each takes the context and one tensor and returns
// a tensor pointer; the computation is indicated only by the name here
struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_step(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// TODO: double-check this computation is correct
struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
//       (there is no parameter for it in this signature)
struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// matrix multiplication
// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
// NOTE(review): A and B presumably refer to the parameters a and b - the
//               upper-case names do not appear in the signature
struct ggml_tensor * ggml_mul_mat(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

//
// operations on tensors without backpropagation
//

// in-place, returns view(a)
// NOTE(review): b presumably carries the scale factor - confirm in ggml.c
struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// a -> b, return view(b)
// (a is the source and b the destination of the copy)
struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// return view(a)
// the new shape is given directly as ne0, ne1
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1);

// return view(a)
// the new shape is given directly as ne0, ne1, ne2
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1,
        int                   ne2);

// view of ne0 elements of a
// offset in bytes
struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        size_t                offset);

// view of ne0 x ne1 elements of a with an explicit row stride
// NOTE(review): offset is presumably in bytes here as well, as for ggml_view_1d - confirm
struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1,
        size_t                nb1, // row stride in bytes
        size_t                offset);

// reorder the axes of a; one argument per axis, GGML_MAX_DIMS (4) in total
// NOTE(review): whether axisN names the source or the destination position of
//               axis N is not visible in this header - confirm in ggml.c
struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   axis0,
        int                   axis1,
        int                   axis2,
        int                   axis3);

// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
struct ggml_tensor * ggml_transpose(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// NOTE(review): per the name this presumably gathers rows of a selected by the
//               entries of b - confirm operand roles and types in ggml.c
struct ggml_tensor * ggml_get_rows(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// set elements above the diagonal to -INF
// in-place, returns view(a)
// NOTE(review): n_past presumably shifts the position of the diagonal - confirm
struct ggml_tensor * ggml_diag_mask_inf(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past);

// in-place, returns view(a)
struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// rotary position embedding
// in-place, returns view(a)
// if mode == 1, skip n_past elements
// NOTE(review): the meaning of n_dims and of other mode values is not visible
//               in this header - confirm in ggml.c
// TODO: avoid creating a new tensor every time
struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        int                   n_dims,
        int                   mode);

// 1D convolution variants; neither takes stride, padding or dilation arguments
// padding = 1
// TODO: we don't support extra parameters for now
//       that's why we are hard-coding the stride, padding, and dilation
//       not great ..
// NOTE(review): per the _1s/_2s suffixes the hard-coded stride is presumably 1
//               and 2 respectively; which of a/b is the kernel is not visible here
struct ggml_tensor * ggml_conv_1d_1s(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_conv_1d_2s(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// operations that take more than two tensor operands
// NOTE(review): the extra operands are presumably kept in ggml_tensor.opt[] -
//               confirm in ggml.c

// attention over q, k, v with an on/off masking flag
// NOTE(review): the exact computation and the mask shape are not visible in this header
struct ggml_tensor * ggml_flash_attn(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        bool                  masked);

// NOTE(review): per the name presumably a fused feed-forward block over a with
//               two parameter pairs (b0, b1) and (c0, c1) - confirm roles in ggml.c
struct ggml_tensor * ggml_flash_ff(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b0,
        struct ggml_tensor  * b1,
        struct ggml_tensor  * c0,
        struct ggml_tensor  * c1);

//
// automatic differentiation
//

// NOTE(review): presumably marks the tensor as a parameter (ggml_tensor.is_param) - confirm in ggml.c
void ggml_set_param(
        struct ggml_context * ctx,
        struct ggml_tensor * tensor);

// extends an existing graph in place, starting from tensor
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

// both builders return the graph by value (struct ggml_cgraph holds three
// GGML_MAX_NODES-sized pointer tables, so the returned object is large)
// NOTE(review): the meaning of `keep` is not visible in this header - confirm in ggml.c
struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

// run / reset a graph; both take the graph as non-const
// NOTE(review): what exactly reset clears is not visible here - confirm in ggml.c
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
void ggml_graph_reset  (struct ggml_cgraph * cgraph);

// print info and performance information for the graph
void ggml_graph_print(const struct ggml_cgraph * cgraph);

// dump the graph into a file using the dot format
// gb and gf are presumably the backward and forward graphs (naming as in ggml_build_backward) - confirm
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

//
// optimization
//

// optimizers that can be requested through ggml_opt_params.type
// and ggml_opt_default_params()
enum ggml_opt_type {
    GGML_OPT_ADAM  = 0,
    GGML_OPT_LBFGS = 1,
};

// line search methods (see ggml_opt_params.lbfgs.linesearch)
enum ggml_linesearch {
    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,

    // the default is an alias of the backtracking Wolfe method (value 1)
    GGML_LINESEARCH_DEFAULT = GGML_LINESEARCH_BACKTRACKING_WOLFE,
};

// status codes returned by ggml_opt()
enum ggml_opt_result {
    // optimizer-level codes: 0 and up
    GGML_OPT_OK               = 0,
    GGML_OPT_DID_NOT_CONVERGE = 1,
    GGML_OPT_NO_CONTEXT       = 2,
    GGML_OPT_INVALID_WOLFE    = 3,
    GGML_OPT_FAIL             = 4,

    // line search codes: negative, counting up from -128
    GGML_LINESEARCH_FAIL               = -128,
    GGML_LINESEARCH_MINIMUM_STEP       = -127,
    GGML_LINESEARCH_MAXIMUM_STEP       = -126,
    GGML_LINESEARCH_MAXIMUM_ITERATIONS = -125,
    GGML_LINESEARCH_INVALID_PARAMETERS = -124,
};

// optimization parameters
//
//   see ggml.c (ggml_opt_default_params) for default values
//
//   passed by value to ggml_opt()
//
struct ggml_opt_params {
    // selected optimizer
    // NOTE(review): presumably decides which member of the union at the end of
    //               this struct is read - confirm in ggml.c
    enum ggml_opt_type type;

    int n_threads;

    // delta-based convergence test
    //
    //   if past == 0 - disabled
    //   if past > 0:
    //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
    //
    int past;
    float delta;

    // maximum number of iterations without improvement
    //
    //   if 0 - disabled
    //   if > 0:
    //     assume convergence if no cost improvement in this number of iterations
    //
    int max_no_improvement;

    // NOTE(review): per the names these presumably enable printing of the
    //               forward / backward graph during optimization - confirm
    bool print_forward_graph;
    bool print_backward_graph;

    // anonymous union (C11): adam and lbfgs share the same storage, so only
    // one of the two parameter sets holds meaningful values at a time
    union {
        // ADAM parameters
        struct {
            int n_iter;

            float alpha; // learning rate
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float eps_f; // epsilon for convergence test
            float eps_g; // epsilon for convergence test
        } adam;

        // LBFGS parameters
        struct {
            int m; // number of corrections to approximate the inv. Hessian
            int n_iter;
            int max_linesearch;

            float eps;      // convergence tolerance
            float ftol;     // line search tolerance
            float wolfe;
            float min_step;
            float max_step;

            // line search method, one of enum ggml_linesearch
            enum ggml_linesearch linesearch;
        } lbfgs;
    };
};

// returns a parameter set for the given optimizer type, by value
// (the default values themselves live in ggml.c)
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

// optimize the function defined by the tensor f
// params is passed by value; the outcome is reported as an enum ggml_opt_result
enum ggml_opt_result ggml_opt(
        struct ggml_context * ctx,
        struct ggml_opt_params params,
        struct ggml_tensor * f);

#ifdef  __cplusplus
}
#endif
Initial release 2 years ago			`#pragma once`

			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`#include <stdint.h>`
			`#include <stddef.h>`
			`#include <stdbool.h>`

			`#define GGML_MAX_DIMS 4`
			`#define GGML_MAX_NODES 4096`
			`#define GGML_MAX_PARAMS 16`
			`#define GGML_MAX_CONTEXTS 16`
Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`#define GGML_MAX_OPT 4`
Initial release 2 years ago
			`#ifdef __ARM_NEON`
			`// we use the built-in 16-bit float type`
			`typedef __fp16 ggml_fp16_t;`
			`#else`
			`typedef uint16_t ggml_fp16_t;`
			`#endif`

			`float ggml_fp16_to_fp32(ggml_fp16_t x);`
			`ggml_fp16_t ggml_fp32_to_fp16(float x);`

			`struct ggml_object;`
			`struct ggml_context;`

			`enum ggml_type {`
			`GGML_TYPE_I8,`
			`GGML_TYPE_I16,`
			`GGML_TYPE_I32,`
			`GGML_TYPE_F16,`
			`GGML_TYPE_F32,`
			`GGML_TYPE_COUNT,`
			`};`

			`enum ggml_op {`
			`GGML_OP_NONE = 0,`

			`GGML_OP_DUP,`
			`GGML_OP_ADD,`
			`GGML_OP_SUB,`
			`GGML_OP_MUL,`
			`GGML_OP_DIV,`
			`GGML_OP_SQR,`
			`GGML_OP_SQRT,`
			`GGML_OP_SUM,`
			`GGML_OP_MEAN,`
			`GGML_OP_REPEAT,`
			`GGML_OP_ABS,`
			`GGML_OP_SGN,`
			`GGML_OP_NEG,`
			`GGML_OP_STEP,`
			`GGML_OP_RELU,`
			`GGML_OP_GELU,`
			`GGML_OP_NORM, // normalize`

			`GGML_OP_MUL_MAT,`

			`GGML_OP_SCALE,`
			`GGML_OP_CPY,`
			`GGML_OP_RESHAPE,`
			`GGML_OP_VIEW,`
			`GGML_OP_PERMUTE,`
			`GGML_OP_TRANSPOSE,`
			`GGML_OP_GET_ROWS,`
			`GGML_OP_DIAG_MASK_INF,`
			`GGML_OP_SOFT_MAX,`
			`GGML_OP_ROPE,`
			`GGML_OP_CONV_1D_1S,`
			`GGML_OP_CONV_1D_2S,`

Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`GGML_OP_FLASH_ATTN,`
			`GGML_OP_FLASH_FF,`

Initial release 2 years ago			`GGML_OP_COUNT,`
			`};`

			`// n-dimensional tensor`
			`struct ggml_tensor {`
			`enum ggml_type type;`

			`int n_dims;`
			`int ne[GGML_MAX_DIMS]; // number of elements`
			`size_t nb[GGML_MAX_DIMS]; // stride in bytes:`
			`// nb[0] = sizeof(type)`
			`// nb[1] = nb[0] * ne[0] + padding`
			`// nb[i] = nb[i-1] * ne[i-1]`

			`// compute data`
			`enum ggml_op op;`

			`bool is_param;`

			`struct ggml_tensor * grad;`
			`struct ggml_tensor * src0;`
			`struct ggml_tensor * src1;`
Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`struct ggml_tensor * opt[GGML_MAX_OPT];`
Initial release 2 years ago
			`// thread scheduling`
			`int n_tasks;`

			`// performance`
			`int perf_runs;`
			`int64_t perf_cycles;`
			`int64_t perf_time_us;`

			`void * data;`
wip : rpi4 support 2 years ago			`char padding[8];`
Initial release 2 years ago			`};`

			`// computation graph`
			`struct ggml_cgraph {`
			`int n_nodes;`
			`int n_leafs;`
			`int n_threads;`

			`size_t work_size;`
			`struct ggml_tensor * work;`

			`struct ggml_tensor * nodes[GGML_MAX_NODES];`
			`struct ggml_tensor * grads[GGML_MAX_NODES];`
			`struct ggml_tensor * leafs[GGML_MAX_NODES];`

			`// performance`
			`int perf_runs;`
			`int64_t perf_cycles;`
			`int64_t perf_time_us;`
			`};`

			`struct ggml_init_params {`
			`// memory pool`
			`size_t mem_size; // bytes`
			`void * mem_buffer; // if NULL, memory will be allocated internally`
			`};`

Building with MSVC 2 years ago			`void ggml_time_init(void);`
Initial release 2 years ago			`int64_t ggml_time_ms(void);`
			`int64_t ggml_time_us(void);`
			`int64_t ggml_cycles(void);`
			`int64_t ggml_cycles_per_ms(void);`

			`void ggml_print_object (const struct ggml_object * obj);`
			`void ggml_print_objects(const struct ggml_context * ctx);`

			`int ggml_nelements(const struct ggml_tensor * tensor);`
			`size_t ggml_nbytes (const struct ggml_tensor * tensor);`

			`size_t ggml_type_size (enum ggml_type type);`
			`size_t ggml_element_size(const struct ggml_tensor * tensor);`

			`struct ggml_context * ggml_init(struct ggml_init_params params);`
			`void ggml_free(struct ggml_context * ctx);`

			`size_t ggml_used_mem(const struct ggml_context * ctx);`

			`struct ggml_tensor * ggml_new_tensor(`
			`struct ggml_context * ctx,`
			`enum ggml_type type,`
			`int n_dims,`
			`const int *ne);`

			`struct ggml_tensor * ggml_new_tensor_1d(`
			`struct ggml_context * ctx,`
			`enum ggml_type type,`
			`int ne0);`

			`struct ggml_tensor * ggml_new_tensor_2d(`
			`struct ggml_context * ctx,`
			`enum ggml_type type,`
			`int ne0,`
			`int ne1);`

			`struct ggml_tensor * ggml_new_tensor_3d(`
			`struct ggml_context * ctx,`
			`enum ggml_type type,`
			`int ne0,`
			`int ne1,`
			`int ne2);`

			`struct ggml_tensor * ggml_new_tensor_4d(`
			`struct ggml_context * ctx,`
			`enum ggml_type type,`
			`int ne0,`
			`int ne1,`
			`int ne2,`
			`int ne3);`

Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);`
Initial release 2 years ago			`struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);`

			`struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);`
			`struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);`

			`struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);`
Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);`
Initial release 2 years ago			`struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);`

Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);`
			`void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);`

Initial release 2 years ago			`float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);`
			`void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);`

			`void * ggml_get_data (const struct ggml_tensor * tensor);`
			`float * ggml_get_data_f32(const struct ggml_tensor * tensor);`

			`//`
			`// operations on tensors with backpropagation`
			`//`

			`struct ggml_tensor * ggml_dup(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_add(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`struct ggml_tensor * ggml_sub(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`struct ggml_tensor * ggml_mul(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`struct ggml_tensor * ggml_div(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`struct ggml_tensor * ggml_sqr(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_sqrt(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// return scalar`
			`// TODO: compute sum along rows`
			`struct ggml_tensor * ggml_sum(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// mean along rows`
			`struct ggml_tensor * ggml_mean(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// if a is the same shape as b, and a is not parameter, return a`
			`// otherwise, return a new tensor: repeat(a) to fit in b`
			`struct ggml_tensor * ggml_repeat(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`struct ggml_tensor * ggml_abs(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_sgn(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_neg(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_step(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_relu(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// TODO: double-check this computation is correct`
			`struct ggml_tensor * ggml_gelu(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// normalize along rows`
			`// TODO: eps is hardcoded to 1e-5 for now`
			`struct ggml_tensor * ggml_norm(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// A: m rows, n columns`
			`// B: p rows, n columns (i.e. we transpose it internally)`
			`// result is m columns, p rows`
			`struct ggml_tensor * ggml_mul_mat(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`//`
			`// operations on tensors without backpropagation`
			`//`

			`// in-place, returns view(a)`
			`struct ggml_tensor * ggml_scale(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`// a -> b, return view(b)`
			`struct ggml_tensor * ggml_cpy(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`// return view(a), b specifies the new shape`
			`// TODO: when we start computing gradient, make a copy instead of view`
			`struct ggml_tensor * ggml_reshape(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`// return view(a)`
			`// TODO: when we start computing gradient, make a copy instead of view`
			`struct ggml_tensor * ggml_reshape_2d(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int ne0,`
			`int ne1);`

			`// return view(a)`
			`// TODO: when we start computing gradient, make a copy instead of view`
			`struct ggml_tensor * ggml_reshape_3d(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int ne0,`
			`int ne1,`
			`int ne2);`

			`// offset in bytes`
			`struct ggml_tensor * ggml_view_1d(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int ne0,`
			`size_t offset);`

			`struct ggml_tensor * ggml_view_2d(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int ne0,`
			`int ne1,`
			`size_t nb1, // row stride in bytes`
			`size_t offset);`

			`struct ggml_tensor * ggml_permute(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int axis0,`
			`int axis1,`
			`int axis2,`
			`int axis3);`

			`// alias for ggml_permute(ctx, a, 1, 0, 2, 3)`
			`struct ggml_tensor * ggml_transpose(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`struct ggml_tensor * ggml_get_rows(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`// set elements above the diagonal to -INF`
			`// in-place, returns view(a)`
			`struct ggml_tensor * ggml_diag_mask_inf(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int n_past);`

			`// in-place, returns view(a)`
			`struct ggml_tensor * ggml_soft_max(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a);`

			`// rotary position embedding`
			`// in-place, returns view(a)`
			`// if mode == 1, skip n_past elements`
			`// TODO: avoid creating a new tensor every time`
			`struct ggml_tensor * ggml_rope(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`int n_past,`
			`int n_dims,`
			`int mode);`

			`// padding = 1`
			`// TODO: we don't support extra parameters for now`
			`// that's why we are hard-coding the stride, padding, and dilation`
			`// not great ..`
			`struct ggml_tensor * ggml_conv_1d_1s(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

			`struct ggml_tensor * ggml_conv_1d_2s(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b);`

Flash + language support (ref #2) - Achieved big performance improvement + memory usage reduction - Can now translate / transcribe different languages 2 years ago			`struct ggml_tensor * ggml_flash_attn(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * q,`
			`struct ggml_tensor * k,`
			`struct ggml_tensor * v,`
			`bool masked);`

			`struct ggml_tensor * ggml_flash_ff(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * a,`
			`struct ggml_tensor * b0,`
			`struct ggml_tensor * b1,`
			`struct ggml_tensor * c0,`
			`struct ggml_tensor * c1);`

Initial release 2 years ago			`//`
			`// automatic differentiation`
			`//`

			`void ggml_set_param(`
			`struct ggml_context * ctx,`
			`struct ggml_tensor * tensor);`

			`void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);`

			`struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);`
			`struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);`

			`void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);`
			`void ggml_graph_reset (struct ggml_cgraph * cgraph);`

			`// print info and performance information for the graph`
			`void ggml_graph_print(const struct ggml_cgraph * cgraph);`

			`// dump the graph into a file using the dot format`
			`void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);`

			`//`
			`// optimization`
			`//`

			`// optimization methods`
			`enum ggml_opt_type {`
			`GGML_OPT_ADAM,`
			`GGML_OPT_LBFGS,`
			`};`

			`// linesearch methods`
			`enum ggml_linesearch {`
			`GGML_LINESEARCH_DEFAULT = 1,`

			`GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,`
			`GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,`
			`GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,`
			`};`

			`// optimization return values`
			`enum ggml_opt_result {`
			`GGML_OPT_OK = 0,`
			`GGML_OPT_DID_NOT_CONVERGE,`
			`GGML_OPT_NO_CONTEXT,`
			`GGML_OPT_INVALID_WOLFE,`
			`GGML_OPT_FAIL,`

			`GGML_LINESEARCH_FAIL = -128,`
			`GGML_LINESEARCH_MINIMUM_STEP,`
			`GGML_LINESEARCH_MAXIMUM_STEP,`
			`GGML_LINESEARCH_MAXIMUM_ITERATIONS,`
			`GGML_LINESEARCH_INVALID_PARAMETERS,`
			`};`

			`// optimization parameters`
			`//`
			`// see ggml.c (ggml_opt_default_params) for default values`
			`//`
			`struct ggml_opt_params {`
			`enum ggml_opt_type type;`

			`int n_threads;`

			`// delta-based convergence test`
			`//`
			`// if past == 0 - disabled`
			`// if past > 0:`
			`// stop if \|f(x) - f(x_past)\| < delta * max(1, \|f(x)\|)`
			`//`
			`int past;`
			`float delta;`

			`// maximum number of iterations without improvement`
			`//`
			`// if 0 - disabled`
			`// if > 0:`
			`// assume convergence if no cost improvement in this number of iterations`
			`//`
			`int max_no_improvement;`

			`bool print_forward_graph;`
			`bool print_backward_graph;`

			`union {`
			`// ADAM parameters`
			`struct {`
			`int n_iter;`

			`float alpha; // learning rate`
			`float beta1;`
			`float beta2;`
			`float eps; // epsilon for numerical stability`
			`float eps_f; // epsilon for convergence test`
			`float eps_g; // epsilon for convergence test`
			`} adam;`

			`// LBFGS parameters`
			`struct {`
			`int m; // number of corrections to approximate the inv. Hessian`
			`int n_iter;`
			`int max_linesearch;`

			`float eps; // convergence tolerance`
			`float ftol; // line search tolerance`
			`float wolfe;`
			`float min_step;`
			`float max_step;`

			`enum ggml_linesearch linesearch;`
			`} lbfgs;`
			`};`
			`};`

			`struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);`

			`// optimize the function defined by the tensor f`
			`enum ggml_opt_result ggml_opt(`
			`struct ggml_context * ctx,`
			`struct ggml_opt_params params,`
			`struct ggml_tensor * f);`

			`#ifdef __cplusplus`
			`}`
			`#endif`