@@ -11,6 +11,14 @@
 #include <string>
 #include <vector>
 
+// determine number of model parts based on the dimension
+static const std::map<int, int> LLAMA_N_PARTS = {
+    { 4096, 1 },
+    { 5120, 2 },
+    { 6656, 4 },
+    { 8192, 8 },
+};
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     int32_t n_vocab = 32000;
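For a concrete reading of the table above (an illustration, not part of the patch): n_embd = 5120 corresponds to LLaMA 13B, so LLAMA_N_PARTS.at(5120) yields 2 parts, and the loader introduced further down opens the base file plus a ".1"-suffixed sibling. A minimal sketch, with a hypothetical model path:

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        static const std::map<int, int> LLAMA_N_PARTS = {
            { 4096, 1 }, { 5120, 2 }, { 6656, 4 }, { 8192, 8 },
        };
        const std::string fname = "models/13B/ggml-model-f16.bin"; // hypothetical path
        const int n_parts = LLAMA_N_PARTS.at(5120);                // -> 2 parts for 13B
        for (int i = 0; i < n_parts; ++i) {
            std::string fname_part = fname;
            if (i > 0) {
                fname_part += "." + std::to_string(i);             // same suffix rule as the patch
            }
            printf("part %d/%d: %s\n", i + 1, n_parts, fname_part.c_str());
        }
        return 0;
    }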
@@ -82,6 +90,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }
 
     int n_ff = 0;
+    int n_parts = 0;
 
     // load hparams
     {
@@ -99,6 +108,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         hparams.n_ctx = n_ctx;
 
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+        n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
@@ -109,6 +119,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
         printf("%s: f16     = %d\n", __func__, hparams.f16);
         printf("%s: n_ff    = %d\n", __func__, n_ff);
+        printf("%s: n_parts = %d\n", __func__, n_parts);
     }
 
     // load vocab
@@ -220,7 +231,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         model.layers.resize(n_layer);
 
-        model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
+        model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
 
         model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
         model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
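The two vocabulary-sized 2D tensors here dominate the non-layer memory. Rough arithmetic for illustration, using n_vocab = 32000 from the hparams defaults and n_embd = 4096 from the LLAMA_N_PARTS table:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_vocab = 32000; // hparams default above
        const int64_t n_embd  = 4096;  // 7B entry in LLAMA_N_PARTS
        const int64_t n = n_embd*n_vocab; // elements in tok_embeddings (output has the same shape)
        printf("%lld elements, %.1f MB at f16\n", (long long) n, n*2/1024.0/1024.0);
        return 0; // prints: 131072000 elements, 250.0 MB at f16
    }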
@@ -234,14 +245,14 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         for (int i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];
 
-            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 
-            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
             layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);
             layer.w2 = ggml_new_tensor_2d(ctx, wtype,   n_ff, n_embd);
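One detail worth keeping in mind for the loading changes in the next hunk: in ggml, ne[0] counts elements along the contiguous (row) dimension, so a tensor created as ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab) stores n_vocab rows of n_embd elements each. "Split by columns" below therefore means each part contributes a contiguous slice of every row, while "split by rows" means each part contributes whole rows. A minimal sketch of the resulting byte math (illustrative, assuming plain f32 rows, i.e. block size 1, mirroring the row_size expression in the patch):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const int ne0 = 4096, ne1 = 4096, n_parts = 2; // full tensor shape
        const size_t row_size = ne0*sizeof(float);     // bytes per full row

        // split_type == 0 (by columns): part p fills a slice of every row
        for (int p = 0; p < n_parts; ++p) {
            const int np0 = ne0/n_parts;               // columns per part
            const size_t offset = (size_t) p*np0*sizeof(float);
            printf("col-split part %d: %zu bytes per row, starting %zu bytes into each row\n",
                   p, row_size/n_parts, offset);
        }
        // split_type == 1 (by rows): part p fills rows [p*np1, (p+1)*np1)
        for (int p = 0; p < n_parts; ++p) {
            const int np1 = ne1/n_parts;               // rows per part
            printf("row-split part %d: rows start at byte %zu\n",
                   p, (size_t) p*np1*row_size);
        }
        return 0;
    }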
@@ -282,94 +293,208 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
+    const size_t file_offset = fin.tellg();
 
-        printf("%s: ", __func__);
+    fin.close();
 
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
+    std::vector<uint8_t> tmp;
 
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+    for (int i = 0; i < n_parts; ++i) {
+        const int part_id = i;
+        //const int part_id = n_parts - i - 1;
 
-            if (fin.eof()) {
-                break;
-            }
+        std::string fname_part = fname;
+        if (i > 0) {
+            fname_part += "." + std::to_string(i);
+        }
 
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
+        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
-            std::string name(length, 0);
-            fin.read(&name[0], length);
+        fin = std::ifstream(fname_part, std::ios::binary);
+        fin.seekg(file_offset);
 
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
+        // load weights
+        {
+            int n_tensors = 0;
+            size_t total_size = 0;
 
-            auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
+            printf("%s: ", __func__);
 
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
+            while (true) {
+                int32_t n_dims;
+                int32_t length;
+                int32_t ftype;
 
-            if (0) {
-                static const char *ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
+                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+                fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
 
+                if (fin.eof()) {
+                    break;
+                }
 
+                int32_t nelements = 1;
+                int32_t ne[2] = { 1, 1 };
+                for (int i = 0; i < n_dims; ++i) {
+                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                    nelements *= ne[i];
+                }
 
-            size_t bpe = 0;
+                std::string name(length, 0);
+                fin.read(&name[0], length);
 
-            switch (ftype) {
-                case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
-                case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
-                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                default:
-                        {
-                            fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                            return false;
-                        }
-            };
+                if (model.tensors.find(name.data()) == model.tensors.end()) {
+                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                    return false;
+                }
 
+                // split_type = 0: split by columns
+                // split_type = 1: split by rows
+                int split_type = 0;
 
+                // split_type = 0:
+                // regex:
+                //   - tok_embeddings.*
+                //   - layers.*.attention.wo.weight
+                //   - layers.*.feed_forward.w2.weight
 
+                // split_type = 1:
+                // regex:
+                //   - output.*
+                //   - layers.*.attention.wq.weight
+                //   - layers.*.attention.wk.weight
+                //   - layers.*.attention.wv.weight
+                //   - layers.*.feed_forward.w1.weight
+                //   - layers.*.feed_forward.w3.weight
 
+                if (name.find("tok_embeddings") != std::string::npos) {
+                    split_type = 0;
+                } else if (name.find("layers") != std::string::npos) {
+                    if (name.find("attention.wo.weight") != std::string::npos) {
+                        split_type = 0;
+                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
+                        split_type = 0;
+                    } else {
+                        split_type = 1;
+                    }
+                } else if (name.find("output") != std::string::npos) {
+                    split_type = 1;
+                }
 
+                auto tensor = model.tensors[name.data()];
 
+                if (n_dims == 1) {
+                    if (ggml_nelements(tensor) != nelements) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                        return false;
+                    }
+                } else {
+                    if (ggml_nelements(tensor)/n_parts != nelements) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                        return false;
+                    }
+                }
 
+                if (n_dims == 1) {
+                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        return false;
+                    }
+                } else {
+                    if (split_type == 0) {
+                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
+                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
+                            return false;
+                        }
+                    } else {
+                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
+                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
+                            return false;
+                        }
+                    };
+                }
 
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
+                if (0) {
+                    static const char *ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                    printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                }
 
+                size_t bpe = 0;
 
+                switch (ftype) {
+                    case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
+                    case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
+                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
+                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                    default:
+                            {
+                                fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                                return false;
+                            }
+                };
 
+                if (n_dims == 1 || n_parts == 1) {
+                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+                        return false;
+                    }
 
+                    if (part_id == 0) {
+                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    } else {
+                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                    }
 
+                    total_size += ggml_nbytes(tensor);
+                } else {
+                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
+                        return false;
+                    }
 
+                    if (split_type == 0) {
+                        const int np0 = ne[0];
 
+                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
+                        assert(row_size == tensor->nb[1]);
 
+                        for (int i1 = 0; i1 < ne[1]; ++i1) {
+                            const size_t offset_row = i1*row_size;
+                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
+                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
+                        }
+                    } else {
+                        const int np1 = ne[1];
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
 
+                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
 
-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
 
+                        for (int i1 = 0; i1 < ne[1]; ++i1) {
+                            const size_t offset_row = (i1 + part_id*np1)*row_size;
+                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
+                        }
+                    }
 
+                    total_size += ggml_nbytes(tensor)/n_parts;
+                }
 
+                //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+                if (++n_tensors % 8 == 0) {
+                    printf(".");
+                    fflush(stdout);
+                }
+            }
 
-        }
-        printf(" done\n");
+            printf(" done\n");
 
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
+            printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        }
 
-    fin.close();
+        fin.close();
+    }
 
     return true;
 }
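For reference, the substring rules above can be exercised in isolation. This driver is illustrative only (the helper name and the sample tensor names are taken from the regex comments in the patch, not from any API) and shows which weights end up split by columns versus by rows:

    #include <cstdio>
    #include <string>

    // Same substring rules the patch uses (0 = split by columns, 1 = by rows).
    static int split_type_for(const std::string & name) {
        if (name.find("tok_embeddings") != std::string::npos) return 0;
        if (name.find("layers") != std::string::npos) {
            if (name.find("attention.wo.weight") != std::string::npos) return 0;
            if (name.find("feed_forward.w2.weight") != std::string::npos) return 0;
            return 1;
        }
        if (name.find("output") != std::string::npos) return 1;
        return 0; // default matches the patch; 1D tensors (norms) are never split anyway
    }

    int main() {
        const char * names[] = {
            "tok_embeddings.weight",
            "layers.0.attention.wq.weight",
            "layers.0.attention.wo.weight",
            "layers.0.feed_forward.w2.weight",
            "output.weight",
        };
        for (const char * n : names) {
            printf("%-32s -> split_type = %d\n", n, split_type_for(n));
        }
        return 0;
    }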