diff --git a/.gitignore b/.gitignore
index 5eb1ff1..25111eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ models/*
 
 arm_neon.h
 compile_commands.json
+*.dSYM/
diff --git a/Makefile b/Makefile
index 8388c29..35b627b 100644
--- a/Makefile
+++ b/Makefile
@@ -30,9 +30,9 @@ endif
 
 # Compile flags
 #
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
+CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC -g -I/opt/homebrew/include
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -g -I/opt/homebrew/include
+LDFLAGS  = -L/opt/homebrew/lib -lsentencepiece
 
 # OS specific
 # TODO: support Windows
diff --git a/main.cpp b/main.cpp
index d1defe2..31872c3 100644
--- a/main.cpp
+++ b/main.cpp
@@ -10,6 +10,12 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 
@@ -82,7 +88,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, sentencepiece::SentencePieceProcessor & sp, gpt_vocab & vocab, int n_ctx) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -144,6 +150,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             return false;
         }
 
+        printf("total pieces: %d", sp.GetPieceSize());
+
         std::string word;
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -152,8 +160,9 @@
             word.resize(len);
             fin.read((char *) word.data(), len);
 
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
+            std::string wordx = sp.IdToPiece(i);
+            vocab.token_to_id[wordx] = i;
+            vocab.id_to_token[i] = wordx;
 
             //if (i < 30000) {
             //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
@@ -764,6 +773,9 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
 
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load("./models/tokenizer.model");
+
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
@@ -791,7 +803,7 @@
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
+        if (!llama_model_load(params.model, model, sp, vocab, 512)) { // TODO: set context from user input ??
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
diff --git a/utils.cpp b/utils.cpp
index b340bd6..b52f419 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <sentencepiece_processor.h>
 #include
 #include
 #include
@@ -281,33 +282,30 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
 
     std::vector<gpt_vocab::id> res;
 
-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
-    }
+    // if (bos) {
+    //     res.push_back(1); // TODO: replace with vocab.bos
+    // }
 
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load("./models/tokenizer.model");
 
-        if (l == 0) {
-            break;
+    std::vector<std::string> pieces;
+    return sp.EncodeAsIds(text);
+/*
+    for (const auto & piece : pieces) {
+        printf("piece: %s\n", piece.c_str());
+        if (vocab.token_to_id.count(piece) > 0) {
+            res.push_back(vocab.token_to_id.at(piece));
+        } else {
+            // handle unknown token
         }
+    }
 
-        res.push_back(t);
-        pos += l;
+    for (const auto& id : res) {
+        printf("%d\n", id);
     }
 
-    return res;
+    return res;*/
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
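
For reference, the SentencePiece calls the patch leans on (Load, GetPieceSize, IdToPiece, EncodeAsIds) come from the library's C++ API in <sentencepiece_processor.h>; the sketch below is a minimal standalone check of that API, not part of the patch itself, and assumes the same ./models/tokenizer.model path the patch hard-codes. Build with something like: g++ -std=c++11 -I/opt/homebrew/include sp_check.cpp -L/opt/homebrew/lib -lsentencepiece

    #include <sentencepiece_processor.h>

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        sentencepiece::SentencePieceProcessor sp;

        // The patch ignores the status returned by Load(); checking it catches a bad path early.
        const auto status = sp.Load("./models/tokenizer.model"); // path assumed, same as in the patch
        if (!status.ok()) {
            fprintf(stderr, "failed to load tokenizer: %s\n", status.ToString().c_str());
            return 1;
        }

        printf("total pieces: %d\n", sp.GetPieceSize());

        // Encode a prompt straight to ids, as the new llama_tokenize() does,
        // then map each id back to its piece for inspection.
        const std::vector<int> ids = sp.EncodeAsIds("Hello world");
        for (const int id : ids) {
            printf("%d -> '%s'\n", id, sp.IdToPiece(id).c_str());
        }

        return 0;
    }

Two things worth noting about the patch itself: llama_tokenize() constructs and loads a fresh SentencePieceProcessor on every call, which is wasteful next to the sp instance already created in main(); and EncodeAsIds() does not prepend a BOS token by default, so the commented-out res.push_back(1) still needs an equivalent before generation.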