diff --git a/main.cpp b/main.cpp
index 2f47480..67ac8f1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -10,6 +10,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <unordered_set>
 
 // determine number of model parts based on the dimension
 static const std::map<int, int> LLAMA_N_PARTS = {
@@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }
 
     // load vocab
+
+    std::unordered_set<std::string> unprintable_characters = {"", "�", "��"};
+
     {
         const int32_t n_vocab = model.hparams.n_vocab;
 
@@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);
 
+            if(unprintable_characters.find(word) != unprintable_characters.end()) {
+                continue;
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
 