From e236dbb1e9099464029cb2e24907709b2a4cc66b Mon Sep 17 00:00:00 2001 From: beiller Date: Sat, 11 Mar 2023 16:32:21 -0500 Subject: [PATCH] Remove unprintable characters from vocab list --- main.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.cpp b/main.cpp index 2f47480..bc93586 100644 --- a/main.cpp +++ b/main.cpp @@ -139,6 +139,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab word.resize(len); fin.read((char *) word.data(), len); + + if(i >= 131 && i <= 258) { + // token ids in this range seem to map to unprintable characters + // TODO: maybe they are supposed to be byte-reversed or need some other decoding + continue; + } vocab.token_to_id[word] = i; vocab.id_to_token[i] = word;