Remove unprintable characters from vocab list

pull/25/head
beiller 1 year ago committed by GitHub
parent 4235e3d5b3
commit e236dbb1e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -139,6 +139,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
word.resize(len);
fin.read((char *) word.data(), len);
if(i >= 131 && i <= 258) {
// the tokens in this id range (131..258) seem to be unprintable characters, so skip them
// TODO: maybe they are supposed to be byte-reversed or decoded in some other way
continue;
}
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;

Loading…
Cancel
Save