Remove Unprintable

Fixes #11 

This fixes the garbled output produced by a Japanese prompt I was attempting to run.

For example:

`./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -n 512 -p $'人生の意味は'`

Output before change:

`人生の意���、フロントカードに���いてる。 2019年3月 © All Rights Reserved. [end of text]`

So it outputs some characters correctly, but others are printed as � (the Unicode replacement character).

Output after change:

`人生の意は、一人が一人ということであります。は安部が立していたので、去からは一人の人にれるのはにとどまったのですが、そう`
pull/26/head
beiller 2 years ago committed by GitHub
parent 4235e3d5b3
commit 4726e671e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -10,6 +10,7 @@
#include <map> #include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_set>
// determine number of model parts based on the dimension // determine number of model parts based on the dimension
static const std::map<int, int> LLAMA_N_PARTS = { static const std::map<int, int> LLAMA_N_PARTS = {
@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
} }
// load vocab // load vocab
std::unordered_set<std::string> unprintable_characters = {"", "<EFBFBD>", "<EFBFBD><EFBFBD>"};
{ {
const int32_t n_vocab = model.hparams.n_vocab; const int32_t n_vocab = model.hparams.n_vocab;
@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
word.resize(len); word.resize(len);
fin.read((char *) word.data(), len); fin.read((char *) word.data(), len);
if(unprintable_characters.find(word) != unprintable_characters.end()) {
continue;
}
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;

Loading…
Cancel
Save