From 4726e671e66a7c56433343482c7334d4583238e3 Mon Sep 17 00:00:00 2001
From: beiller
Date: Sat, 11 Mar 2023 17:13:28 -0500
Subject: [PATCH] Remove Unprintable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #11

This fixes a Japanese prompt I was attempting to run

EG:

`./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -n 512 -p $'人生の意味は'`

Output before change:

`人生の意���、フロントカードに���いてる。 2019年3月 © All Rights Reserved. [end of text]`

So it is outputting some characters but some �

Output after change:

`人生の意は、一人が一人ということであります。は安部が立していたので、去からは一人の人にれるのはにとどまったのですが、そう`
---
 main.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/main.cpp b/main.cpp
index 2f47480..67ac8f1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -10,6 +10,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <unordered_set>
 
 // determine number of model parts based on the dimension
 static const std::map<int, int> LLAMA_N_PARTS = {
@@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }
 
     // load vocab
+
+    std::unordered_set<std::string> unprintable_characters = {"", "�", "��"};
+
     {
         const int32_t n_vocab = model.hparams.n_vocab;
 
@@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);
 
+            if(unprintable_characters.find(word) != unprintable_characters.end()) {
+                continue;
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
 