diff --git a/README.md b/README.md
index 87808fd..d2b9a70 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,5 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
-- No Windows support
 - x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
-
+
diff --git a/main.cpp b/main.cpp
index fb9eb17..982adf1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -728,6 +728,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
+            printf(" [end of text]\n");
            break;
         }
     }
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/utils.cpp b/utils.cpp
index 70a2ac2..cd9c001 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
+    //auto res = gpt_tokenize(vocab, text);
+
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
+    std::vector<gpt_vocab::id> res;
 
     if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+        res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //std::vector<gpt_vocab::id> res;
+    //find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
 
-    //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
-    //}
+        if (l == 0 && t != 13) {
+            break;
+        }
 
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
-
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
-
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+        res.push_back(t);
+        pos += l;
+    }
 
     return res;
 }
diff --git a/utils.h b/utils.h
index d291964..20c42ba 100644
--- a/utils.h
+++ b/utils.h
@@ -15,12 +15,12 @@ struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
 
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
 
     int32_t n_batch = 8; // batch size for prompt processing
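
The utils.cpp hunk above swaps the GPT-style tokenizer for a greedy longest-match scan over the vocabulary. Below is a minimal standalone sketch of that same idea; the vocabulary here is a made-up toy stand-in for vocab.id_to_token (the ids and strings are illustrative, not LLaMA's real vocab), and the patch's BOS handling and its `t != 13` special case are omitted for brevity.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // Toy stand-in for vocab.id_to_token; real LLaMA ids/strings differ.
    std::map<int, std::string> id_to_token = {
        {100, "he"}, {101, "hello"}, {102, "ll"},
        {103, "o"},  {104, " "},     {105, "world"},
    };

    const std::string text = "hello world";
    std::vector<int> res;

    // Greedy scan: at each position take the longest vocab entry that
    // matches the remaining text, emit its id, and advance past it.
    size_t pos = 0;
    while (pos < text.size()) {
        size_t best_len = 0;
        int    best_id  = 0;
        for (const auto & kv : id_to_token) {
            const std::string & tok = kv.second;
            if (tok.size() < best_len)          continue; // can't beat current match
            if (tok.size() > text.size() - pos) continue; // runs past end of text
            if (text.compare(pos, tok.size(), tok) == 0) {
                best_len = tok.size();
                best_id  = kv.first;
            }
        }
        if (best_len == 0) break; // no vocab entry covers this byte
        res.push_back(best_id);
        pos += best_len;
    }

    for (int id : res) printf("%d ", id); // prints: 101 104 105
    printf("\n");
    return 0;
}

Note that this scan rechecks the whole vocabulary at every position, so it is O(vocab size × text length); a trie or a sentencepiece-style tokenizer would avoid that rescan.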
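
The utils.h hunk only changes sampling defaults, but for context, here is a rough sketch of how top_k and temp are typically applied to the model's logits. This is a generic illustration under assumed semantics, not the sampling code from this repo (which also applies top_p).

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

// Generic top-k + temperature sampling over a logits vector.
static int sample_top_k(std::vector<float> logits, int top_k, float temp, std::mt19937 & rng) {
    // temperature: dividing logits by temp < 1 sharpens the distribution
    for (auto & l : logits) l /= temp;

    // keep only the top_k highest-logit candidates
    std::vector<std::pair<float, int>> cand;
    for (int i = 0; i < (int) logits.size(); ++i) {
        cand.emplace_back(logits[i], i);
    }
    const int k = std::min(top_k, (int) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(k);

    // softmax over the surviving candidates, then draw one
    std::vector<double> probs;
    double sum = 0.0;
    for (const auto & c : cand) {
        probs.push_back(std::exp(c.first - cand[0].first)); // shift for stability
        sum += probs.back();
    }
    for (auto & p : probs) p /= sum;

    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second;
}

int main() {
    std::mt19937 rng(1234);
    std::vector<float> logits = {2.0f, 1.5f, 0.2f, -0.7f, -3.0f}; // toy logits
    // defaults from this patch: top_k = 40, temp = 0.80f
    printf("sampled token id: %d\n", sample_top_k(logits, 40, 0.80f, rng));
    return 0;
}

Lowering top_k from 100 to 40 restricts sampling to fewer high-probability tokens, which generally trades diversity for coherence.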