|
|
|
@@ -16,6 +16,12 @@
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#include <sentencepiece_processor.h>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//Tokenizer object
|
|
|
|
|
sentencepiece::SentencePieceProcessor processor;
|
|
|
|
|
|
|
|
|
|
#define ANSI_COLOR_RED "\x1b[31m"
|
|
|
|
|
#define ANSI_COLOR_GREEN "\x1b[32m"
|
|
|
|
|
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
|
|
|
@@ -762,6 +768,11 @@ void sigint_handler(int signo) {
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
|
|
const auto status = processor.Load("models/tokenizer.model");
|
|
|
|
|
if (!status.ok()) {
|
|
|
|
|
printf("%s", status.ToString().c_str());
|
|
|
|
|
// error
|
|
|
|
|
}
|
|
|
|
|
ggml_time_init();
|
|
|
|
|
const int64_t t_main_start_us = ggml_time_us();
|
|
|
|
|
|
|
|
|
@@ -811,12 +822,14 @@ int main(int argc, char ** argv) {
|
|
|
|
|
std::vector<float> logits;
|
|
|
|
|
|
|
|
|
|
// tokenize the prompt
|
|
|
|
|
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
|
|
|
|
|
std::vector<gpt_vocab::id> embd_inp;
|
|
|
|
|
processor.Encode(params.prompt, &embd_inp);
|
|
|
|
|
|
|
|
|
|
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
|
|
|
|
|
|
|
|
|
|
// tokenize the reverse prompt
|
|
|
|
|
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
|
|
|
|
|
std::vector<gpt_vocab::id> antiprompt_inp;
|
|
|
|
|
processor.Encode(params.antiprompt, &antiprompt_inp);
|
|
|
|
|
|
|
|
|
|
printf("\n");
|
|
|
|
|
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
|
|
|
@@ -849,6 +862,8 @@ int main(int argc, char ** argv) {
|
|
|
|
|
printf("\n\n");
|
|
|
|
|
|
|
|
|
|
std::vector<gpt_vocab::id> embd;
|
|
|
|
|
std::vector<gpt_vocab::id> all_tokens;
|
|
|
|
|
std::string full_text = "";
|
|
|
|
|
|
|
|
|
|
// determine the required inference memory per token:
|
|
|
|
|
size_t mem_per_token = 0;
|
|
|
|
@@ -916,6 +931,7 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
last_n_tokens.erase(last_n_tokens.begin());
|
|
|
|
|
last_n_tokens.push_back(id);
|
|
|
|
|
all_tokens.push_back(id);
|
|
|
|
|
|
|
|
|
|
t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
|
|
|
}
|
|
|
|
@@ -934,6 +950,7 @@ int main(int argc, char ** argv) {
|
|
|
|
|
embd.push_back(embd_inp[input_consumed]);
|
|
|
|
|
last_n_tokens.erase(last_n_tokens.begin());
|
|
|
|
|
last_n_tokens.push_back(embd_inp[input_consumed]);
|
|
|
|
|
all_tokens.push_back(embd_inp[input_consumed]);
|
|
|
|
|
++input_consumed;
|
|
|
|
|
if (embd.size() > params.n_batch) {
|
|
|
|
|
break;
|
|
|
|
@@ -943,14 +960,28 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
// display text
|
|
|
|
|
if (!input_noecho) {
|
|
|
|
|
for (auto id : embd) {
|
|
|
|
|
printf("%s", vocab.id_to_token[id].c_str());
|
|
|
|
|
}
|
|
|
|
|
// reset color to default if there is no pending user input
|
|
|
|
|
if (params.use_color && embd_inp.size() <= input_consumed) {
|
|
|
|
|
printf(ANSI_COLOR_RESET);
|
|
|
|
|
// check if last token is unprintable token
|
|
|
|
|
std::string check;
|
|
|
|
|
std::vector<gpt_vocab::id> check_token;
|
|
|
|
|
check_token.push_back(all_tokens.at(all_tokens.size()-1));
|
|
|
|
|
processor.Decode(check_token, &check);
|
|
|
|
|
if(check != "\xEF\xBF\xBD") { // U+FFFD replacement character: Decode yields it for a partial/unprintable byte sequence
|
|
|
|
|
// If the token is printable we won't attempt to print unprintable tokens
|
|
|
|
|
std::string text;
|
|
|
|
|
processor.Decode(all_tokens, &text);
|
|
|
|
|
if(full_text.length() < text.length()) {
|
|
|
|
|
std::string chunk = text.substr(full_text.length());
|
|
|
|
|
printf("%s", chunk.c_str());
|
|
|
|
|
full_text.clear();
|
|
|
|
|
processor.Decode(all_tokens, &full_text);
|
|
|
|
|
// reset color to default if there is no pending user input
|
|
|
|
|
if (params.use_color && embd_inp.size() <= input_consumed) {
|
|
|
|
|
printf(ANSI_COLOR_RESET);
|
|
|
|
|
}
|
|
|
|
|
fflush(stdout);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
fflush(stdout);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// in interactive mode, and not currently processing queued inputs;
|
|
|
|
@@ -986,7 +1017,8 @@ int main(int argc, char ** argv) {
|
|
|
|
|
buf[n_read+1] = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
|
|
|
|
|
std::vector<gpt_vocab::id> line_inp;
|
|
|
|
|
processor.Encode(buf, &line_inp);
|
|
|
|
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
|
|
|
|
|
|
|
|
|
remaining_tokens -= line_inp.size();
|
|
|
|
|