diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c10e671..7c10638 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -16,7 +16,7 @@ jobs:
 
       - name: Build
         run: |
-          make
+          sh build.sh
 
   macOS-latest:
     runs-on: macOS-latest
@@ -31,7 +31,7 @@ jobs:
 
       - name: Build
         run: |
-          make
+          sh build.sh
 
 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 5eb1ff1..57256fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ models/*
 
 arm_neon.h
 compile_commands.json
+deps
diff --git a/Makefile b/Makefile
index 8388c29..05a2039 100644
--- a/Makefile
+++ b/Makefile
@@ -31,9 +31,8 @@ endif
 #
 
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
-
+CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++17 -fPIC
+LDFLAGS  =
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@@ -188,7 +187,7 @@ clean:
 	rm -f *.o main quantize
 
 main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
 	./main -h
 
 quantize: quantize.cpp ggml.o utils.o
diff --git a/README.md b/README.md
index 3a6d757..0dff35f 100644
--- a/README.md
+++ b/README.md
@@ -132,7 +132,7 @@ Here are the step for the LLaMA-7B model:
 # build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+./build.sh
 
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..500366d
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+if [ ! -d deps ]
+then
+    mkdir deps
+fi
+cd deps
+if [ ! -f v0.1.97.tar.gz ]
+then
+    curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
+fi
+if [ ! -f libsentencepiece.a ]
+then
+    tar xzvf v0.1.97.tar.gz
+    cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build
+    cmake --version
+    cmake ..
+    make sentencepiece-static -j $(nproc)
+    cd ../..
+    cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
+fi
+cd ..
+make
diff --git a/main.cpp b/main.cpp
index 98ccde5..36d0ae4 100644
--- a/main.cpp
+++ b/main.cpp
@@ -16,6 +16,12 @@
 #include <unistd.h>
 #endif
 
+#include <sentencepiece_processor.h>
+
+
+// Tokenizer object
+sentencepiece::SentencePieceProcessor processor;
+
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
 #define ANSI_COLOR_YELLOW  "\x1b[33m"
@@ -762,6 +768,11 @@ void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
+    const auto status = processor.Load("models/tokenizer.model");
+    if (!status.ok()) {
+        printf("%s", status.ToString().c_str());
+        return 1;
+    }
     ggml_time_init();
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -811,12 +822,14 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
 
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<int> embd_inp;
+    processor.Encode(params.prompt, &embd_inp);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<int> antiprompt_inp;
+    processor.Encode(params.antiprompt, &antiprompt_inp);
 
     printf("\n");
     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -849,6 +862,8 @@ int main(int argc, char ** argv) {
     printf("\n\n");
 
     std::vector<gpt_vocab::id> embd;
+    std::vector<int> all_tokens;
+    std::string full_text = "";
 
     // determine the required inference memory per token:
     size_t mem_per_token = 0;
@@ -916,6 +931,7 @@ int main(int argc, char ** argv) {
 
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(id);
+            all_tokens.push_back(id);
 
             t_sample_us += ggml_time_us() - t_start_sample_us;
         }
@@ -934,6 +950,7 @@ int main(int argc, char ** argv) {
             embd.push_back(embd_inp[input_consumed]);
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(embd_inp[input_consumed]);
+            all_tokens.push_back(embd_inp[input_consumed]);
             ++input_consumed;
             if (embd.size() > params.n_batch) {
                 break;
@@ -943,14 +960,28 @@ int main(int argc, char ** argv) {
 
         // display text
        if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
-            }
-            // reset color to default if we there is no pending user input
-            if (params.use_color && embd_inp.size() <= input_consumed) {
-                printf(ANSI_COLOR_RESET);
+            // check whether the last token decodes to an unprintable (incomplete multi-byte) piece
+            std::string check;
+            std::vector<int> check_token;
+            check_token.push_back(all_tokens.at(all_tokens.size() - 1));
+            processor.Decode(check_token, &check);
+            if (check != "�") {
+                // the token is printable: decode the full sequence and emit only the new text
+                std::string text;
+                processor.Decode(all_tokens, &text);
+                if (full_text.length() < text.length()) {
+                    std::string chunk = text.substr(full_text.length());
+                    printf("%s", chunk.c_str());
+                    full_text.clear();
+                    processor.Decode(all_tokens, &full_text);
+                    // reset color to default if there is no pending user input
+                    if (params.use_color && embd_inp.size() <= input_consumed) {
+                        printf(ANSI_COLOR_RESET);
+                    }
+                    fflush(stdout);
+                }
             }
-            fflush(stdout);
         }
 
         // in interactive mode, and not currently processing queued inputs;
@@ -986,7 +1017,8 @@ int main(int argc, char ** argv) {
                     buf[n_read+1] = 0;
                 }
 
-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<int> line_inp;
+                processor.Encode(buf, &line_inp);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
                 remaining_tokens -= line_inp.size();
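
Note for reviewers (not part of the patch): the main.cpp change streams output by appending every generated id to all_tokens, re-decoding the whole sequence with SentencePiece each step, and printing only the suffix that was not printed before, while holding output back whenever the newest token alone decodes to the replacement character "�" (an incomplete multi-byte piece). Below is a minimal, self-contained sketch of that idea under stated assumptions: sentencepiece headers and library are installed, a LLaMA models/tokenizer.model exists, and the file name and build command are illustrative only.

    // streaming_decode_sketch.cpp  (illustrative; assumed build command:
    //   g++ -std=c++17 streaming_decode_sketch.cpp -lsentencepiece -o sketch)
    #include <sentencepiece_processor.h>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        sentencepiece::SentencePieceProcessor sp;
        const auto status = sp.Load("models/tokenizer.model");  // assumed model path
        if (!status.ok()) {
            fprintf(stderr, "%s\n", status.ToString().c_str());
            return 1;
        }

        // Stand-in for tokens that a model would produce one at a time.
        std::vector<int> generated;
        sp.Encode("Hello world, this is a tokenizer round-trip test.", &generated);

        std::vector<int> seen_tokens;   // tokens received so far
        std::string printed_text;       // text already written to stdout

        for (int id : generated) {
            seen_tokens.push_back(id);

            // If the newest token alone decodes to the replacement character,
            // it is likely a partial multi-byte sequence: wait for more tokens.
            std::string piece;
            sp.Decode(std::vector<int>{id}, &piece);
            if (piece == "\xEF\xBF\xBD") {  // UTF-8 bytes of "�"
                continue;
            }

            // Otherwise decode the full history and print only the new suffix.
            std::string text;
            sp.Decode(seen_tokens, &text);
            if (text.size() > printed_text.size()) {
                printf("%s", text.c_str() + printed_text.size());
                fflush(stdout);
                printed_text = text;
            }
        }
        printf("\n");
        return 0;
    }

Re-decoding the full token history is O(n) work per emitted token; that is acceptable for a sketch, but an incremental decoder that only re-decodes the unprinted tail would avoid the quadratic cost on long generations.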