pull/66/merge
beiller 1 year ago committed by GitHub
commit 86dbfbd979

@@ -16,7 +16,7 @@ jobs:
- name: Build
run: |
make
sh build.sh
macOS-latest:
runs-on: macOS-latest
@@ -31,7 +31,7 @@ jobs:
- name: Build
run: |
make
sh build.sh
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest

.gitignore

@@ -21,3 +21,4 @@ models/*
arm_neon.h
compile_commands.json
deps

Makefile

@@ -31,9 +31,8 @@ endif
#
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS =
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@@ -188,7 +187,7 @@ clean:
rm -f *.o main quantize
main: main.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
./main -h
quantize: quantize.cpp ggml.o utils.o

README.md

@@ -132,7 +132,7 @@ Here are the steps for the LLaMA-7B model:
# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
./build.sh
# obtain the original LLaMA model weights and place them in ./models
ls ./models

build.sh

@@ -0,0 +1,23 @@
#!/bin/sh
set -e

# fetch and build sentencepiece into ./deps, then build the main project
if [ ! -d deps ]
then
mkdir deps
fi
cd deps
if [ ! -f v0.1.97.tar.gz ]
then
curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
fi
if [ ! -f libsentencepiece.a ]
then
tar xzvf v0.1.97.tar.gz
cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build
cmake --version
cmake ..
# nproc is not available on macOS, so fall back to sysctl there
make sentencepiece-static -j "$(nproc 2>/dev/null || sysctl -n hw.ncpu)"
cd ../..
cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
fi
cd ..
make

main.cpp

@@ -16,6 +16,12 @@
#include <unistd.h>
#endif
#include <sentencepiece_processor.h>
// global SentencePiece tokenizer, loaded in main() from models/tokenizer.model
sentencepiece::SentencePieceProcessor processor;
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
@@ -762,6 +768,11 @@ void sigint_handler(int signo) {
#endif
int main(int argc, char ** argv) {
const auto status = processor.Load("models/tokenizer.model");
if (!status.ok()) {
fprintf(stderr, "%s: failed to load tokenizer model: %s\n", __func__, status.ToString().c_str());
return 1;
}
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
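As a sanity check that the new dependency links and that models/tokenizer.model is usable, a standalone smoke test can mirror the Load call above. This is a sketch, not part of the patch; the early return and the GetPieceSize() report are assumptions about how one would verify the tokenizer.

#include <cstdio>
#include <sentencepiece_processor.h>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    const auto status = sp.Load("models/tokenizer.model"); // same path the patch uses
    if (!status.ok()) {
        std::fprintf(stderr, "failed to load tokenizer: %s\n", status.ToString().c_str());
        return 1; // bail out rather than continue with an unusable tokenizer
    }
    std::printf("tokenizer loaded, vocab size: %d\n", sp.GetPieceSize());
    return 0;
}

Compiled against deps/sentencepiece-0.1.97/src/ and linked with deps/libsentencepiece.a, the same way the updated Makefile builds main.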
@@ -811,12 +822,14 @@ int main(int argc, char ** argv) {
std::vector<float> logits;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp;
processor.Encode(params.prompt, &embd_inp);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<gpt_vocab::id> antiprompt_inp;
processor.Encode(params.antiprompt, &antiprompt_inp);
printf("\n");
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -849,6 +862,8 @@ int main(int argc, char ** argv) {
printf("\n\n");
std::vector<gpt_vocab::id> embd;
std::vector<gpt_vocab::id> all_tokens; // every prompt and generated token so far, used for incremental decoding
std::string full_text = ""; // text that has already been printed
// determine the required inference memory per token:
size_t mem_per_token = 0;
@@ -916,6 +931,7 @@ int main(int argc, char ** argv) {
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
all_tokens.push_back(id);
t_sample_us += ggml_time_us() - t_start_sample_us;
}
@@ -934,6 +950,7 @@ int main(int argc, char ** argv) {
embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]);
all_tokens.push_back(embd_inp[input_consumed]);
++input_consumed;
if (embd.size() > params.n_batch) {
break;
@@ -943,14 +960,28 @@ int main(int argc, char ** argv) {
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
// check if last token is unprintable token
std::string check;
std::vector<gpt_vocab::id> check_token;
check_token.push_back(all_tokens.at(all_tokens.size()-1));
processor.Decode(check_token, &check);
if (check != "\xEF\xBF\xBD") { // skip printing while the newest token decodes to U+FFFD (an incomplete UTF-8 piece)
// the last token is printable, so decode the full sequence and print only the new part
std::string text;
processor.Decode(all_tokens, &text);
if(full_text.length() < text.length()) {
std::string chunk = text.substr(full_text.length());
printf("%s", chunk.c_str());
full_text = text; // remember everything printed so far
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
}
fflush(stdout);
}
}
fflush(stdout);
}
// in interactive mode, and not currently processing queued inputs;
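The display logic above decodes all_tokens from scratch each step and emits only the suffix that has not been printed yet, skipping the step entirely while the newest token alone decodes to U+FFFD (an incomplete UTF-8 piece). A stripped-down sketch of the same idea, with hypothetical names (print_new_text, printed) that are not in the patch:

#include <cstdio>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>

// Print only the text added by the newest tokens; defer output while the last
// token does not yet decode to a complete UTF-8 sequence. Assumes all_tokens
// is non-empty.
static void print_new_text(sentencepiece::SentencePieceProcessor & sp,
                           const std::vector<int> & all_tokens,
                           std::string & printed) {
    std::string last;
    sp.Decode(std::vector<int>{all_tokens.back()}, &last);
    if (last == "\xEF\xBF\xBD") { // U+FFFD: wait for more tokens
        return;
    }
    std::string text;
    sp.Decode(all_tokens, &text); // detokenize everything generated so far
    if (text.size() > printed.size()) {
        std::printf("%s", text.c_str() + printed.size());
        std::fflush(stdout);
        printed = text; // remember what has already been emitted
    }
}

Re-decoding the whole sequence on every token is linear in the context length, which is cheap at these sizes and keeps multi-byte characters intact.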
@@ -986,7 +1017,8 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0;
}
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
std::vector<gpt_vocab::id> line_inp;
processor.Encode(buf, &line_inp);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
remaining_tokens -= line_inp.size();
