From 67b1c842d958b96deb20080f75a681ba045b482f Mon Sep 17 00:00:00 2001
From: beiller
Date: Wed, 8 Mar 2023 16:44:50 -0500
Subject: [PATCH] Use sentencepiece tokenization

---
 .gitignore    |  1 +
 Makefile      |  4 ++--
 README.md     |  2 +-
 build.sh      | 21 +++++++++++++++++++++
 build_deps.sh | 12 ------------
 main.cpp      |  4 ++++
 6 files changed, 29 insertions(+), 15 deletions(-)
 create mode 100755 build.sh
 delete mode 100644 build_deps.sh

diff --git a/.gitignore b/.gitignore
index 5eb1ff1..57256fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ models/*
 
 arm_neon.h
 compile_commands.json
+deps
diff --git a/Makefile b/Makefile
index 0b8464c..57a14bc 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ endif
 #
 
 CFLAGS   = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
 # OS specific
 # TODO: support Windows
@@ -187,7 +187,7 @@ clean:
 	rm -f *.o main quantize
 
 main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
 	./main -h
 
 quantize: quantize.cpp ggml.o utils.o
diff --git a/README.md b/README.md
index dd3efae..1f211a0 100644
--- a/README.md
+++ b/README.md
@@ -132,7 +132,7 @@ Here are the step for the LLaMA-7B model:
 # build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+./build.sh
 
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..1f9c004
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+if [ ! -d deps ]
+then
+    mkdir deps
+fi
+cd deps
+if [ ! -f v0.1.97.tar.gz ]
+then
+    curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
+fi
+if [ ! -f libsentencepiece.a ]
+then
+    tar xzvf v0.1.97.tar.gz
+    cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build && cmake ..
+    make sentencepiece-static -j $(nproc)
+    cd ../..
+    cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
+fi
+cd ..
+make
diff --git a/build_deps.sh b/build_deps.sh
deleted file mode 100644
index 444d207..0000000
--- a/build_deps.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#https://github.com/google/sentencepiece.git
-#9ffb33a14c97c512103be0ee74740099660b39aa
-
-curl -LO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
-tar xzvf sentencepiece-0.1.97.tar.gz
-cd sentencepiece-0.1.97/src
-mkdir build
-cd build
-cmake ..
-make sentencepiece-static -j $(nproc)
-cd ../..
-
diff --git a/main.cpp b/main.cpp
index 7490569..b78b846 100644
--- a/main.cpp
+++ b/main.cpp
@@ -855,6 +855,8 @@ int main(int argc, char ** argv) {
     printf("\n\n");
 
     std::vector<gpt_vocab::id> embd;
+    std::vector<gpt_vocab::id> all_tokens;
+    std::string full_text = "";
 
     // determine the required inference memory per token:
     size_t mem_per_token = 0;
@@ -920,6 +922,7 @@
 
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(id);
+                all_tokens.push_back(id);
 
                 t_sample_us += ggml_time_us() - t_start_sample_us;
             }
@@ -938,6 +941,7 @@
                 embd.push_back(embd_inp[input_consumed]);
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(embd_inp[input_consumed]);
+                all_tokens.push_back(embd_inp[input_consumed]);
                 ++input_consumed;
                 if (embd.size() > params.n_batch) {
                     break;
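
Note on usage (not part of the diff above): the patch links main against libsentencepiece.a and accumulates generated ids in all_tokens next to a full_text buffer, but the hunks shown do not include the detokenization call itself. The following is a minimal, self-contained sketch of how the sentencepiece C++ API is typically used to turn such ids back into text; the model path "models/tokenizer.model" and the prompt string are illustrative assumptions, not taken from this patch.

    // sp_decode_sketch.cpp — build with the same include path and static
    // library the Makefile change points at, e.g.:
    //   g++ -Ideps/sentencepiece-0.1.97/src sp_decode_sketch.cpp deps/libsentencepiece.a -o sp_decode_sketch
    #include <sentencepiece_processor.h>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        sentencepiece::SentencePieceProcessor sp;

        // Load the sentencepiece model file (path is an assumption for illustration).
        const auto status = sp.Load("models/tokenizer.model");
        if (!status.ok()) {
            std::fprintf(stderr, "failed to load tokenizer: %s\n", status.ToString().c_str());
            return 1;
        }

        // Encode a prompt into token ids ...
        std::vector<int> ids;
        sp.Encode("Hello world", &ids);

        // ... and decode the accumulated ids back into a single string,
        // analogous to turning all_tokens into full_text.
        std::string text;
        sp.Decode(ids, &text);
        std::printf("%s\n", text.c_str());
        return 0;
    }

Decoding the whole id sequence in one call lets sentencepiece join its subword pieces and restore the whitespace markers, which can give cleaner output than printing a per-token string for each sampled id.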