work towards tokenizer integration

pull/66/head
beiller 1 year ago
parent c80e2a8f2a
commit 96dc6a0c68
No known key found for this signature in database
GPG Key ID: 5AC5D1B01D0E5D75

@ -31,9 +31,8 @@ endif
#
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@ -188,7 +187,7 @@ clean:
rm -f *.o main quantize
main: main.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
./main -h
quantize: quantize.cpp ggml.o utils.o

@ -0,0 +1,12 @@
# Download the SentencePiece v0.1.97 release tarball and build its static library.
# Upstream repository: https://github.com/google/sentencepiece.git
# Upstream commit: 9ffb33a14c97c512103be0ee74740099660b39aa

# Stop at the first failing step; otherwise a failed download, extract or cd
# would let the remaining commands run in the wrong directory.
set -e

# -f makes curl fail on an HTTP error instead of saving the error page as the tarball.
curl -fLO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
tar xzvf sentencepiece-0.1.97.tar.gz
cd sentencepiece-0.1.97/src
# -p keeps a re-run from failing when the build directory already exists.
mkdir -p build
cd build
# NOTE(review): this configures src/ rather than the top-level source directory — confirm this is intended.
cmake ..
# nproc is not available on every platform (e.g. macOS); fall back to sysctl and
# finally to a single job so that -j never ends up without an argument, which
# would tell make to run an unlimited number of jobs.
JOBS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
make sentencepiece-static -j "$JOBS"
cd ../..

@ -14,6 +14,12 @@
#include <signal.h>
#include <unistd.h>
#include <sentencepiece_processor.h>
//Tokenizer object
sentencepiece::SentencePieceProcessor processor;
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
@ -758,6 +764,11 @@ void sigint_handler(int signo) {
}
int main(int argc, char ** argv) {
const auto status = processor.Load("models/tokenizer.model");
if (!status.ok()) {
printf("%s", status.ToString().c_str());
// error
}
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
@ -807,7 +818,8 @@ int main(int argc, char ** argv) {
std::vector<float> logits;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp;
processor.Encode(params.prompt, &embd_inp);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
@ -935,14 +947,20 @@ int main(int argc, char ** argv) {
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
std::string check = processor.IdToPiece(all_tokens.at(all_tokens.size()-1));
if(check != "<EFBFBD>") { // ensure a multi-byte token is finished generating before outputting the text
std::string text;
processor.Decode(all_tokens, &text);
std::string chunk = text.substr(full_text.length());
printf("%s", chunk.c_str());
full_text += chunk;
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
}
fflush(stdout);
}
fflush(stdout);
}
// in interactive mode, and not currently processing queued inputs;

Loading…
Cancel
Save