pull/66/merge
beiller 1 year ago committed by GitHub
commit 86dbfbd979

@@ -16,7 +16,7 @@ jobs:
- name: Build
run: |
make
sh build.sh
macOS-latest:
runs-on: macOS-latest
@@ -31,7 +31,7 @@ jobs:
- name: Build
run: |
make
sh build.sh
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest

.gitignore

@@ -21,3 +21,4 @@ models/*
arm_neon.h
compile_commands.json
deps

Makefile

@@ -31,9 +31,8 @@ endif
#
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS =
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@@ -188,7 +187,7 @@ clean:
rm -f *.o main quantize
main: main.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
./main -h
quantize: quantize.cpp ggml.o utils.o

README.md

@@ -132,7 +132,7 @@ Here are the steps for the LLaMA-7B model:
# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
./build.sh
# obtain the original LLaMA model weights and place them in ./models
ls ./models

build.sh

@@ -0,0 +1,23 @@
#!/bin/sh
set -e

# fetch and build sentencepiece into ./deps, then build the main project
if [ ! -d deps ]
then
mkdir deps
fi
cd deps
if [ ! -f v0.1.97.tar.gz ]
then
curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
fi
if [ ! -f libsentencepiece.a ]
then
tar xzvf v0.1.97.tar.gz
cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build
cmake --version
cmake ..
# nproc is not available on macOS, so fall back to sysctl there
make sentencepiece-static -j "$(nproc 2>/dev/null || sysctl -n hw.ncpu)"
cd ../..
cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
fi
cd ..
make

main.cpp

@@ -16,6 +16,12 @@
#include <unistd.h>
#endif
#include <sentencepiece_processor.h>
// global SentencePiece tokenizer, loaded in main() from models/tokenizer.model
sentencepiece::SentencePieceProcessor processor;
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
@@ -762,6 +768,11 @@ void sigint_handler(int signo) {
#endif
int main(int argc, char ** argv) {
const auto status = processor.Load("models/tokenizer.model");
if (!status.ok()) {
fprintf(stderr, "%s: failed to load tokenizer model: %s\n", __func__, status.ToString().c_str());
return 1;
}
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
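As a sanity check that the new dependency links and that models/tokenizer.model is usable, a standalone smoke test can mirror the Load call above. This is a sketch, not part of the patch; the early return and the GetPieceSize() report are assumptions about how one would verify the tokenizer.

#include <cstdio>
#include <sentencepiece_processor.h>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    const auto status = sp.Load("models/tokenizer.model"); // same path the patch uses
    if (!status.ok()) {
        std::fprintf(stderr, "failed to load tokenizer: %s\n", status.ToString().c_str());
        return 1; // bail out rather than continue with an unusable tokenizer
    }
    std::printf("tokenizer loaded, vocab size: %d\n", sp.GetPieceSize());
    return 0;
}

Compiled against deps/sentencepiece-0.1.97/src/ and linked with deps/libsentencepiece.a, the same way the updated Makefile builds main.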
@@ -811,12 +822,14 @@ int main(int argc, char ** argv) {
std::vector<float> logits;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp;
processor.Encode(params.prompt, &embd_inp);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<gpt_vocab::id> antiprompt_inp;
processor.Encode(params.antiprompt, &antiprompt_inp);
printf("\n");
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -849,6 +862,8 @@ int main(int argc, char ** argv) {
printf("\n\n");
std::vector<gpt_vocab::id> embd;
std::vector<gpt_vocab::id> all_tokens; // every prompt and generated token so far, used for incremental decoding
std::string full_text = ""; // text that has already been printed
// determine the required inference memory per token:
size_t mem_per_token = 0;
@@ -916,6 +931,7 @@ int main(int argc, char ** argv) {
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
all_tokens.push_back(id);
t_sample_us += ggml_time_us() - t_start_sample_us;
}
@@ -934,6 +950,7 @@ int main(int argc, char ** argv) {
embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]);
all_tokens.push_back(embd_inp[input_consumed]);
++input_consumed;
if (embd.size() > params.n_batch) {
break;
@@ -943,14 +960,28 @@ int main(int argc, char ** argv) {
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
// check if last token is unprintable token
std::string check;
std::vector<gpt_vocab::id> check_token;
check_token.push_back(all_tokens.at(all_tokens.size()-1));
processor.Decode(check_token, &check);
if (check != "\xEF\xBF\xBD") { // skip printing while the newest token decodes to U+FFFD (an incomplete UTF-8 piece)
// the last token is printable, so decode the full sequence and print only the new part
std::string text;
processor.Decode(all_tokens, &text);
if(full_text.length() < text.length()) {
std::string chunk = text.substr(full_text.length());
printf("%s", chunk.c_str());
full_text = text; // remember everything printed so far
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
}
fflush(stdout);
}
}
fflush(stdout);
}
// in interactive mode, and not currently processing queued inputs;
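The display logic above decodes all_tokens from scratch each step and emits only the suffix that has not been printed yet, skipping the step entirely while the newest token alone decodes to U+FFFD (an incomplete UTF-8 piece). A stripped-down sketch of the same idea, with hypothetical names (print_new_text, printed) that are not in the patch:

#include <cstdio>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>

// Print only the text added by the newest tokens; defer output while the last
// token does not yet decode to a complete UTF-8 sequence. Assumes all_tokens
// is non-empty.
static void print_new_text(sentencepiece::SentencePieceProcessor & sp,
                           const std::vector<int> & all_tokens,
                           std::string & printed) {
    std::string last;
    sp.Decode(std::vector<int>{all_tokens.back()}, &last);
    if (last == "\xEF\xBF\xBD") { // U+FFFD: wait for more tokens
        return;
    }
    std::string text;
    sp.Decode(all_tokens, &text); // detokenize everything generated so far
    if (text.size() > printed.size()) {
        std::printf("%s", text.c_str() + printed.size());
        std::fflush(stdout);
        printed = text; // remember what has already been emitted
    }
}

Re-decoding the whole sequence on every token is linear in the context length, which is cheap at these sizes and keeps multi-byte characters intact.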
@@ -986,7 +1017,8 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0;
}
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
std::vector<gpt_vocab::id> line_inp;
processor.Encode(buf, &line_inp);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
remaining_tokens -= line_inp.size();
