pull/87/merge
commit c0d4646dd1 by wizd, committed via GitHub 1 year ago

.gitignore (vendored): 1 addition

@@ -21,3 +21,4 @@ models/*
 arm_neon.h
 compile_commands.json
 *.dSYM/

Makefile
@@ -30,9 +30,9 @@ endif
 # Compile flags
 #
-CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS =
+CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -g -I/opt/homebrew/include
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -g -I/opt/homebrew/include
+LDFLAGS = -L/opt/homebrew/lib -lsentencepiece
 
 # OS specific
 # TODO: support Windows
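Note: the new flags hard-code a Homebrew prefix, so the build now assumes sentencepiece installed via Homebrew on Apple Silicon macOS (/opt/homebrew; an Intel Mac or Linux install would need different paths). A minimal link check under that assumption, compiled as, say, g++ -std=c++11 -I/opt/homebrew/include check.cpp -L/opt/homebrew/lib -lsentencepiece (check.cpp being a hypothetical scratch file):

#include <sentencepiece_processor.h>
#include <cstdio>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    const auto status = sp.Load("./models/tokenizer.model");  // path used throughout this PR
    if (!status.ok()) {
        std::fprintf(stderr, "tokenizer load failed: %s\n", status.ToString().c_str());
        return 1;
    }
    std::printf("tokenizer pieces: %d\n", sp.GetPieceSize());
    return 0;
}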

main.cpp
@@ -10,6 +10,12 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <sentencepiece_processor.h>
+#include <stdexcept>
+#include <iostream>
+#include <bitset>
+#include <sstream>
+#include <regex>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -84,7 +90,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, sentencepiece::SentencePieceProcessor & sp, gpt_vocab & vocab, int n_ctx) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -146,6 +152,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
             return false;
         }
 
+        printf("total pieces: %d", sp.GetPieceSize());
+
         std::string word;
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -154,8 +162,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
             word.resize(len);
             fin.read((char *) word.data(), len);
 
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
+            std::string wordx = sp.IdToPiece(i);
+            vocab.token_to_id[wordx] = i;
+            vocab.id_to_token[i] = wordx;
 
             //if (i < 30000) {
             //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
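With this change the vocab strings embedded in the ggml file are still read (keeping the file offsets intact) but then ignored; both maps are rebuilt from tokenizer.model via IdToPiece. A sanity check one could add at this point (hypothetical, not part of the PR) to catch a mismatched tokenizer.model:

        if (sp.GetPieceSize() != n_vocab) {
            fprintf(stderr, "%s: tokenizer has %d pieces but model expects %d\n",
                    __func__, sp.GetPieceSize(), n_vocab);
            return false;
        }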
@@ -768,6 +777,9 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
 
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load("./models/tokenizer.model");
+
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
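The status returned by sp.Load is discarded here; if ./models/tokenizer.model is absent, every later IdToPiece/DecodeIds call runs against an unloaded processor. A checked variant (an assumption, not what the PR does):

    const auto status = sp.Load("./models/tokenizer.model");
    if (!status.ok()) {
        fprintf(stderr, "%s: failed to load tokenizer: %s\n", __func__, status.ToString().c_str());
        return 1;
    }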
@@ -795,7 +807,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
+        if (!llama_model_load(params.model, model, sp, vocab, 512)) { // TODO: set context from user input ??
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }
@@ -882,6 +894,8 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
+    // buffer UTF-8 byte-fallback tokens like <0xE6>,<0xAC>,<0xA2> that span multiple outputs, until the sequence is complete
+    std::vector<gpt_vocab::id> buffids = {};
     while (remaining_tokens > 0) {
         // predict
         if (embd.size() > 0) {
@@ -943,9 +957,8 @@ int main(int argc, char ** argv) {
         // display text
         if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
-            }
+            untokenize(sp, buffids, embd);
 
             // reset color to default if there is no pending user input
             if (params.use_color && embd_inp.size() <= input_consumed) {
                 printf(ANSI_COLOR_RESET);
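buffids is declared outside the generation loop on purpose: one multi-byte character can be split across several sampling iterations, so partially accumulated byte tokens have to survive between untokenize calls. The control flow, roughly (a sketch, not the PR's exact code):

    std::vector<gpt_vocab::id> buffids;    // persists across iterations
    while (remaining_tokens > 0) {
        // ... predict and sample the next tokens into embd ...
        untokenize(sp, buffids, embd);     // prints completed text, keeps partial bytes buffered
    }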

utils.cpp
@@ -4,6 +4,8 @@
 #include <cstring>
 #include <fstream>
 #include <regex>
+#include <sentencepiece_processor.h>
+#include <sstream>
 #include <iostream>
 #include <iterator>
 #include <string>
@@ -281,33 +283,30 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
     std::vector<gpt_vocab::id> res;
 
-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
-    }
+    // if (bos) {
+    //     res.push_back(1); // TODO: replace with vocab.bos
+    // }
 
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load("./models/tokenizer.model");
 
-        if (l == 0) {
-            break;
-        }
+    std::vector<std::string> pieces;
+    return sp.EncodeAsIds(text);
+    /*
+    for (const auto & piece : pieces) {
+        printf("piece: %s\n", piece.c_str());
+        if (vocab.token_to_id.count(piece) > 0) {
+            res.push_back(vocab.token_to_id.at(piece));
+        } else {
+            // handle unknown token
+        }
+    }
 
-        res.push_back(t);
-        pos += l;
-    }
+    for (const auto& id : res) {
+        printf("%d\n", id);
+    }
 
-    return res;
+    return res;*/
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
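The greedy longest-match loop is gone: sentencepiece's encoder produces the ids directly. A small sketch of what EncodeAsIds yields (piece spellings assume a LLaMA-style model, where word-initial pieces carry the ▁ marker):

    sentencepiece::SentencePieceProcessor sp;
    sp.Load("./models/tokenizer.model");
    std::vector<int> ids = sp.EncodeAsIds("Hello world");
    for (int id : ids) {
        printf("%d -> %s\n", id, sp.IdToPiece(id).c_str());  // e.g. "▁Hello", "▁world"
    }

One wrinkle: the processor is constructed and tokenizer.model reloaded on every llama_tokenize call, even though main() already holds a loaded instance; passing that one through (as llama_model_load now does with its sp parameter) would avoid the repeated disk load.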
@@ -542,3 +541,39 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     return (n/k)*row_size;
 }
+
+void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector<gpt_vocab::id> &buffids, std::vector<gpt_vocab::id> &embd)
+{
+    for (auto id : embd)
+    {
+        std::string s = sp.IdToPiece(id); // vocab.id_to_token[id];
+        if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
+        {
+            // byte-fallback piece: buffer it until the UTF-8 sequence is complete
+            buffids.push_back(id);
+            std::string txt = sp.DecodeIds(buffids);
+            // printf("buffering %s, total buffer: %s\n", s.c_str(), txt.c_str());
+        }
+        else if (s.find("▁") == 0)
+        {
+            if (!buffids.empty())
+            {
+                std::string txt = sp.DecodeIds(buffids);
+                printf("%s", txt.c_str());
+                buffids.clear();
+            }
+            s = std::regex_replace(s, std::regex("▁"), " ");
+            printf("%s", s.c_str());
+        }
+        else
+        {
+            if (!buffids.empty())
+            {
+                std::string txt = sp.DecodeIds(buffids);
+                printf("%s", txt.c_str());
+                buffids.clear();
+            }
+            printf("%s", s.c_str());
+        }
+    }
+}
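For illustration, the case the <0x..> branch handles: a character such as 欢 (UTF-8 bytes 0xE6 0xAC 0xA2) is emitted by the model as three separate byte-fallback pieces, and printing them one at a time would produce invalid UTF-8. Decoded together they reassemble the character (a sketch; assumes the tokenizer uses byte fallback, as LLaMA's does):

    std::vector<int> bytes = {
        sp.PieceToId("<0xE6>"),
        sp.PieceToId("<0xAC>"),
        sp.PieceToId("<0xA2>"),
    };
    printf("%s\n", sp.DecodeIds(bytes).c_str());  // the three buffered ids decode to "欢"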

utils.h
@@ -7,6 +7,8 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <sentencepiece_processor.h>
+#include <sstream>
 
 //
 // CLI argument parsing
@@ -102,3 +104,6 @@ void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
 size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+
+void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd);
