From ed10def70e5cb9c5d23f3d21fc239047ba919e29 Mon Sep 17 00:00:00 2001 From: wizard Date: Mon, 13 Mar 2023 17:02:06 +0800 Subject: [PATCH] clean code --- main.cpp | 5 ++- utils.cpp | 103 +++++++++++++++--------------------------------------- 2 files changed, 30 insertions(+), 78 deletions(-) diff --git a/main.cpp b/main.cpp index 5ad5dc0..820e40a 100644 --- a/main.cpp +++ b/main.cpp @@ -886,6 +886,7 @@ int main(int argc, char ** argv) { printf(ANSI_COLOR_YELLOW); } + // buffering UTF-8 tokens like <0xE6>,<0xAC><0xA2> spanning across multiple output to make it complete. std::vector buffids = {}; while (remaining_tokens > 0) { // predict @@ -949,9 +950,7 @@ int main(int argc, char ** argv) { // display text if (!input_noecho) { untokenize(sp, buffids, embd); - // for (auto id : embd) { - // printf("%s", vocab.id_to_token[id].c_str()); - // } + // reset color to default if we there is no pending user input if (params.use_color && embd_inp.size() <= input_consumed) { printf(ANSI_COLOR_RESET); diff --git a/utils.cpp b/utils.cpp index 87ee980..36cb95e 100644 --- a/utils.cpp +++ b/utils.cpp @@ -542,85 +542,38 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t return (n/k)*row_size; } -void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector & buffids, std::vector & embd) +void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector &buffids, std::vector &embd) { - // std::string output = sp.DecodeIds(embd); - // printf("%s", output.c_str()); - // return; - // Convert the IDs in embd to tokens using SentencePiece - // std::vector pieces; - // for (const auto& id : embd) { - // //std::string s = sp.DecodeIds(id); - - // //s = std::regex_replace(s, std::regex("▁"), " "); - - // // if (s.find("<0x") == 0 && s[s.length() - 1] == '>') - // // { - // // s = sp.IdToPiece(id); - // // } - // //printf("%s", s.c_str()); - - // pieces.push_back(id); - // // if(s.length() > 1) - // // tokens.push_back(" "); - // } - // // Insert spaces between tokens - // // std::string text; - // // for (const auto& token : tokens) { - // // // Add a space before the token if it is not the first token and it doesn't start with a special character - // // if (!text.empty() && !(token[0] == '\0x25' && token[1] == '\0x81') && token[0] != ' ') { - // // text += ' '; - // // } - // // text += sp.DecodePieces(tokens); - // // } - // //sp.DecodeIds(embd); - // std::string text = - // sp.DecodeIds(pieces); - - // printf("%s", text.c_str()); - - std::string buff; - for (auto id : embd) { - std::string s = sp.IdToPiece(id); //vocab.id_to_token[id]; - - if (s.find("<0x") == 0 && s[s.length() - 1] == '>') - { - buffids.push_back(id); - // Extract the hexadecimal value from the token - std::string hex_value = s.substr(s.find("0x")); - - // Convert the hexadecimal value to binary and print it - int decimal_value; - std::stringstream(hex_value) >> std::hex >> decimal_value; - std::bitset<8> binary_value(decimal_value); - - char* bytes = reinterpret_cast(&decimal_value); - buff = buff + std::string(bytes); - //printf("bufferring %s, total buffer: %s\n", s.c_str(), buff.c_str()); - } - else if(s.find("▁") == 0) + for (auto id : embd) + { + std::string s = sp.IdToPiece(id); // vocab.id_to_token[id]; + + if (s.find("<0x") == 0 && s[s.length() - 1] == '>') + { + buffids.push_back(id); + std::string txt = sp.DecodeIds(buffids); + // printf("bufferring %s, total buffer: %s\n", s.c_str(), txt.c_str()); + } + else if (s.find("▁") == 0) + { + if (!buffids.empty()) { - if(!buff.empty()) - { - std::string txt = sp.DecodeIds(buffids); - printf("%s", txt.c_str()); - buffids.clear(); - buff = ""; - } - s = std::regex_replace(s, std::regex("▁"), " "); - //s.replace(0, 2, 1, ' '); - printf("%s", s.c_str()); + std::string txt = sp.DecodeIds(buffids); + printf("%s", txt.c_str()); + buffids.clear(); } - else + s = std::regex_replace(s, std::regex("▁"), " "); + printf("%s", s.c_str()); + } + else + { + if (!buffids.empty()) { - if(!buff.empty()) - { - std::string txt = sp.DecodeIds(buffids); - printf("%s", txt.c_str()); - buffids.clear(); - buff = ""; - } - printf("%s", s.c_str()); + std::string txt = sp.DecodeIds(buffids); + printf("%s", txt.c_str()); + buffids.clear(); } + printf("%s", s.c_str()); } + } } \ No newline at end of file