From 86e967c54b126312090ac540585e9903b4d28efa Mon Sep 17 00:00:00 2001 From: wizard Date: Mon, 13 Mar 2023 13:06:01 +0800 Subject: [PATCH] buffering output for UTF-8 encoded token --- utils.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/utils.cpp b/utils.cpp index 8a75039..17cd917 100644 --- a/utils.cpp +++ b/utils.cpp @@ -544,6 +544,9 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector & embd) { + // std::string output = sp.DecodeIds(embd); + // printf("%s", output.c_str()); + // return; // Convert the IDs in embd to tokens using SentencePiece // std::vector pieces; // for (const auto& id : embd) { @@ -575,6 +578,7 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector binary_value(decimal_value); char* bytes = reinterpret_cast(&decimal_value); - printf("%s", bytes); + buff = buff + std::string(bytes); + //printf("bufferring %s, total buffer: %s\n", s.c_str(), buff.c_str()); } else if(s.find("▁") == 0) { + if(!buff.empty()) + { + printf("%s", buff.c_str()); + buff = ""; + } s = std::regex_replace(s, std::regex("▁"), " "); //s.replace(0, 2, 1, ' '); printf("%s", s.c_str()); } else { + if(!buff.empty()) + { + printf("%s", buff.c_str()); + buff = ""; + } printf("%s", s.c_str()); } }