From 15f06f6b4f2074da448851adfc1d887ea7cb76f0 Mon Sep 17 00:00:00 2001
From: wizard <rcvbuf@gmail.com>
Date: Mon, 13 Mar 2023 16:27:11 +0800
Subject: [PATCH] Buffer UTF-8 output so it stays complete when the output
 is split.

---
 main.cpp  |  3 ++-
 utils.cpp | 14 ++++++++++----
 utils.h   |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/main.cpp b/main.cpp
index d3debfd..5ad5dc0 100644
--- a/main.cpp
+++ b/main.cpp
@@ -886,6 +886,7 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
+    std::vector<gpt_vocab::id> buffids = {};
     while (remaining_tokens > 0) {
         // predict
         if (embd.size() > 0) {
@@ -947,7 +948,7 @@ int main(int argc, char ** argv) {
 
         // display text
         if (!input_noecho) {
-            untokenize(sp, embd);
+            untokenize(sp, buffids, embd);
             // for (auto id : embd) {
             //     printf("%s", vocab.id_to_token[id].c_str());
             // }
diff --git a/utils.cpp b/utils.cpp
index 17cd917..87ee980 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -542,7 +542,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
     return (n/k)*row_size;
 }
 
-void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd)
+void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd)
 {
     // std::string output = sp.DecodeIds(embd);
     // printf("%s", output.c_str());
@@ -578,12 +578,14 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
     // sp.DecodeIds(pieces);
 
     //     printf("%s", text.c_str());
+    
     std::string buff;
         for (auto id : embd) {
             std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
-            
+             
             if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
             {
+                buffids.push_back(id);
                 // Extract the hexadecimal value from the token
                 std::string hex_value = s.substr(s.find("0x"));
 
@@ -600,7 +602,9 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
             {
                 if(!buff.empty())
                 {
-                    printf("%s", buff.c_str());
+                    std::string txt = sp.DecodeIds(buffids);
+                    printf("%s", txt.c_str());
+                    buffids.clear();
                     buff = "";
                 }
                 s = std::regex_replace(s, std::regex("▁"), " ");
@@ -611,7 +615,9 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
             {
                 if(!buff.empty())
                 {
-                    printf("%s", buff.c_str());
+                    std::string txt = sp.DecodeIds(buffids);
+                    printf("%s", txt.c_str());
+                    buffids.clear();
                     buff = "";
                 }
                 printf("%s", s.c_str());
diff --git a/utils.h b/utils.h
index 3904133..07f1c8f 100644
--- a/utils.h
+++ b/utils.h
@@ -105,5 +105,5 @@ void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int
 size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
 
-void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd);
+void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd);