From 86e967c54b126312090ac540585e9903b4d28efa Mon Sep 17 00:00:00 2001
From: wizard <rcvbuf@gmail.com>
Date: Mon, 13 Mar 2023 13:06:01 +0800
Subject: [PATCH] buffer output for UTF-8 encoded tokens

---
 utils.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/utils.cpp b/utils.cpp
index 8a75039..17cd917 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -544,6 +544,9 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
 
 void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd)
 {
+    // std::string output = sp.DecodeIds(embd);
+    // printf("%s", output.c_str());
+    // return;
             // Convert the IDs in embd to tokens using SentencePiece
     // std::vector<gpt_vocab::id> pieces;
     // for (const auto& id : embd) {
@@ -575,6 +578,7 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
     // sp.DecodeIds(pieces);
 
     //     printf("%s", text.c_str());
+    std::string buff;
         for (auto id : embd) {
             std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
             
@@ -589,16 +593,27 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
                 std::bitset<8> binary_value(decimal_value);
                 
                 char* bytes = reinterpret_cast<char*>(&decimal_value);
-                printf("%s", bytes);
+                buff = buff + std::string(bytes);
+                //printf("buffering %s, total buffer: %s\n", s.c_str(), buff.c_str());
             }
             else if(s.find("▁") == 0)
             {
+                if(!buff.empty())
+                {
+                    printf("%s", buff.c_str());
+                    buff = "";
+                }
                 s = std::regex_replace(s, std::regex("▁"), " ");
                 //s.replace(0, 2, 1, ' ');
                 printf("%s", s.c_str());
             }
             else
             {
+                if(!buff.empty())
+                {
+                    printf("%s", buff.c_str());
+                    buff = "";
+                }
                 printf("%s", s.c_str());
             }
         }