From ed10def70e5cb9c5d23f3d21fc239047ba919e29 Mon Sep 17 00:00:00 2001
From: wizard <rcvbuf@gmail.com>
Date: Mon, 13 Mar 2023 17:02:06 +0800
Subject: [PATCH] Clean up untokenize(): remove commented-out code and the unused byte buffer

---
 main.cpp  |   5 ++-
 utils.cpp | 103 +++++++++++++++---------------------------------------
 2 files changed, 30 insertions(+), 78 deletions(-)

diff --git a/main.cpp b/main.cpp
index 5ad5dc0..820e40a 100644
--- a/main.cpp
+++ b/main.cpp
@@ -886,6 +886,7 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
+    // Ids of byte tokens such as <0xE6>, <0xAC>, <0xA2> are buffered here across iterations so that a UTF-8 character split over several outputs is decoded as a whole.
     std::vector<gpt_vocab::id> buffids = {};
     while (remaining_tokens > 0) {
         // predict
@@ -949,9 +950,7 @@ int main(int argc, char ** argv) {
         // display text
         if (!input_noecho) {
             untokenize(sp, buffids, embd);
-            // for (auto id : embd) {
-            //     printf("%s", vocab.id_to_token[id].c_str());
-            // }
+
             // reset color to default if we there is no pending user input
             if (params.use_color && embd_inp.size() <= input_consumed) {
                 printf(ANSI_COLOR_RESET);
diff --git a/utils.cpp b/utils.cpp
index 87ee980..36cb95e 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -542,85 +542,38 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
     return (n/k)*row_size;
 }
 
-void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd)
+void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector<gpt_vocab::id> &buffids, std::vector<gpt_vocab::id> &embd)
 {
-    // std::string output = sp.DecodeIds(embd);
-    // printf("%s", output.c_str());
-    // return;
-            // Convert the IDs in embd to tokens using SentencePiece
-    // std::vector<gpt_vocab::id> pieces;
-    // for (const auto& id : embd) {
-    //    //std::string s = sp.DecodeIds(id);
-
-    //     //s = std::regex_replace(s, std::regex("▁"), " ");
-
-    //     // if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
-    //     // {
-    //     //     s = sp.IdToPiece(id);
-    //     // }
-    //     //printf("%s", s.c_str());
-
-    //     pieces.push_back(id);
-    //     // if(s.length() > 1)
-    //     //     tokens.push_back(" ");
-    // }
-    // // Insert spaces between tokens
-    // // std::string text;
-    // // for (const auto& token : tokens) {
-    // //     // Add a space before the token if it is not the first token and it doesn't start with a special character
-    // //     if (!text.empty() && !(token[0] == '\0x25' && token[1] == '\0x81') && token[0] != ' ') {
-    // //         text += ' ';
-    // //     }
-    // //     text += sp.DecodePieces(tokens);
-    // // }
-    // //sp.DecodeIds(embd);
-    // std::string text =
-    // sp.DecodeIds(pieces);
-
-    //     printf("%s", text.c_str());
-    
-    std::string buff;
-        for (auto id : embd) {
-            std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
-             
-            if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
-            {
-                buffids.push_back(id);
-                // Extract the hexadecimal value from the token
-                std::string hex_value = s.substr(s.find("0x"));
-
-                // Convert the hexadecimal value to binary and print it
-                int decimal_value;
-                std::stringstream(hex_value) >> std::hex >> decimal_value;
-                std::bitset<8> binary_value(decimal_value);
-                
-                char* bytes = reinterpret_cast<char*>(&decimal_value);
-                buff = buff + std::string(bytes);
-                //printf("bufferring %s, total buffer: %s\n", s.c_str(), buff.c_str());
-            }
-            else if(s.find("▁") == 0)
+    for (auto id : embd)
+    {
+        std::string s = sp.IdToPiece(id); // vocab.id_to_token[id];
+
+        if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
+        {
+            buffids.push_back(id);
+            std::string txt = sp.DecodeIds(buffids);
+            // printf("buffering %s, total buffer: %s\n", s.c_str(), txt.c_str());
+        }
+        else if (s.find("▁") == 0)
+        {
+            if (!buffids.empty())
             {
-                if(!buff.empty())
-                {
-                    std::string txt = sp.DecodeIds(buffids);
-                    printf("%s", txt.c_str());
-                    buffids.clear();
-                    buff = "";
-                }
-                s = std::regex_replace(s, std::regex("▁"), " ");
-                //s.replace(0, 2, 1, ' ');
-                printf("%s", s.c_str());
+                std::string txt = sp.DecodeIds(buffids);
+                printf("%s", txt.c_str());
+                buffids.clear();
             }
-            else
+            s = std::regex_replace(s, std::regex("▁"), " ");
+            printf("%s", s.c_str());
+        }
+        else
+        {
+            if (!buffids.empty())
             {
-                if(!buff.empty())
-                {
-                    std::string txt = sp.DecodeIds(buffids);
-                    printf("%s", txt.c_str());
-                    buffids.clear();
-                    buff = "";
-                }
-                printf("%s", s.c_str());
+                std::string txt = sp.DecodeIds(buffids);
+                printf("%s", txt.c_str());
+                buffids.clear();
             }
+            printf("%s", s.c_str());
         }
+    }
 }
\ No newline at end of file