|
|
|
@ -473,56 +473,15 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// compute similarity between two strings using Levenshtein distance
|
|
|
|
|
float similarity(const std::string & s0, const std::string & s1) {
|
|
|
|
|
const size_t len0 = s0.size() + 1;
|
|
|
|
|
const size_t len1 = s1.size() + 1;
|
|
|
|
|
|
|
|
|
|
std::vector<int> col(len1, 0);
|
|
|
|
|
std::vector<int> prevCol(len1, 0);
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < len1; i++) {
|
|
|
|
|
prevCol[i] = i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < len0; i++) {
|
|
|
|
|
col[0] = i;
|
|
|
|
|
for (size_t j = 1; j < len1; j++) {
|
|
|
|
|
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
|
|
|
|
|
}
|
|
|
|
|
col.swap(prevCol);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const float dist = prevCol[len1 - 1];
|
|
|
|
|
|
|
|
|
|
return 1.0f - (dist / std::max(s0.size(), s1.size()));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// generated with ChatGPT
|
|
|
|
|
std::map<std::string, std::string> k_prompts = {
|
|
|
|
|
{ "Santa",
|
|
|
|
|
R"(Kid: Hi Santa! Are you real?
|
|
|
|
|
Santa: Of course I am, my dear! Ho ho ho!
|
|
|
|
|
Kid: Can you please bring me a new toy for Christmas?
|
|
|
|
|
Santa: I'll see what I can do, but you have to make sure to be a good boy or girl and listen to your parents.
|
|
|
|
|
Kid: I will, Santa! Thank you!
|
|
|
|
|
Santa: You're welcome, little one. Merry Christmas! Ho ho ho!
|
|
|
|
|
Kid: Can you tell me how you deliver all the presents to all the kids in the world in one night?
|
|
|
|
|
Santa: It's a secret, but I have a lot of help from my elves and my magical sleigh. And I have a special route that I follow to make sure I visit every child.
|
|
|
|
|
Kid: Wow, that's amazing! Can I please have a ride in your sleigh sometime?
|
|
|
|
|
Santa: I'm sorry, but only good boys and girls get to ride in my sleigh.
|
|
|
|
|
)" },
|
|
|
|
|
{ "Kid",
|
|
|
|
|
R"(Kid: Hi Santa! Are you real?
|
|
|
|
|
Santa: Of course I am, my dear! Ho ho ho!
|
|
|
|
|
Kid: Can you please bring me a new toy for Christmas?
|
|
|
|
|
Santa: I'll see what I can do, but you have to make sure to be a good boy or girl and listen to your parents.
|
|
|
|
|
Kid: I will, Santa! Thank you!
|
|
|
|
|
Kid: Can you tell me how you deliver all the presents to all the kids in the world in one night?
|
|
|
|
|
Santa: It's a secret, but I have a lot of help from my elves and my magical sleigh. And I have a special route that I follow to make sure I visit every child.
|
|
|
|
|
Kid: Wow, that's amazing! Can I please have a ride in your sleigh sometime?
|
|
|
|
|
)" },
|
|
|
|
|
};
|
|
|
|
|
const std::string k_prompt =
|
|
|
|
|
R"(This is a dialogue between {0} (A) and a person (B). The dialogue so far is:
|
|
|
|
|
|
|
|
|
|
B: Hello {0}, how are you?
|
|
|
|
|
A: I'm fine, thank you.
|
|
|
|
|
{1}
|
|
|
|
|
Here is how {0} (A) continues the dialogue:
|
|
|
|
|
|
|
|
|
|
A:)";
|
|
|
|
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
|
|
whisper_params params;
|
|
|
|
@ -579,7 +538,7 @@ int main(int argc, char ** argv) {
|
|
|
|
|
int n_iter = 0;
|
|
|
|
|
|
|
|
|
|
bool is_running = true;
|
|
|
|
|
bool force_speak = params.person == "Kid";
|
|
|
|
|
bool force_speak = false;
|
|
|
|
|
|
|
|
|
|
float prob0 = 0.0f;
|
|
|
|
|
float prob = 0.0f;
|
|
|
|
@ -587,19 +546,13 @@ int main(int argc, char ** argv) {
|
|
|
|
|
std::vector<float> pcmf32_cur;
|
|
|
|
|
std::vector<float> pcmf32_prompt;
|
|
|
|
|
|
|
|
|
|
if (k_prompts.find(params.person) == k_prompts.end()) {
|
|
|
|
|
fprintf(stderr, "%s: unknown person '%s'\n", __func__, params.person.c_str());
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
gpt2_set_prompt(ctx_gpt, "");
|
|
|
|
|
|
|
|
|
|
gpt2_set_prompt(ctx_gpt, k_prompts.at(params.person).c_str());
|
|
|
|
|
const int voice_id = rand()%6;
|
|
|
|
|
|
|
|
|
|
const std::string person_other = params.person == "Santa" ? "Kid" : "Santa";
|
|
|
|
|
const int voice_id = params.person == "Santa" ? 5 : 2;
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "gpt-2: prompt_base:\n");
|
|
|
|
|
fprintf(stderr, "gpt-2: prompt:\n");
|
|
|
|
|
fprintf(stderr, "========================\n\n");
|
|
|
|
|
fprintf(stderr, "%s\n", gpt2_get_prompt(ctx_gpt));
|
|
|
|
|
fprintf(stderr, "%s\n", ::replace(k_prompt, "{0}", params.person).c_str());
|
|
|
|
|
fprintf(stderr, "========================\n\n");
|
|
|
|
|
|
|
|
|
|
// main loop
|
|
|
|
@ -636,13 +589,12 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
audio.get(params.voice_ms, pcmf32_cur);
|
|
|
|
|
|
|
|
|
|
std::string text_heard = "Hey little one, what do you want for Christmas?";
|
|
|
|
|
std::string text_heard = "";
|
|
|
|
|
|
|
|
|
|
if (!force_speak) {
|
|
|
|
|
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
force_speak = false;
|
|
|
|
|
|
|
|
|
|
// remove text between brackets using regex
|
|
|
|
|
{
|
|
|
|
|
std::regex re("\\[.*?\\]");
|
|
|
|
@ -667,13 +619,15 @@ int main(int argc, char ** argv) {
|
|
|
|
|
|
|
|
|
|
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
|
|
|
|
|
|
|
|
|
|
if (text_heard.empty() || tokens.empty()) {
|
|
|
|
|
if (text_heard.empty() || tokens.empty() || force_speak) {
|
|
|
|
|
fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
|
|
|
|
|
audio.clear();
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
force_speak = false;
|
|
|
|
|
|
|
|
|
|
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
|
|
|
|
|
|
|
|
|
|
std::string prompt_base = gpt2_get_prompt(ctx_gpt);
|
|
|
|
@ -681,9 +635,11 @@ int main(int argc, char ** argv) {
|
|
|
|
|
std::string text_to_speak;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
text_heard = person_other + ": " + text_heard;
|
|
|
|
|
prompt_base += "B: " + text_heard + "\n";
|
|
|
|
|
|
|
|
|
|
text_to_speak = gpt2_gen_text(ctx_gpt, (prompt_base + text_heard + "\n").c_str(), params.max_tokens);
|
|
|
|
|
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
|
|
|
|
|
|
|
|
|
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
|
|
|
|
|
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
|
|
|
|
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
|
|
|
|
|
|
|
|
|
@ -703,13 +659,20 @@ int main(int argc, char ** argv) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
prompt_base += text_heard + "\n" + text_to_speak + "\n";
|
|
|
|
|
}
|
|
|
|
|
prompt_base += "A:" + text_to_speak + "\n";
|
|
|
|
|
|
|
|
|
|
printf("%s\n", text_to_speak.c_str());
|
|
|
|
|
{
|
|
|
|
|
prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
|
|
|
|
|
|
|
|
|
printf("===============\n");
|
|
|
|
|
printf("prompt:\n");
|
|
|
|
|
printf("%s\n", prompt.c_str());
|
|
|
|
|
printf("===============\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//printf("========================\n");
|
|
|
|
|
//printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
|
|
|
|
|
//printf("gpt-2: prompt_base:\n%s\n", prompt_base.c_str());
|
|
|
|
|
//printf("========================\n");
|
|
|
|
|
|
|
|
|
|
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
|
|
|
|