|
|
|
@ -11,7 +11,6 @@
|
|
|
|
|
#include <SDL.h>
|
|
|
|
|
#include <SDL_audio.h>
|
|
|
|
|
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <sstream>
|
|
|
|
|
#include <cassert>
|
|
|
|
|
#include <cstdio>
|
|
|
|
@ -515,6 +514,35 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
|
|
|
|
|
return allowed_commands;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Splits txt into whitespace-separated words.
//
// Words are extracted with the stream extraction operator, so any run of
// whitespace (spaces, tabs, newlines) acts as a single separator and leading
// or trailing whitespace never produces an empty word.
//
// Returns the words in the order they appear; the result is empty when txt is
// empty or contains only whitespace.
std::vector<std::string> get_words(const std::string &txt) {
    std::vector<std::string> words;

    std::istringstream iss(txt);
    std::string word;
    while (iss >> word) {
        words.push_back(word);
    }

    return words;
}
|
|
|
|
|
|
|
|
|
|
// returns true if no exit event was received
|
|
|
|
|
bool process_sdl_events() {
|
|
|
|
|
SDL_Event event;
|
|
|
|
|
while (SDL_PollEvent(&event)) {
|
|
|
|
|
switch (event.type) {
|
|
|
|
|
case SDL_QUIT:
|
|
|
|
|
{
|
|
|
|
|
return false;
|
|
|
|
|
} break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// command-list mode
|
|
|
|
|
// guide the transcription to match the most likely command from a provided list
|
|
|
|
|
int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
|
|
|
@ -606,23 +634,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
|
|
|
|
|
// main loop
|
|
|
|
|
while (is_running) {
|
|
|
|
|
// handle Ctrl + C
|
|
|
|
|
{
|
|
|
|
|
SDL_Event event;
|
|
|
|
|
while (SDL_PollEvent(&event)) {
|
|
|
|
|
switch (event.type) {
|
|
|
|
|
case SDL_QUIT:
|
|
|
|
|
{
|
|
|
|
|
is_running = false;
|
|
|
|
|
} break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!is_running) {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
is_running = process_sdl_events();
|
|
|
|
|
|
|
|
|
|
// delay
|
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
@ -718,6 +730,84 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// always-prompt mode
|
|
|
|
|
// transcribe the voice into text after valid prompt
|
|
|
|
|
int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
|
|
|
|
|
bool is_running = true;
|
|
|
|
|
bool ask_prompt = true;
|
|
|
|
|
|
|
|
|
|
float prob = 0.0f;
|
|
|
|
|
|
|
|
|
|
std::vector<float> pcmf32_cur;
|
|
|
|
|
|
|
|
|
|
const std::string k_prompt = params.prompt;
|
|
|
|
|
|
|
|
|
|
const int k_prompt_length = get_words(k_prompt).size();
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
fprintf(stderr, "%s: always-prompt mode\n", __func__);
|
|
|
|
|
|
|
|
|
|
// main loop
|
|
|
|
|
while (is_running) {
|
|
|
|
|
// handle Ctrl + C
|
|
|
|
|
is_running = process_sdl_events();
|
|
|
|
|
|
|
|
|
|
// delay
|
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
|
|
|
|
|
|
if (ask_prompt) {
|
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
|
fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
|
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
|
|
|
|
|
|
ask_prompt = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
audio.get(2000, pcmf32_cur);
|
|
|
|
|
|
|
|
|
|
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
|
|
|
|
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
|
|
|
|
|
|
|
|
|
int64_t t_ms = 0;
|
|
|
|
|
|
|
|
|
|
// detect the commands
|
|
|
|
|
audio.get(params.command_ms, pcmf32_cur);
|
|
|
|
|
|
|
|
|
|
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
|
|
|
|
|
|
|
|
|
|
const auto words = get_words(txt);
|
|
|
|
|
|
|
|
|
|
std::string prompt;
|
|
|
|
|
std::string command;
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < words.size(); ++i) {
|
|
|
|
|
if (i < k_prompt_length) {
|
|
|
|
|
prompt += words[i] + " ";
|
|
|
|
|
} else {
|
|
|
|
|
command += words[i] + " ";
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const float sim = similarity(prompt, k_prompt);
|
|
|
|
|
|
|
|
|
|
//debug
|
|
|
|
|
//fprintf(stdout, "command size: %i\n", command_length);
|
|
|
|
|
|
|
|
|
|
if ((sim > 0.7f) && (command.size() > 0)) {
|
|
|
|
|
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
|
|
|
|
|
|
audio.clear();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// general-purpose mode
|
|
|
|
|
// freely transcribe the voice into text
|
|
|
|
|
int process_general_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
|
|
|
@ -739,23 +829,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
|
|
|
|
|
// main loop
|
|
|
|
|
while (is_running) {
|
|
|
|
|
// handle Ctrl + C
|
|
|
|
|
{
|
|
|
|
|
SDL_Event event;
|
|
|
|
|
while (SDL_PollEvent(&event)) {
|
|
|
|
|
switch (event.type) {
|
|
|
|
|
case SDL_QUIT:
|
|
|
|
|
{
|
|
|
|
|
is_running = false;
|
|
|
|
|
} break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!is_running) {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
is_running = process_sdl_events();
|
|
|
|
|
|
|
|
|
|
// delay
|
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
@ -842,115 +916,6 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// always prompt mode
|
|
|
|
|
// transcribe the voice into text after valid prompt
|
|
|
|
|
int always_prompt_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
|
|
|
|
bool is_running = true;
|
|
|
|
|
bool ask_prompt = true;
|
|
|
|
|
|
|
|
|
|
float prob = 0.0f;
|
|
|
|
|
|
|
|
|
|
std::vector<float> pcmf32_cur;
|
|
|
|
|
|
|
|
|
|
const std::string k_prompt = params.prompt;
|
|
|
|
|
|
|
|
|
|
std::vector<std::string> words;
|
|
|
|
|
|
|
|
|
|
std::istringstream iss(k_prompt);
|
|
|
|
|
std::string word;
|
|
|
|
|
|
|
|
|
|
while (iss >> word) {
|
|
|
|
|
words.push_back(word);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int k_prompt_length = words.size();
|
|
|
|
|
|
|
|
|
|
// main loop
|
|
|
|
|
while (is_running) {
|
|
|
|
|
// handle Ctrl + C
|
|
|
|
|
{
|
|
|
|
|
SDL_Event event;
|
|
|
|
|
while (SDL_PollEvent(&event)) {
|
|
|
|
|
switch (event.type) {
|
|
|
|
|
case SDL_QUIT:
|
|
|
|
|
{
|
|
|
|
|
is_running = false;
|
|
|
|
|
} break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!is_running) {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// delay
|
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
|
|
|
|
|
|
if (ask_prompt) {
|
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
|
fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
|
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
|
|
|
|
|
|
ask_prompt = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
audio.get(2000, pcmf32_cur);
|
|
|
|
|
|
|
|
|
|
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
|
|
|
|
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
|
|
|
|
|
|
|
|
|
int64_t t_ms = 0;
|
|
|
|
|
|
|
|
|
|
// detect the commands
|
|
|
|
|
audio.get(params.command_ms, pcmf32_cur);
|
|
|
|
|
|
|
|
|
|
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
|
|
|
|
|
|
|
|
|
|
std::istringstream iss(txt);
|
|
|
|
|
std::string word;
|
|
|
|
|
std::string prompt;
|
|
|
|
|
std::string command;
|
|
|
|
|
int i = 0;
|
|
|
|
|
int command_length = 0;
|
|
|
|
|
while (iss >> word) {
|
|
|
|
|
if (i == k_prompt_length - 1) {
|
|
|
|
|
prompt += word + ' ';
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
prompt += word + ' ';
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
while (iss >> word) {
|
|
|
|
|
command += word + ' ';
|
|
|
|
|
command_length++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const float sim = similarity(prompt, k_prompt);
|
|
|
|
|
|
|
|
|
|
//debug
|
|
|
|
|
//fprintf(stdout, "command size: %i\n", command_length);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ((sim > 0.7f) && (command_length >0)){
|
|
|
|
|
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
audio.clear();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
|
|
whisper_params params;
|
|
|
|
|
|
|
|
|
|