talk : ready for beta testing

pull/155/head
Georgi Gerganov 3 years ago
parent b796c29f40
commit 49b80bab48
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

File diff suppressed because one or more lines are too long

@ -949,7 +949,7 @@ bool gpt2_eval(
/////////////////////////////// GPT-2 END //////////////////////////////// /////////////////////////////// GPT-2 END ////////////////////////////////
constexpr int N_THREAD = 8; constexpr int N_THREAD = 7;
struct gpt2_state { struct gpt2_state {
std::string prompt_base = R"(Hello, how are you? std::string prompt_base = R"(Hello, how are you?
@ -986,7 +986,10 @@ std::mutex g_mutex;
std::thread g_worker; std::thread g_worker;
std::atomic<bool> g_running(false); std::atomic<bool> g_running(false);
bool g_force_speak = false;
std::string g_text_to_speak = ""; std::string g_text_to_speak = "";
std::string g_status = "idle";
std::string g_status_forced = "";
std::string gpt2_gen_text(const std::string & prompt) { std::string gpt2_gen_text(const std::string & prompt) {
int n_past = 0; int n_past = 0;
@ -1044,7 +1047,14 @@ std::string gpt2_gen_text(const std::string & prompt) {
return result; return result;
} }
// Publish the worker's current state string.
// The write is serialized on g_mutex, the same mutex the "get_status" binding takes when reading g_status.
void talk_set_status(const std::string & status) {
    const std::lock_guard<std::mutex> guard(g_mutex);
    g_status = status;
}
void talk_main(size_t index) { void talk_main(size_t index) {
talk_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
@ -1058,7 +1068,7 @@ void talk_main(size_t index) {
wparams.print_special_tokens = false; wparams.print_special_tokens = false;
wparams.max_tokens = 32; wparams.max_tokens = 32;
wparams.audio_ctx = 512; wparams.audio_ctx = 768;
wparams.language = "en"; wparams.language = "en";
@ -1082,12 +1092,16 @@ void talk_main(size_t index) {
auto & ctx = g_contexts[index]; auto & ctx = g_contexts[index];
const int64_t step_samples = 5*WHISPER_SAMPLE_RATE; const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE; const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
auto t_last = std::chrono::high_resolution_clock::now(); auto t_last = std::chrono::high_resolution_clock::now();
talk_set_status("listening ...");
while (g_running) { while (g_running) {
const auto t_now = std::chrono::high_resolution_clock::now(); const auto t_now = std::chrono::high_resolution_clock::now();
if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) { if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) {
{ {
@ -1098,6 +1112,8 @@ void talk_main(size_t index) {
continue; continue;
} }
talk_set_status("listening ...");
{ {
std::unique_lock<std::mutex> lock(g_mutex); std::unique_lock<std::mutex> lock(g_mutex);
@ -1109,7 +1125,7 @@ void talk_main(size_t index) {
continue; continue;
} }
pcmf32 = std::vector<float>(g_pcmf32.end() - step_samples, g_pcmf32.end()); pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
} }
// if energy in during last second is above threshold, then skip // if energy in during last second is above threshold, then skip
@ -1128,12 +1144,16 @@ void talk_main(size_t index) {
energy_all /= pcmf32.size(); energy_all /= pcmf32.size();
energy_1s /= WHISPER_SAMPLE_RATE; energy_1s /= WHISPER_SAMPLE_RATE;
if (energy_1s > 0.1f*energy_all) { if (energy_1s > 0.1f*energy_all && !g_force_speak) {
std::this_thread::sleep_for(std::chrono::milliseconds(10)); std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue; continue;
} }
} }
talk_set_status("processing ...");
g_force_speak = false;
t_last = t_now; t_last = t_now;
{ {
@ -1187,17 +1207,21 @@ void talk_main(size_t index) {
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), ""); text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), ""); text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
talk_set_status("'" + text_heard + "' - thinking how to respond ...");
const std::vector<gpt_vocab::id> tokens = ::gpt_tokenize(g_gpt2.vocab, text_heard); const std::vector<gpt_vocab::id> tokens = ::gpt_tokenize(g_gpt2.vocab, text_heard);
printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str()); printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
std::string text_to_speak; std::string text_to_speak;
if (tokens.size() > 2) { if (tokens.size() > 0) {
text_to_speak = gpt2_gen_text(text_heard + "\n"); text_to_speak = gpt2_gen_text(text_heard + "\n");
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n")); text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
std::lock_guard<std::mutex> lock(g_mutex);
// remove first 2 lines of base prompt // remove first 2 lines of base prompt
{ {
const size_t pos = g_gpt2.prompt_base.find_first_of("\n"); const size_t pos = g_gpt2.prompt_base.find_first_of("\n");
@ -1217,6 +1241,8 @@ void talk_main(size_t index) {
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n")); text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
std::lock_guard<std::mutex> lock(g_mutex);
const size_t pos = g_gpt2.prompt_base.find_first_of("\n"); const size_t pos = g_gpt2.prompt_base.find_first_of("\n");
if (pos != std::string::npos) { if (pos != std::string::npos) {
g_gpt2.prompt_base = g_gpt2.prompt_base.substr(pos + 1); g_gpt2.prompt_base = g_gpt2.prompt_base.substr(pos + 1);
@ -1226,15 +1252,18 @@ void talk_main(size_t index) {
printf("gpt-2: %s\n", text_to_speak.c_str()); printf("gpt-2: %s\n", text_to_speak.c_str());
printf("========================\n"); //printf("========================\n");
printf("gpt-2: prompt_base:\n'%s'\n", g_gpt2.prompt_base.c_str()); //printf("gpt-2: prompt_base:\n'%s'\n", g_gpt2.prompt_base.c_str());
printf("========================\n"); //printf("========================\n");
{ {
std::lock_guard<std::mutex> lock(g_mutex); std::lock_guard<std::mutex> lock(g_mutex);
t_last = std::chrono::high_resolution_clock::now(); t_last = std::chrono::high_resolution_clock::now();
g_text_to_speak = text_to_speak; g_text_to_speak = text_to_speak;
g_pcmf32.clear();
} }
talk_set_status("speaking ...");
} }
} }
@ -1301,6 +1330,24 @@ EMSCRIPTEN_BINDINGS(talk) {
return 0; return 0;
})); }));
// JS binding: raise g_force_speak (under g_mutex) so that the energy check in talk_main
// does not skip the next step. The index argument is accepted but not used here.
emscripten::function("force_speak", emscripten::optional_override([](size_t index) {
    std::lock_guard<std::mutex> guard(g_mutex);
    g_force_speak = true;
}));
// JS binding: return a copy of the current GPT-2 base prompt.
// The copy is taken while g_mutex is held; the lock is released when the lambda returns.
emscripten::function("get_text_context", emscripten::optional_override([]() {
    std::lock_guard<std::mutex> guard(g_mutex);
    return g_gpt2.prompt_base;
}));
emscripten::function("get_text_to_speak", emscripten::optional_override([]() { emscripten::function("get_text_to_speak", emscripten::optional_override([]() {
std::string text_to_speak; std::string text_to_speak;
@ -1311,4 +1358,22 @@ EMSCRIPTEN_BINDINGS(talk) {
return text_to_speak; return text_to_speak;
})); }));
// JS binding: return the status string to display.
emscripten::function("get_status", emscripten::optional_override([]() {
    std::lock_guard<std::mutex> guard(g_mutex);
    // a non-empty status forced through "set_status" takes precedence over the worker's own status
    return g_status_forced.empty() ? g_status : g_status_forced;
}));
// JS binding: store a forced status under g_mutex.
// Passing an empty string clears it, so "get_status" falls back to g_status again.
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
    std::lock_guard<std::mutex> guard(g_mutex);
    g_status_forced = status;
}));
} }

@ -44,18 +44,8 @@
<br><br> <br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (e.g. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<hr> <hr>
<br>
<div id="model-whisper"> <div id="model-whisper">
<span id="model-whisper-status">Whisper model:</span> <span id="model-whisper-status">Whisper model:</span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
@ -82,25 +72,44 @@
<br> <br>
<div id="input_mic"> <div id="input">
<button id="start" onclick="onStart()">Start</button> <button id="start" onclick="onStart()">Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button> <button id="stop" onclick="onStop()" disabled>Stop</button>
<select id="voice" onchange="onVoiceChange()"> <select id="voice" onchange="onVoiceChange()">
<option value="0">Default</option> <option value="0">Default</option>
</select> </select>
<button id="speak" onclick="onSpeak('Hello')">Say Hello</button> <button id="speak" onclick="onSpeak('Hello')">Say hello</button>
<button id="speak" onclick="onSpeakRandom()">Say something</button>
<button id="speak" onclick="clearCache()">Clear Cache</button> <button id="speak" onclick="clearCache()">Clear Cache</button>
</div> </div>
<br> <br>
<hr><br> <div id="state">
Status: <b><span id="state-status">idle</span></b>
<br> <pre id="state-context">[The text context will be displayed here]</pre>
</div>
<hr>
<!-- textarea with height filling the rest of the page --> Debug output:
<textarea id="output" rows="20"></textarea> <textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (e.g. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<br><br> <br><br>
<div class="cell-version"> <div class="cell-version">
@ -115,16 +124,6 @@
</div> </div>
<script type='text/javascript'> <script type='text/javascript'>
// Show exactly one of the two input panels: the file panel when `input` is 'file',
// the microphone panel for any other value.
function changeInput(input) {
    var useFile = (input == 'file');
    document.getElementById('input_file').style.display = useFile ? 'block' : 'none';
    document.getElementById('input_mic').style.display = useFile ? 'none' : 'block';
}
var printTextarea = (function() { var printTextarea = (function() {
var element = document.getElementById('output'); var element = document.getElementById('output');
if (element) element.alue = ''; // clear browser cache if (element) element.alue = ''; // clear browser cache
@ -138,7 +137,6 @@
}; };
})(); })();
const kMaxAudio_s = 10;
const kRestartRecording_s = 15; const kRestartRecording_s = 15;
const kSampleRate = 16000; const kSampleRate = 16000;
@ -172,10 +170,10 @@
monitorRunDependencies: function(left) { monitorRunDependencies: function(left) {
}, },
preRun: function() { preRun: function() {
printTextarea('js: preparing ...'); printTextarea('js: Preparing ...');
}, },
postRun: function() { postRun: function() {
printTextarea('js: loaded successfully!'); printTextarea('js: Initialized successfully!');
// populate the voice list // populate the voice list
var voices = synth.getVoices(); var voices = synth.getVoices();
@ -193,9 +191,14 @@
// select random voice // select random voice
if (n > 0) { if (n > 0) {
for (var k = 0; k < 10; k++) {
var i = Math.floor(Math.random() * n); var i = Math.floor(Math.random() * n);
el.selectedIndex = i; el.selectedIndex = i;
voice = voices[document.getElementById('voice').options[i].value]; voice = voices[document.getElementById('voice').options[i].value];
// give preference to Google voices
if (voice.name.startsWith('Google')) break;
}
} }
} }
}; };
@ -443,6 +446,7 @@
var startTime = 0; var startTime = 0;
function stopRecording() { function stopRecording() {
Module.set_status("paused");
doRecording = false; doRecording = false;
audio0 = null; audio0 = null;
audio = null; audio = null;
@ -453,6 +457,8 @@
context = new AudioContext({sampleRate: 16000}); context = new AudioContext({sampleRate: 16000});
} }
Module.set_status("");
document.getElementById('start').disabled = true; document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false; document.getElementById('stop').disabled = false;
@ -566,6 +572,7 @@
synth.speak(msg); synth.speak(msg);
if (doRecording) { if (doRecording) {
Module.set_status("speaking ...");
printTextarea('js: speaking'); printTextarea('js: speaking');
stopRecording(); stopRecording();
var interval = setInterval(function() { var interval = setInterval(function() {
@ -573,16 +580,19 @@
printTextarea('js: done speaking'); printTextarea('js: done speaking');
clearInterval(interval); clearInterval(interval);
startRecording(); startRecording();
} else {
Module.set_status("");
} }
}, 100); }, 100);
} }
} }
// Click handler for the "Say something" button.
// Calls the WASM "force_speak" binding, which sets the flag that lets the worker
// bypass its energy check on the next step.
function onSpeakRandom() {
Module.force_speak(instance);
}
async function clearCache() { async function clearCache() {
if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) { if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
//const dbs = await indexedDB.databases();
//dbs.forEach(db => { indexedDB.deleteDatabase(db.name) });
indexedDB.deleteDatabase(dbName); indexedDB.deleteDatabase(dbName);
} }
} }
@ -591,7 +601,7 @@
// main // main
// //
var intervalSpeak = null; var intervalUpdate = null;
function onStart() { function onStart() {
if (!instance) { if (!instance) {
@ -609,24 +619,20 @@
startRecording(); startRecording();
intervalSpeak = setInterval(function() { intervalUpdate = setInterval(function() {
var textToSpeak = Module.get_text_to_speak(); var textToSpeak = Module.get_text_to_speak();
if (textToSpeak != null && textToSpeak.length > 1) { if (textToSpeak != null && textToSpeak.length > 1) {
onSpeak(textToSpeak); onSpeak(textToSpeak);
} }
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-context').innerHTML = Module.get_text_context();
}, 100); }, 100);
} }
function onStop() { function onStop() {
stopRecording(); stopRecording();
if (instance) {
Module.free(instance);
instance = null;
printTextarea("js: talk instance freed");
}
} }
function onVoiceChange() { function onVoiceChange() {

@ -2750,7 +2750,7 @@ int whisper_full(
} else { } else {
text += whisper_token_to_str(ctx, tokens_cur[i].id); text += whisper_token_to_str(ctx, tokens_cur[i].id);
} }
if (tokens_cur[i].id > whisper_token_beg(ctx)) { if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx)); const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
if (!text.empty()) { if (!text.empty()) {
const auto tt0 = params.speed_up ? 2*t0 : t0; const auto tt0 = params.speed_up ? 2*t0 : t0;

Loading…
Cancel
Save