Merge branch 'ggerganov:master' into master

pull/222/head
katsu560 3 years ago committed by GitHub
commit e4bd5d8ccc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -60,8 +60,7 @@ ifeq ($(UNAME_M),x86_64)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
endif
ifeq ($(UNAME_S),Linux)
else ifeq ($(UNAME_S),Linux)
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
@ -207,3 +206,11 @@ tiny.en tiny base.en base small.en small medium.en medium large: main
./main -m models/ggml-$@.bin -f $$f ; \
echo "" ; \
done
#
# Tests
#
.PHONY: tests
tests:
bash ./tests/run-tests.sh

@ -36,7 +36,7 @@ As an example, here is a video of running the model on an iPhone 13 device - ful
https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
You can also easily make your own offline voice assistant application:
You can also easily make your own offline voice assistant application: [command](examples/command)
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
@ -454,6 +454,25 @@ in [models](models).
## Examples
There are various examples of using the library for different projects in the [examples](examples) folder. Check them out!
## [Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126)
There are various examples of using the library for different projects in the [examples](examples) folder.
Some of the examples are even ported to run in the browser using WebAssembly. Check them out!
| Example | Web | Description |
| --- | --- | --- |
| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
| [bench](examples/bench) | | Benchmark the performance of Whisper on your machine |
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot in your browser |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
If you have any kind of feedback about this project feel free to use the Discussions section and open a new topic.
You can use the [Show and tell](https://github.com/ggerganov/whisper.cpp/discussions/categories/show-and-tell) category
to share your own projects that use `whisper.cpp`. If you have a question, make sure to check the
[Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126) discussion.

@ -21,6 +21,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
add_subdirectory(whisper.wasm)
add_subdirectory(stream.wasm)
add_subdirectory(command.wasm)
add_subdirectory(talk.wasm)
else()
add_subdirectory(main)

@ -0,0 +1,47 @@
#
# libcommand
#
set(TARGET libcommand)
add_executable(${TARGET}
emscripten.cpp
)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside command.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libcommand.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# command.wasm
#
set(TARGET command.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

@ -0,0 +1,23 @@
# command.wasm
This is a basic Voice Assistant example that accepts voice commands from the microphone.
It runs in fully in the browser via WebAseembly.
Online demo: https://whisper.ggerganov.com/command/
Terminal version: [examples/command](/examples/command)
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/command.wasm/* /path/to/html/
cp bin/libcommand.worker.js /path/to/html/
```

@ -0,0 +1,408 @@
#include "ggml.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
constexpr int N_THREAD = 8;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
std::string g_status = "";
std::string g_status_forced = "";
std::string g_transcribed = "";
std::vector<float> g_pcmf32;
static std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, "");
}
static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
const float rc = 1.0f / (2.0f * M_PI * cutoff);
const float dt = 1.0f / sample_rate;
const float alpha = dt / (rc + dt);
float y = data[0];
for (size_t i = 1; i < data.size(); i++) {
y = alpha * (y + data[i] - data[i - 1]);
data[i] = y;
}
}
// compute similarity between two strings using Levenshtein distance
static float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
void command_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
const int n_samples = pcmf32.size();
const int n_samples_last = (sample_rate * last_ms) / 1000;
if (n_samples_last >= n_samples) {
// not enough samples - assume no speech
return false;
}
if (freq_thold > 0.0f) {
high_pass_filter(pcmf32, freq_thold, sample_rate);
}
float energy_all = 0.0f;
float energy_last = 0.0f;
for (size_t i = 0; i < n_samples; i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= n_samples - n_samples_last) {
energy_last += fabsf(pcmf32[i]);
}
}
energy_all /= n_samples;
energy_last /= n_samples_last;
if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
}
if (energy_last > vad_thold*energy_all) {
return false;
}
return true;
}
std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
const int64_t n_samples = (ms * sample_rate) / 1000;
int64_t n_take = 0;
if (g_pcmf32.size() < n_samples) {
n_take = g_pcmf32.size();
} else {
n_take = n_samples;
}
audio.resize(n_take);
std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
}
void command_main(size_t index) {
command_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.language = "en";
printf("command: using %d threads\n", wparams.n_threads);
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;
bool print_energy = false;
float prob0 = 0.0f;
float prob = 0.0f;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string k_prompt = "Ok Whisper, start listening for commands.";
// whisper context
auto & ctx = g_contexts[index];
const int32_t vad_ms = 2000;
const int32_t prompt_ms = 5000;
const int32_t command_ms = 4000;
const float vad_thold = 0.1f;
const float freq_thold = -1.0f;
while (g_running) {
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
command_set_status(txt);
}
ask_prompt = false;
}
int64_t t_ms = 0;
{
command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
command_set_status("Speech detected! Processing ...");
if (!have_prompt) {
command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float sim = similarity(txt, k_prompt);
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
ask_prompt = true;
} else {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
command_set_status(txt);
}
// save the audio for the prompt
pcmf32_prompt = pcmf32_cur;
have_prompt = true;
}
} else {
command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));
prob = 100.0f*(prob - prob0);
fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
if (sim > best_sim) {
best_sim = sim;
best_len = n;
}
}
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
command_set_status(txt);
}
{
std::lock_guard<std::mutex> lock(g_mutex);
g_transcribed = command;
}
}
g_pcmf32.clear();
}
}
}
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(command) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
command_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("get_transcribed", emscripten::optional_override([]() {
std::string transcribed;
{
std::lock_guard<std::mutex> lock(g_mutex);
transcribed = std::move(g_transcribed);
}
return transcribed;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
}

@ -0,0 +1,386 @@
<!doctype html>
<html lang="en-us">
<head>
<title>command : Voice assistant example using Whisper + WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>command : Voice assistant example using Whisper + WebAssembly</b>
<br><br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">GitHub</a>.
<br><br>
<hr>
Select the model you would like to use, click the "Start" button and follow the instructions.
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-transcribed">[The recognized voice commands will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the command instance
var instance = null;
// model name
var model_whisper = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
if (model_whisper != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = true;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// main
//
var nLines = 0;
var intervalUpdate = null;
var transcribedAll = '';
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var transcribed = Module.get_transcribed();
if (transcribed != null && transcribed.length > 1) {
transcribedAll += transcribed + '<br>';
nLines++;
// if more than 10 lines, remove the first line
if (nLines > 10) {
var i = transcribedAll.indexOf('<br>');
if (i > 0) {
transcribedAll = transcribedAll.substring(i + 4);
nLines--;
}
}
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-transcribed').innerHTML = transcribedAll;
}, 100);
}
function onStop() {
stopRecording();
}
</script>
<script type="text/javascript" src="command.js"></script>
</body>
</html>

@ -13,6 +13,8 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
Web version: [examples/command.wasm](/examples/command.wasm)
## Building
The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:

@ -535,7 +535,7 @@ int main(int argc, char ** argv) {
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;
bool ask_prompt = true;
float prob0 = 0.0f;
float prob = 0.0f;

@ -1,5 +1,16 @@
#!/bin/bash
# Simple tool to record audio from the microphone and generate a karaoke video
# Usage:
#
# cd whisper.cpp
# make
#
# ./examples/generate-karaoke.sh [model] [step_ms]
#
# Press Ctrl+C to stop recording
#
executable="./main"
model="base.en"
model_path="models/ggml-$model.bin"

@ -0,0 +1,98 @@
#!/bin/bash
set -eo pipefail
# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
# Idea by @semiformal-net
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
#
# TODO:
# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
# way to produce a continuous stream of audio chunks.
#
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
fmt=aac # the audio format extension of the stream (TODO: auto detect)
step_s=30
model="base.en"
if [ -z "$1" ]; then
echo "Usage: $0 stream_url [step_s] [model]"
echo ""
echo " Example:"
echo " $0 $url $step_s $model"
echo ""
echo "No url specified, using default: $url"
else
url="$1"
fi
if [ -n "$2" ]; then
step_s="$2"
fi
if [ -n "$3" ]; then
model="$3"
fi
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
printf "Invalid model: $model\n"
list_models
exit 1
fi
running=1
trap "running=0" SIGINT SIGTERM
printf "[+] Transcribing stream with model '$model', step_s $step_s (press Ctrl+C to stop):\n\n"
# continuous stream in native fmt (this file will grow forever!)
ffmpeg -loglevel quiet -y -re -probesize 32 -i $url -c copy /tmp/whisper-live0.${fmt} &
if [ $? -ne 0 ]; then
printf "Error: ffmpeg failed to capture audio stream\n"
exit 1
fi
printf "Buffering audio. Please wait...\n\n"
sleep $(($step_s))
# do not stop script on error
set +e
i=0
SECONDS=0
while [ $running -eq 1 ]; do
# extract the next piece from the main file above and transcode to wav. -ss sets start time and nudges it by -0.5s to catch missing words (??)
err=1
while [ $err -ne 0 ]; do
if [ $i -gt 0 ]; then
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s-1)).5 -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
else
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s)) -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
fi
err=$(cat /tmp/whisper-live.err | wc -l)
done
./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
sleep 1
done
((i=i+1))
done
killall -v ffmpeg
killall -v main

@ -607,6 +607,19 @@ int main(int argc, char ** argv) {
wparams.new_segment_callback_user_data = &user_data;
}
// example for abort mechanism
// in this example, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.encoder_begin_callback = [](struct whisper_context * ctx, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
wparams.encoder_begin_callback_user_data = &is_aborted;
}
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 10;

@ -51,7 +51,7 @@ void stream_main(size_t index) {
wparams.language = "en";
printf("stream: using %d threads\n", N_THREAD);
printf("stream: using %d threads\n", wparams.n_threads);
std::vector<float> pcmf32;

@ -100,12 +100,6 @@
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
const kRestartRecording_s = 15;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
// web audio context
var context = null;
@ -204,10 +198,17 @@
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 5000; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
@ -219,7 +220,7 @@
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: 16000,
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
@ -292,7 +293,7 @@
}
};
mediaRecorder.start(5000);
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
@ -326,7 +327,7 @@
});
}
}
}, 250);
}, 100);
}
//

@ -68,7 +68,7 @@ void talk_main(size_t index) {
g_gpt2 = gpt2_init("gpt-2.bin");
printf("talk: using %d threads\n", N_THREAD);
printf("talk: using %d threads\n", wparams.n_threads);
std::vector<float> pcmf32;

@ -160,12 +160,6 @@
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
const kRestartRecording_s = 15;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
// web audio context
var context = null;
@ -342,10 +336,17 @@
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
@ -357,7 +358,7 @@
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: 16000,
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
@ -431,7 +432,7 @@
}
};
mediaRecorder.start(250);
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
@ -466,7 +467,7 @@
});
}
}
}, 250);
}, 100);
}
//

@ -5,6 +5,10 @@ The inference runs locally, on-device.
https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
Real-time transcription demo:
https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-bca8-0e46d9da2364.mp4
## Usage
```java

@ -309,6 +309,7 @@
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
@ -336,6 +337,7 @@
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<device id="retina6_0" orientation="portrait" appearance="light"/>
<dependencies>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="System colors in document resources" minToolsVersion="11.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@ -40,7 +40,7 @@
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
<color key="textColor" systemColor="labelColor"/>
<fontDescription key="fontDescription" type="system" pointSize="20"/>
<fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@ -56,6 +56,18 @@
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
</connections>
</button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
<rect key="frame" x="199" y="191" width="156" height="49"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
<state key="normal" title="Real-time">
<color key="titleColor" systemColor="labelColor"/>
</state>
<connections>
<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
</connections>
</button>
</subviews>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
@ -64,6 +76,7 @@
</constraints>
</view>
<connections>
<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>

@ -20,6 +20,8 @@ typedef struct
{
int ggwaveId;
bool isCapturing;
bool isTranscribing;
bool isRealtime;
UILabel * labelReceived;
AudioQueueRef queue;
@ -31,6 +33,8 @@ typedef struct
float * audioBufferF32;
struct whisper_context * ctx;
void * vc;
} StateInp;
@interface ViewController : UIViewController

@ -21,9 +21,10 @@ void AudioInputCallback(void * inUserData,
@interface ViewController ()
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;
@end
@ -32,7 +33,7 @@ void AudioInputCallback(void * inUserData,
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
format->mSampleRate = 16000;
format->mSampleRate = WHISPER_SAMPLE_RATE;
format->mFormatID = kAudioFormatLinearPCM;
format->mFramesPerPacket = 1;
format->mChannelsPerFrame = 1;
@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
}
stateInp.isTranscribing = false;
stateInp.isRealtime = false;
}
-(IBAction) stopCapturing {
@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
NSLog(@"Start capturing");
stateInp.n_samples = 0;
stateInp.vc = (__bridge void *)(self);
OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
AudioInputCallback,
@ -141,67 +146,105 @@ void AudioInputCallback(void * inUserData,
- (IBAction)onTranscribePrepare:(id)sender {
_textviewResult.text = @"Processing - please wait ...";
if (stateInp.isRealtime) {
[self onRealtime:(id)sender];
}
if (stateInp.isCapturing) {
// stop capturing
[self stopCapturing];
}
}
return;
- (IBAction)onRealtime:(id)sender {
stateInp.isRealtime = !stateInp.isRealtime;
if (stateInp.isRealtime) {
[_buttonRealtime setBackgroundColor:[UIColor greenColor]];
} else {
[_buttonRealtime setBackgroundColor:[UIColor grayColor]];
}
NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}
- (IBAction)onTranscribe:(id)sender {
if (stateInp.isTranscribing) {
return;
}
NSLog(@"Processing %d samples", stateInp.n_samples);
// process captured audio
// convert I16 to F32
for (int i = 0; i < stateInp.n_samples; i++) {
stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
}
stateInp.isTranscribing = true;
// run the model
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
// dispatch the model to a background thread
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
// process captured audio
// convert I16 to F32
for (int i = 0; i < self->stateInp.n_samples; i++) {
self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
}
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special = false;
params.translate = false;
params.language = "en";
params.n_threads = 4;
params.offset_ms = 0;
// run the model
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
CFTimeInterval startTime = CACurrentMediaTime();
// get maximum number of threads on this device (max 8)
const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
NSLog(@"Failed to run the model");
_textviewResult.text = @"Failed to run the model";
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special = false;
params.translate = false;
params.language = "en";
params.n_threads = max_threads;
params.offset_ms = 0;
params.no_context = true;
params.single_segment = self->stateInp.isRealtime;
return;
}
CFTimeInterval startTime = CACurrentMediaTime();
CFTimeInterval endTime = CACurrentMediaTime();
whisper_reset_timings(self->stateInp.ctx);
// clear the text in the textview
_textviewResult.text = @"";
if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
NSLog(@"Failed to run the model");
self->_textviewResult.text = @"Failed to run the model";
int n_segments = whisper_full_n_segments(stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
return;
}
// append the text to the textview
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
}
whisper_print_timings(self->stateInp.ctx);
CFTimeInterval endTime = CACurrentMediaTime();
NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
// result text
NSString *result = @"";
// internal model timing
whisper_print_timings(stateInp.ctx);
int n_segments = whisper_full_n_segments(self->stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
// append the text to the result
result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
}
const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate;
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
// append processing time
result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time: %5.3f s]", tRecording]];
result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]];
// dispatch the result to the main thread
dispatch_async(dispatch_get_main_queue(), ^{
self->_textviewResult.text = result;
self->stateInp.isTranscribing = false;
});
});
}
//
// Callback implmentation
// Callback implementation
//
void AudioInputCallback(void * inUserData,
@ -224,6 +267,12 @@ void AudioInputCallback(void * inUserData,
if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
NSLog(@"Too much audio data, ignoring");
dispatch_async(dispatch_get_main_queue(), ^{
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
[vc stopCapturing];
});
return;
}
@ -235,6 +284,14 @@ void AudioInputCallback(void * inUserData,
// put the buffer back in the queue
AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
if (stateInp->isRealtime) {
// dipatch onTranscribe() to the main thread
dispatch_async(dispatch_get_main_queue(), ^{
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
[vc onTranscribe:nil];
});
}
}
@end

@ -225,12 +225,6 @@
}
};
const kMaxAudio_s = 120;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
// web audio context
var context = null;
@ -348,9 +342,21 @@
// audio file
//
const kMaxAudio_s = 120;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function loadAudio(event) {
if (!context) {
context = new AudioContext({sampleRate: 16000});
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
var file = event.target.files[0] || null;
@ -410,7 +416,13 @@
// update progress information
function startRecording() {
if (!context) {
context = new AudioContext({sampleRate: 16000});
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
document.getElementById('start').disabled = true;

@ -0,0 +1,132 @@
#!/usr/bin/env bash
# Small shell script to more easily automatically download and transcribe live stream VODs.
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
# Use `./transcribe-vod help` to print help info.
# MIT License
# Copyright (c) 2022 Daniils Petrovs
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
set -Eeuo pipefail
# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
msg() {
echo >&2 -e "${1-}"
}
cleanup() {
msg "Cleaning up..."
rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
}
print_help() {
echo "Usage: ./transcribe-vod <video_url>"
echo "See configurable env variables in the script"
echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
echo "Requirements: ffmpeg yt-dlp whisper"
echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
}
check_requirements() {
if ! command -v ffmpeg &>/dev/null; then
echo "ffmpeg is required (https://ffmpeg.org)."
exit 1
fi
if ! command -v yt-dlp &>/dev/null; then
echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
exit 1
fi
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
WHISPER_EXECUTABLE="./main"
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
echo "Whisper is required (https://github.com/ggerganov/whisper.cpp)."
exit 1
fi
fi
}
if [[ $# -lt 1 ]]; then
print_help
exit 1
fi
if [[ "$1" == "help" ]]; then
print_help
exit 0
fi
temp_dir="tmp"
source_url="$1"
check_requirements
msg "Downloading VOD..."
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
yt-dlp \
-f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
--embed-thumbnail \
--embed-chapters \
--xattrs \
"${source_url}" -o "${temp_dir}/vod.mp4"
msg "Extracting audio and resampling..."
ffmpeg -i "${temp_dir}/vod.mp4" \
-hide_banner \
-loglevel error \
-ar 16000 \
-ac 1 \
-c:a \
pcm_s16le -y "vod-resampled.wav"
msg "Transcribing to subtitle file..."
msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
$WHISPER_EXECUTABLE \
-m "${MODEL_PATH}" \
-l "${WHISPER_LANG}" \
-f "vod-resampled.wav" \
-t 8 \
-osrt \
--translate
msg "Embedding subtitle track..."
ffmpeg -i "${temp_dir}/vod.mp4" \
-hide_banner \
-loglevel error \
-i "vod-resampled.wav.srt" \
-c copy \
-c:s mov_text \
-y res.mp4
cleanup
msg "Done! Your finished file is ready: res.mp4"

@ -0,0 +1,30 @@
#!/bin/bash
#
# This is a helper script to deploy all WebAssembly examples to my node
# Run from the build directory:
#
# cd build-em
# ../extra/deploy-wasm.sh
#
# check if emcmake is available
if ! command -v emcmake &> /dev/null
then
echo "Error: emscripten environment is not set up"
exit
fi
emcmake cmake .. && make -j
if [ $? -ne 0 ]; then
echo "Error: build failed"
exit
fi
# copy all wasm files to the node
scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/ && scp bin/libwhisper.worker.js root@linode0:/var/www/html/whisper/
scp bin/stream.wasm/* root@linode0:/var/www/html/whisper/stream/ && scp bin/libstream.worker.js root@linode0:/var/www/html/whisper/stream/
scp bin/command.wasm/* root@linode0:/var/www/html/whisper/command/ && scp bin/libcommand.worker.js root@linode0:/var/www/html/whisper/command/
scp bin/talk.wasm/* root@linode0:/var/www/html/whisper/talk/ && scp bin/libtalk.worker.js root@linode0:/var/www/html/whisper/talk/
echo "Done"
exit

@ -147,7 +147,7 @@ static inline uint32_t fp32_to_bits(float f) {
return fp32.as_bits;
}
inline float ggml_fp16_to_fp32(ggml_fp16_t h) {
float ggml_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
@ -170,7 +170,7 @@ inline float ggml_fp16_to_fp32(ggml_fp16_t h) {
return fp32_from_bits(result);
}
inline ggml_fp16_t ggml_fp32_to_fp16(float f) {
ggml_fp16_t ggml_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;

3
tests/.gitignore vendored

@ -0,0 +1,3 @@
*.wav
*.ogg
*.wav.txt

@ -0,0 +1 @@
My fellow Americans, this day has brought terrible news and great sadness to our country. At 9 o'clock this morning, Mission Control in Houston lost contact with our space shuttle, Columbia. A short time later, debris was seen falling from the skies above Texas. The Colombians lost. There are no survivors. On board was a crew of seven. Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark, Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon, a colonel in the Israeli Air Force. These men and women assumed great risk in the service to all humanity. In an age when spaceflight has come to seem almost routine, it is easy to overlook the dangers of travel by rocket and the difficulties of navigating the fierce outer atmosphere of the Earth. These astronauts knew the dangers, and they faced them willingly, knowing they had a high and noble purpose in life. Because of their courage and daring and idealism, we will miss them all the more. All Americans today are thinking as well of the families of these men and women who have been given this sudden shock and grief. You're not alone. Our entire nation grieves with you. And those you love will always have the respect and gratitude of this country. The cause in which they died will continue. Mankind is led into the darkness beyond our world by the inspiration of discovery and the longing to understand. Our journey into space will go on. In the skies today, we saw destruction and tragedy. Yet farther than we can see, there is comfort and hope. In the words of the prophet Isaiah, "Lift your eyes and look to the heavens. Who created all these? He who brings out the starry hosts one by one and calls them each by name." Because of His great power and mighty strength, not one of them is missing. The same Creator who names the stars also knows the names of the seven souls we mourn today. The crew of the shuttle Columbia did not return safely to Earth, yet we can pray that all are safely home. May God bless the grieving families. And may God continue to bless America. [Silence]

@ -0,0 +1 @@
Henry F. Phillips from Wikipedia, the free encyclopedia at en.wikipedia.org. Henry F. Phillips from Wikipedia, the free encyclopedia. Henry F. Phillips 1890-1958, a U.S. businessman from Portland, Oregon, has the honor of having the Phillips head screw and screwdriver named after him. The importance of the cross head screw design lies in its self-centering property, useful on automated production lines that use powered screwdrivers. Phillips' major contribution was in driving the cross head concept forward to the point where it was adopted by screw makers and automobile companies. Although he received patents for the design in 1936, U.S. Patent #2,046,343, U.S. Patents #2,046,837 to #2,046,840, it was so widely copied that by 1949 Phillips lost his patent. The American Screw Company was responsible for devising a means of manufacturing the screw, and successfully patented and licensed their method. Other screw makers of the 1930s dismissed the Phillips concept since it calls for a relatively complex recessed socket shape in the head of the screw, as distinct from the simple milled slot of a slotted type screw. The Phillips Screw Company and the American Screw Company went on to devise the Pawsadrive screw, which differs from the Phillips in that it is designed to accommodate greater torque than the Phillips. An image accompanied this article, captioned "Phillips Screw Head." The following is an info box which accompanies this article. Info box, part of the series on screw drive types. Slotted, commonly erroneously flat head. Phillips, cross head. Pawsadrive, super drive. Torques. Hex, Allen. Robertson. Tri-wing. Torx set. Spanner head. Triple square, XZN. Others, poly drive, spline drive, double hex. Many images accompanied this info box. This page was last modified on the 9th of April, 2008, at 1704. All text is available under the terms of the GNU Free Documentation License. See copyrights for details. Wikipedia is a registered trademark of the Wikimedia Foundation Incorporated, a U.S. registered 501(c)(3) tax-deductible nonprofit charity. This sound file and all text in the article are licensed under the GNU Free Documentation License, available at www.gnu.org/copyleft/fdl.html.

@ -0,0 +1 @@
This is the Micro Machine Man presenting the most midget miniature motorcade of Micro Machines. Each one has dramatic details, terrific trim, precision paint jobs, plus incredible Micro Machine Pocket Playsets. There's a police station, fire station, restaurant, service station, and more. Perfect pocket portables to take anyplace. And there are many miniature playsets to play with, and each one comes with its own special edition Micro Machine vehicle and fun, fantastic features that miraculously move. Raise the boat lift at the airport marina, man the gun turret at the army base, clean your car at the car wash, raise the toll bridge. And these playsets fit together to form a Micro Machine world. Micro Machine Pocket Playsets, so tremendously tiny, so perfectly precise, so dazzlingly detailed, you'll want to pocket them all. Micro Machines are Micro Machine Pocket Playsets sold separately from Galoob. The smaller they are, the better they are.

@ -0,0 +1 @@
Hola, como están todos? Mi nombre es Julián Virrueta Mendoza y en este podcast les vengo a hablar sobre la contaminación del agua. Bueno, empezaré por decir que el ser humano no está midiendo las consecuencias de sus actos. No hay duda que uno de los mayores problemas a los que se enfrentan muchas poblaciones actualmente es la contaminación del agua. Principalmente porque como bien sabemos el agua prácticamente es fundamental para la vida, por lo que la contaminación puede ser algo muy negativo para el desarrollo tanto económico como social de los pueblos o de las poblaciones próximas en ese lugar contaminado. Los comienzos de la contaminación, como lo definen muchos expertos en la materia, la contaminación del agua es causada por las actividades humanas. Es un fenómeno ambiental de importancia, el cual se comienza a producir desde los primeros intentos de industrialización para transformarse luego en un problema tan habitual como generalizado. Generalmente la contaminación del agua se produce a través de la introducción directa o indirecta en los acuíferos o caos de agua, ríos, mares, lagos, océanos, etc. o de diversas sustancias que pueden ser consideradas como contaminantes. Pero existen dos formas principales de contaminación del agua. Una de ellas tiene que ver con la contaminación natural del agua que se corresponde con el ciclo natural de esta durante el que puede entrar en contacto con ciertos constituyentes contaminantes como sustancias minerales y orgánicas disueltas o en suspensión que se vierten en la corteza terrestre, la atmósfera y en las aguas. Pero todo esto se puede contradecir si el ser humano comía sus consecuencias, si no tirara basura a los lagos, a los ríos, no tirara botes de aceite, no contaminara. Bueno amigos, yo los invito a que no contaminen el agua y que sepan cuidar la naturaleza. Los saluda su buen amigo y compañero Julián Virreta. Nos vemos. ¡Claro!

@ -0,0 +1,125 @@
#!/bin/bash
# This scripts run the selected model agains a collection of audio files from the web.
# It downloads, converts and transcribes each file and then compares the result with the expected reference
# transcription. The comparison is performed using git's diff command and shows the differences at the character level.
# It can be used to quickly verify that the model is working as expected across a wide range of audio files.
# I.e. like an integration test. The verification is done by visual inspection of the diff output.
#
# The reference data can be for example generated using the original OpenAI Whisper implementation, or entered manually.
#
# Feel free to suggest extra audio files to add to the list.
# Make sure they are between 1-3 minutes long since we don't want to make the test too slow.
#
# Usage:
#
# ./tests/run-tests.sh <model_name>
#
cd `dirname $0`
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [ $# -eq 0 ]; then
printf "Usage: $0 [model]\n\n"
printf "No model specified. Aborting\n"
list_models
exit 1
fi
model=$1
main="../main"
if [ ! -f ../models/ggml-$model.bin ]; then
printf "Model $model not found. Aborting\n"
list_models
exit 1
fi
if [ ! -f $main ]; then
printf "Executable $main not found. Aborting\n"
exit 1
fi
# add various audio files for testing purposes here
# the order of the files is important so don't change the existing order
# when adding new files, make sure to add the expected "ref.txt" file with the correct transcript
urls_en=(
"https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg"
"https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg"
"https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav"
)
urls_es=(
"https://upload.wikimedia.org/wikipedia/commons/c/c1/La_contaminacion_del_agua.ogg"
)
urls_it=(
)
urls_pt=(
)
urls_de=(
)
urls_jp=(
)
urls_ru=(
)
function run_lang() {
lang=$1
shift
urls=("$@")
i=0
for url in "${urls[@]}"; do
echo "- [$lang] Processing '$url' ..."
ext="${url##*.}"
fname_src="$lang-${i}.${ext}"
fname_dst="$lang-${i}-16khz.wav"
if [ ! -f $fname_src ]; then
wget --quiet --show-progress -O $fname_src $url
fi
if [ ! -f $fname_dst ]; then
ffmpeg -loglevel -0 -y -i $fname_src -ar 16000 -ac 1 -c:a pcm_s16le $fname_dst
if [ $? -ne 0 ]; then
echo "Error: ffmpeg failed to convert $fname_src to $fname_dst"
exit 1
fi
fi
$main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null
git diff --no-index --word-diff=color --word-diff-regex=. $fname_dst.txt $lang-$i-ref.txt
i=$(($i+1))
done
}
run_lang "en" "${urls_en[@]}"
if [[ $model != *.en ]]; then
run_lang "es" "${urls_es[@]}"
run_lang "it" "${urls_it[@]}"
run_lang "pt" "${urls_pt[@]}"
run_lang "de" "${urls_de[@]}"
run_lang "jp" "${urls_jp[@]}"
run_lang "ru" "${urls_ru[@]}"
fi

@ -1846,7 +1846,9 @@ static bool whisper_decode(
// the most basic sampling scheme - select the top token
static whisper_token_data whisper_sample_best(
const whisper_vocab & vocab,
const float * probs) {
const float * probs,
bool force_timestamp,
bool is_initial) {
whisper_token_data result = {
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
};
@ -1869,7 +1871,18 @@ static whisper_token_data whisper_sample_best(
max_tx = std::max(max_tx, probs_id[i].first);
}
for (int i = vocab.token_beg; i < n_logits; i++) {
const auto i0 = is_initial ? vocab.token_beg + 101 : vocab.token_beg;
const auto i1 = is_initial ? vocab.token_beg + 101 : n_logits;
// the initial timestamp cannot be larger than 100
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429
if (is_initial) {
for (int i = i0; i < n_logits; ++ i) {
probs_id[i].first = -INFINITY;
}
}
for (int i = vocab.token_beg; i < i1; i++) {
sum_ts += probs_id[i].first;
if (probs_id[i].first > max_ts) {
max_ts = probs_id[i].first;
@ -1879,7 +1892,7 @@ static whisper_token_data whisper_sample_best(
// if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a
// timestamp token
if (sum_ts > max_tx) {
if (sum_ts > max_tx || force_timestamp) {
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438
for (int i = 0; i < vocab.token_beg; i++) {
probs_id[i].first = -INFINITY;
@ -1921,39 +1934,6 @@ static whisper_token_data whisper_sample_best(
return result;
}
// samples only from the timestamps tokens
static whisper_vocab::id whisper_sample_timestamp(
const whisper_vocab & vocab,
const float * probs) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, whisper_vocab::id>> probs_id;
probs_id.reserve(n_logits);
for (int i = vocab.token_beg + 1; i < n_logits; i++) {
probs_id.push_back(std::make_pair(probs[i], i));
}
const int top_k = 10;
// find the top K tokens
std::partial_sort(
probs_id.begin(),
probs_id.begin() + top_k, probs_id.end(),
[](const std::pair<double, whisper_vocab::id> & a, const std::pair<double, whisper_vocab::id> & b) {
return a.first > b.first;
});
probs_id.resize(top_k);
//printf("\n");
//for (int i = 0; i < (int) probs_id.size(); i++) {
// printf("%d: '%s' %f, %d\n", i, vocab.id_to_token.at(probs_id[i].second).c_str(), probs_id[i].first, probs_id[i].second);
//}
return probs_id[0].second;
}
// 500 -> 00:05.000
// 6000 -> 01:00.000
static std::string to_timestamp(int64_t t, bool comma = false) {
@ -2284,19 +2264,17 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
struct whisper_token_data whisper_sample_best(struct whisper_context * ctx) {
const int64_t t_start_sample_us = ggml_time_us();
// TODO: simplify
auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab));
const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), false, false);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
return res;
}
whisper_token whisper_sample_timestamp(struct whisper_context * ctx) {
struct whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial) {
const int64_t t_start_sample_us = ggml_time_us();
// TODO: simplify
auto res = whisper_sample_timestamp(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab));
const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), true, is_initial);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@ -2360,11 +2338,11 @@ whisper_token whisper_token_beg(struct whisper_context * ctx) {
return ctx->vocab.token_beg;
}
whisper_token whisper_token_translate() {
whisper_token whisper_token_translate(void) {
return whisper_vocab::token_translate;
}
whisper_token whisper_token_transcribe() {
whisper_token whisper_token_transcribe(void) {
return whisper_vocab::token_transcribe;
}
@ -2386,6 +2364,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
ctx->t_decode_us = 0;
}
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
return s.c_str();
}
////////////////////////////////////////////////////////////////////////////
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2436,6 +2429,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback_user_data =*/ nullptr,
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
};
} break;
case WHISPER_SAMPLING_BEAM_SEARCH:
@ -2482,6 +2478,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback_user_data =*/ nullptr,
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
};
} break;
}
@ -2644,6 +2643,13 @@ int whisper_full(
break;
}
if (params.encoder_begin_callback) {
if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) {
fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
break;
}
}
// encode audio features starting at offset seek
if (whisper_encode(ctx, seek, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to encode\n", __func__);
@ -2666,7 +2672,6 @@ int whisper_full(
prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
bool done = false;
int seek_delta = 100*WHISPER_CHUNK_SIZE;
// print the prompt
@ -2680,7 +2685,9 @@ int whisper_full(
int result_len = 0;
tokens_cur.clear();
for (int i = 0; i < whisper_n_text_ctx(ctx)/2 - 4; ++i) {
bool failed = false;
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to decode\n", __func__);
return 8;
@ -2697,15 +2704,19 @@ int whisper_full(
// feel free to experiment!
//
{
auto token = whisper_sample_best(ctx);
if (i == 0) {
token.tid = whisper_token_beg(ctx);
}
const auto token = (i == 0) ? whisper_sample_timestamp(ctx, true) : whisper_sample_best(ctx);
// timestamp token - update sliding window
if (token.id > whisper_token_beg(ctx)) {
seek_delta = 2*(token.id - whisper_token_beg(ctx));
const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
// do not allow to go back in time
if (seek_delta != 100*WHISPER_CHUNK_SIZE &&
seek_delta > seek_delta_new && result_len < i) {
break;
}
seek_delta = seek_delta_new;
result_len = i + 1;
}
@ -2724,8 +2735,8 @@ int whisper_full(
if (seek + seek_delta + 100 >= seek_end) {
result_len = i + 1;
} else {
// TODO: figure out how to resolve this
fprintf(stderr, "\n%s: failed to generate timestamp token - this should not happen\n\n", __func__);
failed = true;
break;
}
}
@ -2744,11 +2755,21 @@ int whisper_full(
}
}
if (done) {
// sometimes, the decoding can get stuck in a repetition loop
// this is a simple strategy to avoid such cases - we simply flag the decoding as failed and advance
// the sliding window by 1 second
if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
failed = true;
break;
}
}
if (failed) {
fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__);
seek += 100;
continue;
}
// shrink down to result_len
tokens_cur.resize(result_len);
@ -2863,7 +2884,7 @@ int whisper_full_parallel(
struct whisper_full_params params,
const float * samples,
int n_samples,
const int n_processors) {
int n_processors) {
if (n_processors == 1) {
return whisper_full(ctx, params, samples, n_samples);
}
@ -2921,10 +2942,6 @@ int whisper_full_parallel(
model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
}
const size_t memory_size =
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
}
}
@ -3044,21 +3061,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
return ctx->result_all[i_segment].tokens[i_token].p;
}
const char * whisper_print_system_info() {
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
return s.c_str();
}
// =================================================================================================
//
@ -3145,9 +3147,6 @@ static void whisper_exp_compute_token_level_timestamps(
const int64_t t0 = segment.t0;
const int64_t t1 = segment.t1;
const int s0 = timestamp_to_sample(t0, n_samples);
const int s1 = timestamp_to_sample(t1, n_samples);
const int n = tokens.size();
if (n == 0) {

@ -72,16 +72,16 @@ extern "C" {
whisper_token id; // token id
whisper_token tid; // forced timestamp token id
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token
int64_t t1; // end time of the token
int64_t t0; // start time of the token
int64_t t1; // end time of the token
float vlen; // voice length of the token
float vlen; // voice length of the token
} whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file.
@ -96,9 +96,9 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx,
const float * samples,
int n_samples,
int n_threads);
const float * samples,
int n_samples,
int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@ -106,9 +106,9 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_set_mel(
struct whisper_context * ctx,
const float * data,
int n_len,
int n_mel);
const float * data,
int n_len,
int n_mel);
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@ -116,8 +116,8 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_encode(
struct whisper_context * ctx,
int offset,
int n_threads);
int offset,
int n_threads);
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
@ -126,10 +126,10 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_decode(
struct whisper_context * ctx,
const whisper_token * tokens,
int n_tokens,
int n_past,
int n_threads);
const whisper_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
@ -137,7 +137,7 @@ extern "C" {
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
// Return the id of the specified language, returns -1 if not found
WHISPER_API int whisper_lang_id(const char * lang);
@ -162,13 +162,16 @@ extern "C" {
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
// Task tokens
WHISPER_API whisper_token whisper_token_translate ();
WHISPER_API whisper_token whisper_token_transcribe();
WHISPER_API whisper_token whisper_token_translate (void);
WHISPER_API whisper_token whisper_token_transcribe(void);
// Performance information
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
// Print system information
WHISPER_API const char * whisper_print_system_info(void);
////////////////////////////////////////////////////////////////////////////
// Available sampling strategies
@ -182,17 +185,25 @@ extern "C" {
// Use the whisper_full_...() functions to obtain the text segments
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
// Parameters for the whisper_full() function
// If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params()
struct whisper_full_params {
enum whisper_sampling_strategy strategy;
int n_threads;
int n_max_text_ctx;
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
bool translate;
bool no_context;
bool single_segment; // force single segment output (useful for streaming)
bool single_segment; // force single segment output (useful for streaming)
bool print_special;
bool print_progress;
bool print_realtime;
@ -206,8 +217,8 @@ extern "C" {
int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default)
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default)
// tokens to provide the whisper model as initial prompt
// these are prepended to any existing text context from a previous call
@ -228,6 +239,9 @@ extern "C" {
whisper_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
whisper_encoder_begin_callback encoder_begin_callback;
void * encoder_begin_callback_user_data;
};
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@ -235,20 +249,20 @@ extern "C" {
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples);
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples);
// Split the input audio in chunks and process each chunk separately using whisper_full()
// It seems this approach can offer some speedup in some cases.
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
WHISPER_API int whisper_full_parallel(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples,
const int n_processors);
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples,
int n_processors);
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
@ -275,9 +289,6 @@ extern "C" {
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
// Print system information
WHISPER_API const char * whisper_print_system_info();
#ifdef __cplusplus
}
#endif

Loading…
Cancel
Save