whisper.objc : add real-time processing (#97)

Similar to the "stream" app
pull/194/head
Georgi Gerganov 2 years ago
parent c207eed431
commit e266cb0723
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r"> <document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<device id="retina6_0" orientation="portrait" appearance="light"/> <device id="retina6_0" orientation="portrait" appearance="light"/>
<dependencies> <dependencies>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/> <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/> <capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="System colors in document resources" minToolsVersion="11.0"/> <capability name="System colors in document resources" minToolsVersion="11.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/> <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@ -40,7 +40,7 @@
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/> <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/> <color key="backgroundColor" systemColor="systemBackgroundColor"/>
<color key="textColor" systemColor="labelColor"/> <color key="textColor" systemColor="labelColor"/>
<fontDescription key="fontDescription" type="system" pointSize="20"/> <fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/> <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView> </textView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i"> <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@ -56,6 +56,18 @@
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/> <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
</connections> </connections>
</button> </button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
<rect key="frame" x="199" y="191" width="156" height="49"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
<state key="normal" title="Real-time">
<color key="titleColor" systemColor="labelColor"/>
</state>
<connections>
<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
</connections>
</button>
</subviews> </subviews>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/> <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/> <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@ -64,6 +76,7 @@
</constraints> </constraints>
</view> </view>
<connections> <connections>
<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/> <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/> <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/> <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>

@ -20,6 +20,8 @@ typedef struct
{ {
int ggwaveId; int ggwaveId;
bool isCapturing; bool isCapturing;
bool isTranscribing;
bool isRealtime;
UILabel * labelReceived; UILabel * labelReceived;
AudioQueueRef queue; AudioQueueRef queue;
@ -31,6 +33,8 @@ typedef struct
float * audioBufferF32; float * audioBufferF32;
struct whisper_context * ctx; struct whisper_context * ctx;
void * vc;
} StateInp; } StateInp;
@interface ViewController : UIViewController @interface ViewController : UIViewController

@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp; @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture; @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe; @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult; @property (weak, nonatomic) IBOutlet UITextView *textviewResult;
@end @end
@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t)); stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float)); stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
} }
stateInp.isTranscribing = false;
stateInp.isRealtime = false;
} }
-(IBAction) stopCapturing { -(IBAction) stopCapturing {
@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
NSLog(@"Start capturing"); NSLog(@"Start capturing");
stateInp.n_samples = 0; stateInp.n_samples = 0;
stateInp.vc = (__bridge void *)(self);
OSStatus status = AudioQueueNewInput(&stateInp.dataFormat, OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
AudioInputCallback, AudioInputCallback,
@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
- (IBAction)onTranscribePrepare:(id)sender { - (IBAction)onTranscribePrepare:(id)sender {
_textviewResult.text = @"Processing - please wait ..."; _textviewResult.text = @"Processing - please wait ...";
if (stateInp.isRealtime) {
[self onRealtime:(id)sender];
}
if (stateInp.isCapturing) { if (stateInp.isCapturing) {
// stop capturing
[self stopCapturing]; [self stopCapturing];
}
}
return; - (IBAction)onRealtime:(id)sender {
stateInp.isRealtime = !stateInp.isRealtime;
if (stateInp.isRealtime) {
[_buttonRealtime setBackgroundColor:[UIColor greenColor]];
} else {
[_buttonRealtime setBackgroundColor:[UIColor grayColor]];
} }
NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
} }
- (IBAction)onTranscribe:(id)sender { - (IBAction)onTranscribe:(id)sender {
if (stateInp.isTranscribing) {
return;
}
NSLog(@"Processing %d samples", stateInp.n_samples); NSLog(@"Processing %d samples", stateInp.n_samples);
// process captured audio stateInp.isTranscribing = true;
// convert I16 to F32
for (int i = 0; i < stateInp.n_samples; i++) { // dispatch the model to a background thread
stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f; dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
} // process captured audio
// convert I16 to F32
for (int i = 0; i < self->stateInp.n_samples; i++) {
self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
}
// run the model // run the model
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
params.print_realtime = true; // get maximum number of threads on this device (max 8)
params.print_progress = false; const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
params.print_timestamps = true;
params.print_special = false;
params.translate = false;
params.language = "en";
params.n_threads = 4;
params.offset_ms = 0;
CFTimeInterval startTime = CACurrentMediaTime(); params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special = false;
params.translate = false;
params.language = "en";
params.n_threads = max_threads;
params.offset_ms = 0;
params.single_segment = self->stateInp.isRealtime;
if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) { CFTimeInterval startTime = CACurrentMediaTime();
NSLog(@"Failed to run the model");
_textviewResult.text = @"Failed to run the model";
return; whisper_reset_timings(self->stateInp.ctx);
}
CFTimeInterval endTime = CACurrentMediaTime(); if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
NSLog(@"Failed to run the model");
self->_textviewResult.text = @"Failed to run the model";
// clear the text in the textview return;
_textviewResult.text = @""; }
int n_segments = whisper_full_n_segments(stateInp.ctx); whisper_print_timings(self->stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
// append the text to the textview CFTimeInterval endTime = CACurrentMediaTime();
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
} NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
// internal model timing // result text
whisper_print_timings(stateInp.ctx); NSString *result = @"";
NSLog(@"\nProcessing time: %5.3f", endTime - startTime); int n_segments = whisper_full_n_segments(self->stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]]; // append the text to the result
result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
}
// append processing time
result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
// dispatch the result to the main thread
dispatch_async(dispatch_get_main_queue(), ^{
self->_textviewResult.text = result;
self->stateInp.isTranscribing = false;
});
});
} }
// //
// Callback implmentation // Callback implementation
// //
void AudioInputCallback(void * inUserData, void AudioInputCallback(void * inUserData,
@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,
if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) { if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
NSLog(@"Too much audio data, ignoring"); NSLog(@"Too much audio data, ignoring");
dispatch_async(dispatch_get_main_queue(), ^{
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
[vc stopCapturing];
});
return; return;
} }
@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,
// put the buffer back in the queue // put the buffer back in the queue
AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL); AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
if (stateInp->isRealtime) {
// dispatch onTranscribe() to the main thread
dispatch_async(dispatch_get_main_queue(), ^{
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
[vc onTranscribe:nil];
});
}
} }
@end @end

@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
ctx->t_decode_us = 0; ctx->t_decode_us = 0;
} }
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
return s.c_str();
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) { struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2863,7 +2878,7 @@ int whisper_full_parallel(
struct whisper_full_params params, struct whisper_full_params params,
const float * samples, const float * samples,
int n_samples, int n_samples,
const int n_processors) { int n_processors) {
if (n_processors == 1) { if (n_processors == 1) {
return whisper_full(ctx, params, samples, n_samples); return whisper_full(ctx, params, samples, n_samples);
} }
@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
return ctx->result_all[i_segment].tokens[i_token].p; return ctx->result_all[i_segment].tokens[i_token].p;
} }
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
return s.c_str();
}
// ================================================================================================= // =================================================================================================
// //

@ -72,16 +72,16 @@ extern "C" {
whisper_token id; // token id whisper_token id; // token id
whisper_token tid; // forced timestamp token id whisper_token tid; // forced timestamp token id
float p; // probability of the token float p; // probability of the token
float pt; // probability of the timestamp token float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data // token-level timestamp data
// do not use if you haven't computed token-level timestamps // do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token int64_t t0; // start time of the token
int64_t t1; // end time of the token int64_t t1; // end time of the token
float vlen; // voice length of the token float vlen; // voice length of the token
} whisper_token_data; } whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file. // Allocates all memory needed for the model and loads the model from the given file.
@ -96,9 +96,9 @@ extern "C" {
// Returns 0 on success // Returns 0 on success
WHISPER_API int whisper_pcm_to_mel( WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx, struct whisper_context * ctx,
const float * samples, const float * samples,
int n_samples, int n_samples,
int n_threads); int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context. // This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@ -106,9 +106,9 @@ extern "C" {
// Returns 0 on success // Returns 0 on success
WHISPER_API int whisper_set_mel( WHISPER_API int whisper_set_mel(
struct whisper_context * ctx, struct whisper_context * ctx,
const float * data, const float * data,
int n_len, int n_len,
int n_mel); int n_mel);
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context. // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first. // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@ -116,8 +116,8 @@ extern "C" {
// Returns 0 on success // Returns 0 on success
WHISPER_API int whisper_encode( WHISPER_API int whisper_encode(
struct whisper_context * ctx, struct whisper_context * ctx,
int offset, int offset,
int n_threads); int n_threads);
// Run the Whisper decoder to obtain the logits and probabilities for the next token. // Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first. // Make sure to call whisper_encode() first.
@ -126,10 +126,10 @@ extern "C" {
// Returns 0 on success // Returns 0 on success
WHISPER_API int whisper_decode( WHISPER_API int whisper_decode(
struct whisper_context * ctx, struct whisper_context * ctx,
const whisper_token * tokens, const whisper_token * tokens,
int n_tokens, int n_tokens,
int n_past, int n_past,
int n_threads); int n_threads);
// Token sampling methods. // Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode(). // These are provided for convenience and can be used after each call to whisper_decode().
@ -169,6 +169,9 @@ extern "C" {
WHISPER_API void whisper_print_timings(struct whisper_context * ctx); WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx); WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
// Print system information
WHISPER_API const char * whisper_print_system_info(void);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Available sampling strategies // Available sampling strategies
@ -187,12 +190,12 @@ extern "C" {
int n_threads; int n_threads;
int n_max_text_ctx; int n_max_text_ctx;
int offset_ms; // start offset in ms int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms int duration_ms; // audio duration to process in ms
bool translate; bool translate;
bool no_context; bool no_context;
bool single_segment; // force single segment output (useful for streaming) bool single_segment; // force single segment output (useful for streaming)
bool print_special; bool print_special;
bool print_progress; bool print_progress;
bool print_realtime; bool print_realtime;
@ -206,8 +209,8 @@ extern "C" {
int max_tokens; // max tokens per segment (0 = no limit) int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques // [EXPERIMENTAL] speed-up techniques
bool speed_up; // speed-up the audio by 2x using Phase Vocoder bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default) int audio_ctx; // overwrite the audio context size (0 = use default)
// tokens to provide the whisper model as initial prompt // tokens to provide the whisper model as initial prompt
// these are prepended to any existing text context from a previous call // these are prepended to any existing text context from a previous call
@ -235,20 +238,20 @@ extern "C" {
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text. // Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full( WHISPER_API int whisper_full(
struct whisper_context * ctx, struct whisper_context * ctx,
struct whisper_full_params params, struct whisper_full_params params,
const float * samples, const float * samples,
int n_samples); int n_samples);
// Split the input audio in chunks and process each chunk separately using whisper_full() // Split the input audio in chunks and process each chunk separately using whisper_full()
// It seems this approach can offer some speedup in some cases. // It seems this approach can offer some speedup in some cases.
// However, the transcription accuracy can be worse at the beginning and end of each chunk. // However, the transcription accuracy can be worse at the beginning and end of each chunk.
WHISPER_API int whisper_full_parallel( WHISPER_API int whisper_full_parallel(
struct whisper_context * ctx, struct whisper_context * ctx,
struct whisper_full_params params, struct whisper_full_params params,
const float * samples, const float * samples,
int n_samples, int n_samples,
const int n_processors); int n_processors);
// Number of generated text segments. // Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph. // A segment can be a few words, a sentence, or even a paragraph.
@ -275,9 +278,6 @@ extern "C" {
// Get the probability of the specified token in the specified segment. // Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token); WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
// Print system information
WHISPER_API const char * whisper_print_system_info(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

Loading…
Cancel
Save