From e266cb072324b74710d084e59b10a1d4bcbada86 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Nov 2022 17:28:28 +0200 Subject: [PATCH] whisper.objc : add real-time processing (#97) Similar to the "stream" app --- .../whisper.objc/Base.lproj/Main.storyboard | 19 ++- .../whisper.objc/ViewController.h | 4 + .../whisper.objc/ViewController.m | 127 +++++++++++++----- whisper.cpp | 32 ++--- whisper.h | 70 +++++----- 5 files changed, 161 insertions(+), 91 deletions(-) diff --git a/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard b/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard index 5c92ba8..065ccac 100644 --- a/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard +++ b/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard @@ -1,8 +1,8 @@ - + - + @@ -40,7 +40,7 @@ - + + @@ -64,6 +76,7 @@ + diff --git a/examples/whisper.objc/whisper.objc/ViewController.h b/examples/whisper.objc/whisper.objc/ViewController.h index 3595518..e32a326 100644 --- a/examples/whisper.objc/whisper.objc/ViewController.h +++ b/examples/whisper.objc/whisper.objc/ViewController.h @@ -20,6 +20,8 @@ typedef struct { int ggwaveId; bool isCapturing; + bool isTranscribing; + bool isRealtime; UILabel * labelReceived; AudioQueueRef queue; @@ -31,6 +33,8 @@ typedef struct float * audioBufferF32; struct whisper_context * ctx; + + void * vc; } StateInp; @interface ViewController : UIViewController diff --git a/examples/whisper.objc/whisper.objc/ViewController.m b/examples/whisper.objc/whisper.objc/ViewController.m index 4804471..d294178 100644 --- a/examples/whisper.objc/whisper.objc/ViewController.m +++ b/examples/whisper.objc/whisper.objc/ViewController.m @@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData, @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp; @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture; @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe; +@property (weak, nonatomic) IBOutlet UIButton 
*buttonRealtime; @property (weak, nonatomic) IBOutlet UITextView *textviewResult; @end @@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData, stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t)); stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float)); } + + stateInp.isTranscribing = false; + stateInp.isRealtime = false; } -(IBAction) stopCapturing { @@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData, NSLog(@"Start capturing"); stateInp.n_samples = 0; + stateInp.vc = (__bridge void *)(self); OSStatus status = AudioQueueNewInput(&stateInp.dataFormat, AudioInputCallback, @@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData, - (IBAction)onTranscribePrepare:(id)sender { _textviewResult.text = @"Processing - please wait ..."; + if (stateInp.isRealtime) { + [self onRealtime:(id)sender]; + } + if (stateInp.isCapturing) { - // stop capturing [self stopCapturing]; + } +} - return; +- (IBAction)onRealtime:(id)sender { + stateInp.isRealtime = !stateInp.isRealtime; + + if (stateInp.isRealtime) { + [_buttonRealtime setBackgroundColor:[UIColor greenColor]]; + } else { + [_buttonRealtime setBackgroundColor:[UIColor grayColor]]; } + + NSLog(@"Realtime: %@", stateInp.isRealtime ? 
@"ON" : @"OFF"); } - (IBAction)onTranscribe:(id)sender { + if (stateInp.isTranscribing) { + return; + } + NSLog(@"Processing %d samples", stateInp.n_samples); - // process captured audio - // convert I16 to F32 - for (int i = 0; i < stateInp.n_samples; i++) { - stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f; - } + stateInp.isTranscribing = true; + + // dispatch the model to a background thread + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + // process captured audio + // convert I16 to F32 + for (int i = 0; i < self->stateInp.n_samples; i++) { + self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f; + } - // run the model - struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + // run the model + struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - params.print_realtime = true; - params.print_progress = false; - params.print_timestamps = true; - params.print_special = false; - params.translate = false; - params.language = "en"; - params.n_threads = 4; - params.offset_ms = 0; + // get maximum number of threads on this device (max 8) + const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]); - CFTimeInterval startTime = CACurrentMediaTime(); + params.print_realtime = true; + params.print_progress = false; + params.print_timestamps = true; + params.print_special = false; + params.translate = false; + params.language = "en"; + params.n_threads = max_threads; + params.offset_ms = 0; + params.single_segment = self->stateInp.isRealtime; - if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) { - NSLog(@"Failed to run the model"); - _textviewResult.text = @"Failed to run the model"; + CFTimeInterval startTime = CACurrentMediaTime(); - return; - } + whisper_reset_timings(self->stateInp.ctx); - CFTimeInterval endTime = CACurrentMediaTime(); + if 
(whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) { + NSLog(@"Failed to run the model"); + self->_textviewResult.text = @"Failed to run the model"; - // clear the text in the textview - _textviewResult.text = @""; + return; + } - int n_segments = whisper_full_n_segments(stateInp.ctx); - for (int i = 0; i < n_segments; i++) { - const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i); + whisper_print_timings(self->stateInp.ctx); - // append the text to the textview - _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]]; - } + CFTimeInterval endTime = CACurrentMediaTime(); + + NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads); - // internal model timing - whisper_print_timings(stateInp.ctx); + // result text + NSString *result = @""; - NSLog(@"\nProcessing time: %5.3f", endTime - startTime); + int n_segments = whisper_full_n_segments(self->stateInp.ctx); + for (int i = 0; i < n_segments; i++) { + const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i); - _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]]; + // append the text to the result + result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]]; + } + + // append processing time + result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]]; + + // dispatch the result to the main thread + dispatch_async(dispatch_get_main_queue(), ^{ + self->_textviewResult.text = result; + self->stateInp.isTranscribing = false; + }); + }); } // -// Callback implmentation +// Callback implementation // void AudioInputCallback(void * inUserData, @@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData, if (stateInp->n_samples + n > 
MAX_AUDIO_SEC*SAMPLE_RATE) { NSLog(@"Too much audio data, ignoring"); + + dispatch_async(dispatch_get_main_queue(), ^{ + ViewController * vc = (__bridge ViewController *)(stateInp->vc); + [vc stopCapturing]; + }); + return; } @@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData, // put the buffer back in the queue AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL); + + if (stateInp->isRealtime) { + // dispatch onTranscribe() to the main thread + dispatch_async(dispatch_get_main_queue(), ^{ + ViewController * vc = (__bridge ViewController *)(stateInp->vc); + [vc onTranscribe:nil]; + }); + } } @end diff --git a/whisper.cpp b/whisper.cpp index 9e27ab1..2daf411 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) { ctx->t_decode_us = 0; } +const char * whisper_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + + return s.c_str(); +} + //////////////////////////////////////////////////////////////////////////// struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) { @@ -2863,7 +2878,7 @@ int whisper_full_parallel( struct whisper_full_params params, const float * samples, int n_samples, - const int n_processors) { + int n_processors) { if (n_processors == 1) { return whisper_full(ctx, params, samples, n_samples); } @@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int return ctx->result_all[i_segment].tokens[i_token].p; } 
-const char * whisper_print_system_info(void) { - static std::string s; - - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - - return s.c_str(); -} - // ================================================================================================= // diff --git a/whisper.h b/whisper.h index b0fb2d9..4b5fbcc 100644 --- a/whisper.h +++ b/whisper.h @@ -72,16 +72,16 @@ extern "C" { whisper_token id; // token id whisper_token tid; // forced timestamp token id - float p; // probability of the token - float pt; // probability of the timestamp token - float ptsum; // sum of probabilities of all timestamp tokens + float p; // probability of the token + float pt; // probability of the timestamp token + float ptsum; // sum of probabilities of all timestamp tokens // token-level timestamp data // do not use if you haven't computed token-level timestamps - int64_t t0; // start time of the token - int64_t t1; // end time of the token + int64_t t0; // start time of the token + int64_t t1; // end time of the token - float vlen; // voice length of the token + float vlen; // voice length of the token } whisper_token_data; // Allocates all memory needed for the model and loads the model from the given file. @@ -96,9 +96,9 @@ extern "C" { // Returns 0 on success WHISPER_API int whisper_pcm_to_mel( struct whisper_context * ctx, - const float * samples, - int n_samples, - int n_threads); + const float * samples, + int n_samples, + int n_threads); // This can be used to set a custom log mel spectrogram inside the provided whisper context. 
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. @@ -106,9 +106,9 @@ extern "C" { // Returns 0 on success WHISPER_API int whisper_set_mel( struct whisper_context * ctx, - const float * data, - int n_len, - int n_mel); + const float * data, + int n_len, + int n_mel); // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context. // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first. @@ -116,8 +116,8 @@ extern "C" { // Returns 0 on success WHISPER_API int whisper_encode( struct whisper_context * ctx, - int offset, - int n_threads); + int offset, + int n_threads); // Run the Whisper decoder to obtain the logits and probabilities for the next token. // Make sure to call whisper_encode() first. @@ -126,10 +126,10 @@ extern "C" { // Returns 0 on success WHISPER_API int whisper_decode( struct whisper_context * ctx, - const whisper_token * tokens, - int n_tokens, - int n_past, - int n_threads); + const whisper_token * tokens, + int n_tokens, + int n_past, + int n_threads); // Token sampling methods. // These are provided for convenience and can be used after each call to whisper_decode(). 
@@ -169,6 +169,9 @@ extern "C" { WHISPER_API void whisper_print_timings(struct whisper_context * ctx); WHISPER_API void whisper_reset_timings(struct whisper_context * ctx); + // Print system information + WHISPER_API const char * whisper_print_system_info(void); + //////////////////////////////////////////////////////////////////////////// // Available sampling strategies @@ -187,12 +190,12 @@ extern "C" { int n_threads; int n_max_text_ctx; - int offset_ms; // start offset in ms - int duration_ms; // audio duration to process in ms + int offset_ms; // start offset in ms + int duration_ms; // audio duration to process in ms bool translate; bool no_context; - bool single_segment; // force single segment output (useful for streaming) + bool single_segment; // force single segment output (useful for streaming) bool print_special; bool print_progress; bool print_realtime; @@ -206,8 +209,8 @@ extern "C" { int max_tokens; // max tokens per segment (0 = no limit) // [EXPERIMENTAL] speed-up techniques - bool speed_up; // speed-up the audio by 2x using Phase Vocoder - int audio_ctx; // overwrite the audio context size (0 = use default) + bool speed_up; // speed-up the audio by 2x using Phase Vocoder + int audio_ctx; // overwrite the audio context size (0 = use default) // tokens to provide the whisper model as initial prompt // these are prepended to any existing text context from a previous call @@ -235,20 +238,20 @@ extern "C" { // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text // Uses the specified decoding strategy to obtain the text. WHISPER_API int whisper_full( - struct whisper_context * ctx, - struct whisper_full_params params, - const float * samples, - int n_samples); + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples); // Split the input audio in chunks and process each chunk separately using whisper_full() // It seems this approach can offer some speedup in some cases. 
// However, the transcription accuracy can be worse at the beginning and end of each chunk. WHISPER_API int whisper_full_parallel( - struct whisper_context * ctx, - struct whisper_full_params params, - const float * samples, - int n_samples, - const int n_processors); + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples, + int n_processors); // Number of generated text segments. // A segment can be a few words, a sentence, or even a paragraph. @@ -275,9 +278,6 @@ extern "C" { // Get the probability of the specified token in the specified segment. WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token); - // Print system information - WHISPER_API const char * whisper_print_system_info(void); - #ifdef __cplusplus } #endif