whisper.cpp/examples/whisper.objc/whisper.objc/ViewController.m

//
// ViewController.m
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import "ViewController.h"
#import "whisper.h"
#define NUM_BYTES_PER_BUFFER 16*1024
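// 16*1024 bytes per buffer = 8192 int16_t samples, i.e. roughly 0.5 s of
// audio per callback at the 16 kHz capture rate configured below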
// callback used to process captured audio
void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs);
@interface ViewController ()
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;
@end
@implementation ViewController
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
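    // whisper.cpp expects 16 kHz (WHISPER_SAMPLE_RATE), mono, 16-bit signed PCM,
    // so capture is configured to match and no resampling is needed later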
    format->mSampleRate       = WHISPER_SAMPLE_RATE;
    format->mFormatID         = kAudioFormatLinearPCM;
    format->mFramesPerPacket  = 1;
    format->mChannelsPerFrame = 1;
    format->mBytesPerFrame    = 2;
    format->mBytesPerPacket   = 2;
    format->mBitsPerChannel   = 16;
    format->mReserved         = 0;
    format->mFormatFlags      = kLinearPCMFormatFlagIsSignedInteger;
}
- (void)viewDidLoad {
    [super viewDidLoad];

    // whisper.cpp initialization
    {
        // load the model
        NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"ggml-base.en" ofType:@"bin"];

        // check if the model exists
        if (![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) {
            NSLog(@"Model file not found");
            return;
        }
        NSLog(@"Loading model from %@", modelPath);

        // create ggml context
        stateInp.ctx = whisper_init_from_file([modelPath UTF8String]);

        // check if the model was loaded successfully
        if (stateInp.ctx == NULL) {
            NSLog(@"Failed to load model");
            return;
        }
    }

    // initialize audio format and buffers
    {
        [self setupAudioFormat:&stateInp.dataFormat];

        stateInp.n_samples = 0;
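        // pre-allocate room for up to MAX_AUDIO_SEC seconds of audio:
        // the raw int16 capture buffer and the float buffer fed to whisper_full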
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
    }

    stateInp.isTranscribing = false;
    stateInp.isRealtime = false;
}
-(IBAction) stopCapturing {
NSLog(@"Stop capturing");
_labelStatusInp.text = @"Status: Idle";
[_buttonToggleCapture setTitle:@"Start capturing" forState:UIControlStateNormal];
[_buttonToggleCapture setBackgroundColor:[UIColor grayColor]];
stateInp.isCapturing = false;
AudioQueueStop(stateInp.queue, true);
for (int i = 0; i < NUM_BUFFERS; i++) {
AudioQueueFreeBuffer(stateInp.queue, stateInp.buffers[i]);
}
AudioQueueDispose(stateInp.queue, true);
}
- (IBAction)toggleCapture:(id)sender {
    if (stateInp.isCapturing) {
        // stop capturing
        [self stopCapturing];
        return;
    }

    // initiate audio capturing
    NSLog(@"Start capturing");

    stateInp.n_samples = 0;
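    // stash a plain pointer to self so the C audio callback below can
    // reach this view controller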
    stateInp.vc = (__bridge void *)(self);

    OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                         AudioInputCallback,
                                         &stateInp,
                                         CFRunLoopGetCurrent(),
                                         kCFRunLoopCommonModes,
                                         0,
                                         &stateInp.queue);
    if (status == 0) {
        for (int i = 0; i < NUM_BUFFERS; i++) {
            AudioQueueAllocateBuffer(stateInp.queue, NUM_BYTES_PER_BUFFER, &stateInp.buffers[i]);
            AudioQueueEnqueueBuffer (stateInp.queue, stateInp.buffers[i], 0, NULL);
        }

        stateInp.isCapturing = true;
        status = AudioQueueStart(stateInp.queue, NULL);
        if (status == 0) {
            _labelStatusInp.text = @"Status: Capturing";
            [sender setTitle:@"Stop Capturing" forState:UIControlStateNormal];
            [_buttonToggleCapture setBackgroundColor:[UIColor redColor]];
        }
    }

    if (status != 0) {
        [self stopCapturing];
    }
}
- (IBAction)onTranscribePrepare:(id)sender {
    _textviewResult.text = @"Processing - please wait ...";

    if (stateInp.isRealtime) {
        [self onRealtime:(id)sender];
    }

    if (stateInp.isCapturing) {
        [self stopCapturing];
    }
}
- (IBAction)onRealtime:(id)sender {
    stateInp.isRealtime = !stateInp.isRealtime;

    if (stateInp.isRealtime) {
        [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
    } else {
        [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
    }

    NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}
- (IBAction)onTranscribe:(id)sender {
    if (stateInp.isTranscribing) {
        return;
    }

    NSLog(@"Processing %d samples", stateInp.n_samples);

    stateInp.isTranscribing = true;

    // dispatch the model to a background thread
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        // process captured audio
        // convert I16 to F32
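        // scale by 1/32768 to map the int16 range onto the [-1, 1) floats
        // that whisper_full expects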
        for (int i = 0; i < self->stateInp.n_samples; i++) {
            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }

        // run the model
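        // greedy sampling picks the most likely token at each step - the
        // fastest of whisper.cpp's decoding strategies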
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

        // get maximum number of threads on this device (max 8)
        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

        params.print_realtime   = true;
        params.print_progress   = false;
        params.print_timestamps = true;
        params.print_special    = false;
        params.translate        = false;
        params.language         = "en";
        params.n_threads        = max_threads;
        params.offset_ms        = 0;
        params.no_context       = true;
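        // in realtime mode, force the output into a single segment so each
        // pass over the growing buffer yields one continuous line of text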
        params.single_segment   = self->stateInp.isRealtime;

        CFTimeInterval startTime = CACurrentMediaTime();

        whisper_reset_timings(self->stateInp.ctx);
        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
            NSLog(@"Failed to run the model");
            // report the failure on the main thread - UIKit must not be touched
            // from this background queue - and reset the transcribing flag
            dispatch_async(dispatch_get_main_queue(), ^{
                self->_textviewResult.text = @"Failed to run the model";
                self->stateInp.isTranscribing = false;
            });
            return;
        }
        whisper_print_timings(self->stateInp.ctx);

        CFTimeInterval endTime = CACurrentMediaTime();

        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);

        // result text
        NSString *result = @"";

        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
        for (int i = 0; i < n_segments; i++) {
            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);

            // append the text to the result
            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
        }

        const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate;

        // append recording and processing times
        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time:  %5.3f s]", tRecording]];
        result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]];

        // dispatch the result to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            self->_textviewResult.text = result;
            self->stateInp.isTranscribing = false;
        });
    });
}
//
// Callback implementation
//
void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs)
{
    StateInp * stateInp = (StateInp*)inUserData;

    if (!stateInp->isCapturing) {
        NSLog(@"Not capturing, ignoring audio");
        return;
    }
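    // mAudioDataByteSize is in bytes; two bytes per int16_t sample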
    const int n = inBuffer->mAudioDataByteSize / 2;

    NSLog(@"Captured %d new samples", n);

    if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
        NSLog(@"Too much audio data, ignoring");

        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc stopCapturing];
        });

        return;
    }

    for (int i = 0; i < n; i++) {
        stateInp->audioBufferI16[stateInp->n_samples + i] = ((short*)inBuffer->mAudioData)[i];
    }

    stateInp->n_samples += n;

    // put the buffer back in the queue
    AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
    if (stateInp->isRealtime) {
        // dispatch onTranscribe() to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc onTranscribe:nil];
        });
    }
}
@end