mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-02 16:53:08 +00:00
Some checks are pending
CI / ubuntu-22 (linux/amd64) (push) Waiting to run
CI / ubuntu-22 (linux/ppc64le) (push) Waiting to run
CI / ubuntu-22-arm64 (linux/arm64) (push) Waiting to run
CI / ubuntu-22-arm-v7 (linux/arm/v7) (push) Waiting to run
CI / macOS-latest (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest (generic/platform=tvOS) (push) Waiting to run
CI / ubuntu-22-gcc (linux/amd64, Debug) (push) Waiting to run
CI / ubuntu-22-gcc (linux/amd64, Release) (push) Waiting to run
CI / ubuntu-22-gcc (linux/ppc64le, Debug) (push) Waiting to run
CI / ubuntu-22-gcc (linux/ppc64le, Release) (push) Waiting to run
CI / ubuntu-22-gcc-arm64 (linux/arm64, Debug) (push) Waiting to run
CI / ubuntu-22-gcc-arm64 (linux/arm64, Release) (push) Waiting to run
CI / ubuntu-22-gcc-arm-v7 (linux/arm/v7, Debug) (push) Waiting to run
CI / ubuntu-22-gcc-arm-v7 (linux/arm/v7, Release) (push) Waiting to run
CI / ubuntu-22-clang (linux/amd64, Debug) (push) Waiting to run
CI / ubuntu-22-clang (linux/amd64, Release) (push) Waiting to run
CI / ubuntu-22-clang (linux/arm64, Debug) (push) Waiting to run
CI / ubuntu-22-clang (linux/arm64, Release) (push) Waiting to run
CI / ubuntu-22-clang (linux/ppc64le, Debug) (push) Waiting to run
CI / ubuntu-22-clang (linux/ppc64le, Release) (push) Waiting to run
CI / ubuntu-22-gcc-sanitized (linux/amd64, ADDRESS) (push) Waiting to run
CI / ubuntu-22-gcc-sanitized (linux/amd64, THREAD) (push) Waiting to run
CI / ubuntu-22-gcc-sanitized (linux/amd64, UNDEFINED) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/amd64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/arm/v7, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/arm64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/ppc64le, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/amd64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm/v7, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/ppc64le, icx, icpx, ON) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows (Win32, Release, win32-x86, x86, 2.28.5, ON) (push) Waiting to run
CI / windows (x64, Release, win32-x86-64, x64, 2.28.5, ON) (push) Waiting to run
CI / windows-blas (Win32, ON, Release, x86, 2.28.5, ON) (push) Waiting to run
CI / windows-blas (x64, ON, Release, x64, 2.28.5, ON) (push) Waiting to run
CI / windows-cublas (x64, Release, ON, 11.8.0, ON, 2.28.5) (push) Waiting to run
CI / windows-cublas (x64, Release, ON, 12.2.0, ON, 2.28.5) (push) Waiting to run
CI / emscripten (Release) (push) Waiting to run
CI / ios-xcode-build (Release) (push) Waiting to run
CI / android (push) Waiting to run
CI / quantize (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main.Dockerfile platform:linux/amd64 tag:main]) (push) Waiting to run
* examples : use xcframework in whisper.objc example This commit updates the whisper.objc example to use the xcframework. The motivation for this is to be consistent with the swift example and to also act as a reference for how to use the xcframework in an objc project. Resolves: https://github.com/ggerganov/whisper.cpp/issues/2881 * examples : setup audio session in viewDidLoad This commit adds the setup of the audio session in the viewDidLoad method of the ViewController.m file. This is necessary to allow the app to record audio. The motivation for this is that without this it was not possible to capture audio from the microphone. It was possible to click on the Capture button but nothing happened after that, and the button was not marked red indicating that the button could be clicked again to stop capturing. With this change it is possible to capture audio from the microphone and get it transcribed.
318 lines
9.9 KiB
Objective-C
318 lines
9.9 KiB
Objective-C
//
|
|
// ViewController.m
|
|
// whisper.objc
|
|
//
|
|
// Created by Georgi Gerganov on 23.10.22.
|
|
//
|
|
|
|
#import "ViewController.h"
|
|
#import <whisper/whisper.h>
|
|
|
|
|
|
#define NUM_BYTES_PER_BUFFER 16*1024
|
|
|
|
// callback used to process captured audio
|
|
void AudioInputCallback(void * inUserData,
|
|
AudioQueueRef inAQ,
|
|
AudioQueueBufferRef inBuffer,
|
|
const AudioTimeStamp * inStartTime,
|
|
UInt32 inNumberPacketDescriptions,
|
|
const AudioStreamPacketDescription * inPacketDescs);
|
|
|
|
// Private class extension: IBOutlet connections from the storyboard.
@interface ViewController ()

// Shows the current state ("Status: Idle" / "Status: Capturing").
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
// Starts/stops audio capture; turns red while recording.
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
// Triggers a one-shot transcription of the captured audio.
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
// Toggles realtime (streaming) transcription mode; green when enabled.
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
// Displays the transcription result and timing summary.
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;

@end
|
|
|
|
@implementation ViewController
|
|
|
|
// Fill an AudioStreamBasicDescription with the capture format whisper
// expects: mono, signed 16-bit linear PCM at WHISPER_SAMPLE_RATE.
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
    format->mFormatID         = kAudioFormatLinearPCM;
    format->mFormatFlags      = kLinearPCMFormatFlagIsSignedInteger;
    format->mSampleRate       = WHISPER_SAMPLE_RATE;

    // uncompressed PCM: one 2-byte sample per frame, one frame per packet
    format->mChannelsPerFrame = 1;
    format->mBitsPerChannel   = 16;
    format->mBytesPerFrame    = 2;
    format->mFramesPerPacket  = 1;
    format->mBytesPerPacket   = 2;

    format->mReserved         = 0;
}
|
|
|
|
// Loads the whisper model, allocates the audio buffers and configures the
// AVAudioSession for recording.
- (void)viewDidLoad {
    [super viewDidLoad];

    // whisper.cpp initialization
    {
        // load the model
        NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"ggml-base.en" ofType:@"bin"];

        // check if the model exists (pathForResource: returns nil when the
        // resource is not in the bundle, so guard before fileExistsAtPath:)
        if (modelPath == nil || ![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) {
            NSLog(@"Model file not found");
            return;
        }

        NSLog(@"Loading model from %@", modelPath);

        // create ggml context

        struct whisper_context_params cparams = whisper_context_default_params();
#if TARGET_OS_SIMULATOR
        // no usable GPU backend on the simulator - fall back to CPU
        cparams.use_gpu = false;
        NSLog(@"Running on simulator, using CPU");
#endif
        stateInp.ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);

        // check if the model was loaded successfully
        if (stateInp.ctx == NULL) {
            NSLog(@"Failed to load model");
            return;
        }
    }

    // initialize audio format and buffers
    {
        [self setupAudioFormat:&stateInp.dataFormat];

        stateInp.n_samples = 0;
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));

        // Set up the audio session so the app is permitted to record.
        // NOTE: per Cocoa convention we check the BOOL return value, not the
        // NSError out-parameter - the error object is only guaranteed to be
        // populated when the call actually fails.
        NSError *error = nil;

        if (![[AVAudioSession sharedInstance] setCategory:AVAudioSessionCategoryRecord error:&error]) {
            NSLog(@"Error setting audio session category: %@", error);
        }

        if (![[AVAudioSession sharedInstance] setActive:YES error:&error]) {
            NSLog(@"Error activating audio session: %@", error);
        }
    }

    stateInp.isTranscribing = false;
    stateInp.isRealtime = false;
}
|
|
|
|
// Stop recording: reset the UI to idle, then tear down the audio queue.
-(IBAction) stopCapturing {
    NSLog(@"Stop capturing");

    _labelStatusInp.text = @"Status: Idle";

    [_buttonToggleCapture setTitle:@"Start capturing" forState:UIControlStateNormal];
    [_buttonToggleCapture setBackgroundColor:[UIColor grayColor]];

    // clear the flag first so the capture callback starts ignoring audio
    stateInp.isCapturing = false;

    // stop synchronously, release every buffer, then dispose of the queue
    AudioQueueStop(stateInp.queue, true);
    for (int j = 0; j < NUM_BUFFERS; j++) {
        AudioQueueFreeBuffer(stateInp.queue, stateInp.buffers[j]);
    }

    AudioQueueDispose(stateInp.queue, true);
}
|
|
|
|
// Toggle audio capture on/off. Wired to the "Start/Stop capturing" button.
// When starting, creates a fresh AudioQueue input and primes it with
// empty buffers; any failure path falls through to stopCapturing.
- (IBAction)toggleCapture:(id)sender {
    if (stateInp.isCapturing) {
        // stop capturing
        [self stopCapturing];

        return;
    }

    // initiate audio capturing
    NSLog(@"Start capturing");

    stateInp.n_samples = 0;
    // unretained back-pointer so the C callback can reach this controller
    stateInp.vc = (__bridge void *)(self);

    // AudioInputCallback will be invoked on the current run loop
    OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                         AudioInputCallback,
                                         &stateInp,
                                         CFRunLoopGetCurrent(),
                                         kCFRunLoopCommonModes,
                                         0,
                                         &stateInp.queue);

    if (status == 0) {
        // prime the queue with empty buffers for the audio system to fill
        for (int i = 0; i < NUM_BUFFERS; i++) {
            AudioQueueAllocateBuffer(stateInp.queue, NUM_BYTES_PER_BUFFER, &stateInp.buffers[i]);
            AudioQueueEnqueueBuffer (stateInp.queue, stateInp.buffers[i], 0, NULL);
        }

        // mark capturing before starting so the first callback is not dropped
        stateInp.isCapturing = true;
        status = AudioQueueStart(stateInp.queue, NULL);
        if (status == 0) {
            _labelStatusInp.text = @"Status: Capturing";
            [sender setTitle:@"Stop Capturing" forState:UIControlStateNormal];
            [_buttonToggleCapture setBackgroundColor:[UIColor redColor]];
        }
    }

    if (status != 0) {
        // creation or start failed - tear down whatever was set up
        [self stopCapturing];
    }
}
|
|
|
|
// Prepare for a one-shot transcription: show a busy message, leave
// realtime mode if active, and stop any capture in progress.
- (IBAction)onTranscribePrepare:(id)sender {
    self.textviewResult.text = @"Processing - please wait ...";

    // toggling realtime off first ensures onTranscribe runs in one-shot mode
    if (stateInp.isRealtime) {
        [self onRealtime:sender];
    }

    if (stateInp.isCapturing) {
        [self stopCapturing];
    }
}
|
|
|
|
// Toggle realtime (streaming) transcription mode and update the button.
- (IBAction)onRealtime:(id)sender {
    stateInp.isRealtime = !stateInp.isRealtime;

    // green while realtime is enabled, gray otherwise
    UIColor *indicator = stateInp.isRealtime ? [UIColor greenColor] : [UIColor grayColor];
    [_buttonRealtime setBackgroundColor:indicator];

    NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}
|
|
|
|
// Run whisper on the captured audio. The heavy work runs on a background
// GCD queue; all UI updates are marshalled back to the main thread.
- (IBAction)onTranscribe:(id)sender {
    if (stateInp.isTranscribing) {
        return;
    }

    NSLog(@"Processing %d samples", stateInp.n_samples);

    stateInp.isTranscribing = true;

    // dispatch the model to a background thread
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        // process captured audio
        // convert I16 to F32 (whisper expects normalized float samples)
        for (int i = 0; i < self->stateInp.n_samples; i++) {
            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }

        // run the model
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

        // get maximum number of threads on this device (max 8)
        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

        params.print_realtime   = true;
        params.print_progress   = false;
        params.print_timestamps = true;
        params.print_special    = false;
        params.translate        = false;
        params.language         = "en";
        params.n_threads        = max_threads;
        params.offset_ms        = 0;
        params.no_context       = true;
        // realtime mode transcribes short chunks: single segment, no timestamps
        params.single_segment   = self->stateInp.isRealtime;
        params.no_timestamps    = params.single_segment;

        CFTimeInterval startTime = CACurrentMediaTime();

        whisper_reset_timings(self->stateInp.ctx);

        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
            NSLog(@"Failed to run the model");

            // FIX: update the UI on the main thread (UIKit is not thread-safe)
            // and reset isTranscribing, otherwise every subsequent transcribe
            // request would be silently ignored by the guard above
            dispatch_async(dispatch_get_main_queue(), ^{
                self->_textviewResult.text = @"Failed to run the model";
                self->stateInp.isTranscribing = false;
            });

            return;
        }

        whisper_print_timings(self->stateInp.ctx);

        CFTimeInterval endTime = CACurrentMediaTime();

        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);

        // result text
        NSString *result = @"";

        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
        for (int i = 0; i < n_segments; i++) {
            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);

            // append the text to the result
            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
        }

        const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate;

        // append processing time
        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time: %5.3f s]", tRecording]];
        result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]];

        // dispatch the result to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            self->_textviewResult.text = result;
            self->stateInp.isTranscribing = false;
        });
    });
}
|
|
|
|
//
|
|
// Callback implementation
|
|
//
|
|
|
|
// Invoked by the audio queue each time a capture buffer has been filled.
// Appends the new samples to the accumulation buffer, re-enqueues the
// buffer, and in realtime mode kicks off a transcription on the main thread.
void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs)
{
    StateInp * stateInp = (StateInp*)inUserData;

    if (!stateInp->isCapturing) {
        NSLog(@"Not capturing, ignoring audio");
        return;
    }

    // each sample is 2 bytes (signed 16-bit PCM)
    const int nNew = inBuffer->mAudioDataByteSize / 2;

    NSLog(@"Captured %d new samples", nNew);

    // stop capturing when the accumulation buffer would overflow
    if (stateInp->n_samples + nNew > MAX_AUDIO_SEC*SAMPLE_RATE) {
        NSLog(@"Too much audio data, ignoring");

        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc stopCapturing];
        });

        return;
    }

    const short * newSamples = (const short *) inBuffer->mAudioData;
    for (int k = 0; k < nNew; k++) {
        stateInp->audioBufferI16[stateInp->n_samples + k] = newSamples[k];
    }

    stateInp->n_samples += nNew;

    // put the buffer back in the queue so it can be filled again
    AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);

    if (stateInp->isRealtime) {
        // dispatch onTranscribe() to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc onTranscribe:nil];
        });
    }
}
|
|
|
|
@end
|