whisper.objc : add real-time processing (#97)

Similar to the "stream" app
2025-06-22 00:13:35 +00:00 · 2022-11-26 17:28:28 +02:00
parent c207eed431
commit e266cb0723
5 changed files with 169 additions and 99 deletions
--- a/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
+++ b/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
    <device id="retina6_0" orientation="portrait" appearance="light"/>
    <dependencies>
-        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
        <capability name="System colors in document resources" minToolsVersion="11.0"/>
        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@ -40,7 +40,7 @@
                                <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
                                <color key="backgroundColor" systemColor="systemBackgroundColor"/>
                                <color key="textColor" systemColor="labelColor"/>
-                                <fontDescription key="fontDescription" type="system" pointSize="20"/>
+                                <fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
                            </textView>
                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@ -56,6 +56,18 @@
                                    <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
                                </connections>
                            </button>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
+                                <rect key="frame" x="199" y="191" width="156" height="49"/>
+                                <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                <color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
+                                <color key="tintColor" systemColor="opaqueSeparatorColor"/>
+                                <state key="normal" title="Real-time">
+                                    <color key="titleColor" systemColor="labelColor"/>
+                                </state>
+                                <connections>
+                                    <action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
+                                </connections>
+                            </button>
                        </subviews>
                        <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
                        <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@ -64,6 +76,7 @@
                        </constraints>
                    </view>
                    <connections>
+                        <outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
                        <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
                        <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
                        <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
--- a/examples/whisper.objc/whisper.objc/ViewController.h
+++ b/examples/whisper.objc/whisper.objc/ViewController.h
@ -20,6 +20,8 @@ typedef struct
 {
    int ggwaveId;
    bool isCapturing;
+    bool isTranscribing;
+    bool isRealtime;
    UILabel * labelReceived;

    AudioQueueRef queue;
@ -31,6 +33,8 @@ typedef struct
    float   * audioBufferF32;

    struct whisper_context * ctx;
+
+    void * vc;
 } StateInp;

@interface ViewController : UIViewController
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
@property (weak, nonatomic) IBOutlet UILabel    *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton   *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton   *buttonTranscribe;
+@property (weak, nonatomic) IBOutlet UIButton   *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;

@end
@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
    }
+
+    stateInp.isTranscribing = false;
+    stateInp.isRealtime = false;
 }

 -(IBAction) stopCapturing {
@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
    NSLog(@"Start capturing");

    stateInp.n_samples = 0;
+    stateInp.vc = (__bridge void *)(self);

    OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                         AudioInputCallback,
@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
 - (IBAction)onTranscribePrepare:(id)sender {
    _textviewResult.text = @"Processing - please wait ...";

-    if (stateInp.isCapturing) {
-        // stop capturing
-        [self stopCapturing];
+    if (stateInp.isRealtime) {
+        [self onRealtime:(id)sender];
+    }

-        return;
+    if (stateInp.isCapturing) {
+        [self stopCapturing];
    }
 }

+// Flips real-time mode and recolors the "Real-time" button to show the new state.
+- (IBAction)onRealtime:(id)sender {
+    const bool enabled = !stateInp.isRealtime;
+    stateInp.isRealtime = enabled;
+
+    // green while real-time mode is on, gray while it is off
+    UIColor *tint = enabled ? [UIColor greenColor] : [UIColor grayColor];
+    _buttonRealtime.backgroundColor = tint;
+
+    NSLog(@"Realtime: %@", enabled ? @"ON" : @"OFF");
+}
+
 - (IBAction)onTranscribe:(id)sender {
+    // only one transcription may run at a time; requests made while busy are dropped
+    if (stateInp.isTranscribing) {
+        return;
+    }

    NSLog(@"Processing %d samples", stateInp.n_samples);

+    stateInp.isTranscribing = true;
+
+    // snapshot the sample count once: in real-time mode capture keeps appending to the
+    // buffer while the model runs, so re-reading n_samples later could hand whisper_full()
+    // more samples than the loop below has converted to F32
+    const int n_samples = stateInp.n_samples;
+
+    // dispatch the model to a background thread
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        // process captured audio
        // convert I16 to F32
-    for (int i = 0; i < stateInp.n_samples; i++) {
-        stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
+        for (int i = 0; i < n_samples; i++) {
+            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }

        // run the model
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

+        // get maximum number of threads on this device (max 8)
+        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
+
        params.print_realtime   = true;
        params.print_progress   = false;
        params.print_timestamps = true;
        params.print_special    = false;
        params.translate        = false;
        params.language         = "en";
-    params.n_threads        = 4;
+        params.n_threads        = max_threads;
        params.offset_ms        = 0;
+        params.single_segment   = self->stateInp.isRealtime;

        CFTimeInterval startTime = CACurrentMediaTime();

-    if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
+        whisper_reset_timings(self->stateInp.ctx);
+
+        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, n_samples) != 0) {
            NSLog(@"Failed to run the model");
-        _textviewResult.text = @"Failed to run the model";
+
+            // report the failure on the main thread (UIKit views must not be touched from a
+            // background queue) and clear the busy flag there as well, otherwise every later
+            // call would be rejected by the isTranscribing guard above
+            dispatch_async(dispatch_get_main_queue(), ^{
+                self->_textviewResult.text = @"Failed to run the model";
+                self->stateInp.isTranscribing = false;
+            });

            return;
        }

+        whisper_print_timings(self->stateInp.ctx);
+
        CFTimeInterval endTime = CACurrentMediaTime();

-    // clear the text in the textview
-    _textviewResult.text = @"";
+        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);

-    int n_segments = whisper_full_n_segments(stateInp.ctx);
+        // result text
+        NSString *result = @"";
+
+        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
        for (int i = 0; i < n_segments; i++) {
-        const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
+            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);

-        // append the text to the textview
-        _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+            // append the text to the result
+            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
        }

-    // internal model timing
-    whisper_print_timings(stateInp.ctx);
+        // append processing time
+        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];

-    NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
-
-    _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+        // dispatch the result to the main thread
+        dispatch_async(dispatch_get_main_queue(), ^{
+            self->_textviewResult.text = result;
+            self->stateInp.isTranscribing = false;
+        });
+    });
 }

 //
-// Callback implmentation
+// Callback implementation
 //

 void AudioInputCallback(void * inUserData,
@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,

    if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
        NSLog(@"Too much audio data, ignoring");
+
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+            [vc stopCapturing];
+        });
+
        return;
    }

@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,

    // put the buffer back in the queue
    AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
+
+    if (stateInp->isRealtime) {
+        // dispatch onTranscribe() to the main thread
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+            [vc onTranscribe:nil];
+        });
+    }
 }

@end
--- a/whisper.cpp
+++ b/whisper.cpp
@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
    ctx->t_decode_us = 0;
 }

+const char * whisper_print_system_info(void) {
+    static std::string s;
+
+    s  = "";
+    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
+    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
+    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
+    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
+    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+
+    return s.c_str();
+}
+
 ////////////////////////////////////////////////////////////////////////////

 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2863,7 +2878,7 @@ int whisper_full_parallel(
        struct whisper_full_params params,
        const float * samples,
        int n_samples,
-        const int n_processors) {
+        int n_processors) {
    if (n_processors == 1) {
        return whisper_full(ctx, params, samples, n_samples);
    }
@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
    return ctx->result_all[i_segment].tokens[i_token].p;
 }

-const char * whisper_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-
-    return s.c_str();
-}
-
 // =================================================================================================

 //
--- a/whisper.h
+++ b/whisper.h
@ -169,6 +169,9 @@ extern "C" {
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

+    // Print system information
+    WHISPER_API const char * whisper_print_system_info(void);
+
    ////////////////////////////////////////////////////////////////////////////

    // Available sampling strategies
@ -248,7 +251,7 @@ extern "C" {
            struct whisper_full_params   params,
                           const float * samples,
                                   int   n_samples,
-            const int n_processors);
+                                   int   n_processors);

    // Number of generated text segments.
    // A segment can be a few words, a sentence, or even a paragraph.
@ -275,9 +278,6 @@ extern "C" {
    // Get the probability of the specified token in the specified segment.
    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

-    // Print system information
-    WHISPER_API const char * whisper_print_system_info(void);
-
 #ifdef __cplusplus
 }
 #endif