whisper.objc : add real-time processing (#97)

Similar to the "stream" app
2024-12-20 05:07:52 +00:00 · 2022-11-26 17:28:28 +02:00 · 2022-11-26 17:28:28 +02:00 · e266cb0723
commit e266cb0723
parent c207eed431
5 changed files with 169 additions and 99 deletions
--- a/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
+++ b/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
    <device id="retina6_0" orientation="portrait" appearance="light"/>
    <dependencies>
-        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
        <capability name="System colors in document resources" minToolsVersion="11.0"/>
        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@ -40,7 +40,7 @@
                                <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
                                <color key="backgroundColor" systemColor="systemBackgroundColor"/>
                                <color key="textColor" systemColor="labelColor"/>
-                                <fontDescription key="fontDescription" type="system" pointSize="20"/>
+                                <fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
                            </textView>
                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@ -56,6 +56,18 @@
                                    <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
                                </connections>
                            </button>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
+                                <rect key="frame" x="199" y="191" width="156" height="49"/>
+                                <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                <color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
+                                <color key="tintColor" systemColor="opaqueSeparatorColor"/>
+                                <state key="normal" title="Real-time">
+                                    <color key="titleColor" systemColor="labelColor"/>
+                                </state>
+                                <connections>
+                                    <action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
+                                </connections>
+                            </button>
                        </subviews>
                        <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
                        <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@ -64,6 +76,7 @@
                        </constraints>
                    </view>
                    <connections>
+                        <outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
                        <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
                        <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
                        <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
--- a/examples/whisper.objc/whisper.objc/ViewController.h
+++ b/examples/whisper.objc/whisper.objc/ViewController.h
@ -20,6 +20,8 @@ typedef struct
 {
    int ggwaveId;
    bool isCapturing;
+    bool isTranscribing;
+    bool isRealtime;
    UILabel * labelReceived;

    AudioQueueRef queue;
@ -31,6 +33,8 @@ typedef struct
    float   * audioBufferF32;

    struct whisper_context * ctx;
+
+    void * vc;
 } StateInp;

@interface ViewController : UIViewController
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
@property (weak, nonatomic) IBOutlet UILabel    *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton   *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton   *buttonTranscribe;
+@property (weak, nonatomic) IBOutlet UIButton   *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;

@end
@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
    }
+
+    stateInp.isTranscribing = false;
+    stateInp.isRealtime = false;
 }

 -(IBAction) stopCapturing {
@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
    NSLog(@"Start capturing");

    stateInp.n_samples = 0;
+    stateInp.vc = (__bridge void *)(self);

    OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                         AudioInputCallback,
@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
 - (IBAction)onTranscribePrepare:(id)sender {
    _textviewResult.text = @"Processing - please wait ...";

-    if (stateInp.isCapturing) {
-        // stop capturing
-        [self stopCapturing];
-
-        return;
+    if (stateInp.isRealtime) {
+        [self onRealtime:(id)sender];
    }
+
+    if (stateInp.isCapturing) {
+        [self stopCapturing];
+    }
+}
+
+- (IBAction)onRealtime:(id)sender {
+    stateInp.isRealtime = !stateInp.isRealtime;
+
+    if (stateInp.isRealtime) {
+        [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
+    } else {
+        [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
+    }
+
+    NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
 }

 - (IBAction)onTranscribe:(id)sender {
-    NSLog(@"Processing %d samples", stateInp.n_samples);
-
-    // process captured audio
-    // convert I16 to F32
-    for (int i = 0; i < stateInp.n_samples; i++) {
-        stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
-    }
-
-    // run the model
-    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-    params.print_realtime   = true;
-    params.print_progress   = false;
-    params.print_timestamps = true;
-    params.print_special    = false;
-    params.translate        = false;
-    params.language         = "en";
-    params.n_threads        = 4;
-    params.offset_ms        = 0;
-
-    CFTimeInterval startTime = CACurrentMediaTime();
-
-    if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
-        NSLog(@"Failed to run the model");
-        _textviewResult.text = @"Failed to run the model";
-
+    if (stateInp.isTranscribing) {
        return;
    }

-    CFTimeInterval endTime = CACurrentMediaTime();
+    NSLog(@"Processing %d samples", stateInp.n_samples);

-    // clear the text in the textview
-    _textviewResult.text = @"";
+    stateInp.isTranscribing = true;

-    int n_segments = whisper_full_n_segments(stateInp.ctx);
-    for (int i = 0; i < n_segments; i++) {
-        const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
+    // dispatch the model to a background thread
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        // process captured audio
+        // convert I16 to F32
+        for (int i = 0; i < self->stateInp.n_samples; i++) {
+            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
+        }

-        // append the text to the textview
-        _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
-    }
+        // run the model
+        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

-    // internal model timing
-    whisper_print_timings(stateInp.ctx);
+        // get maximum number of threads on this device (max 8)
+        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

-    NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
+        params.print_realtime   = true;
+        params.print_progress   = false;
+        params.print_timestamps = true;
+        params.print_special    = false;
+        params.translate        = false;
+        params.language         = "en";
+        params.n_threads        = max_threads;
+        params.offset_ms        = 0;
+        params.single_segment   = self->stateInp.isRealtime;

-    _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+        CFTimeInterval startTime = CACurrentMediaTime();
+
+        whisper_reset_timings(self->stateInp.ctx);
+
+        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
+            NSLog(@"Failed to run the model");
+            self->_textviewResult.text = @"Failed to run the model";
+
+            return;
+        }
+
+        whisper_print_timings(self->stateInp.ctx);
+
+        CFTimeInterval endTime = CACurrentMediaTime();
+
+        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
+
+        // result text
+        NSString *result = @"";
+
+        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
+        for (int i = 0; i < n_segments; i++) {
+            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
+
+            // append the text to the result
+            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+        }
+
+        // append processing time
+        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+
+        // dispatch the result to the main thread
+        dispatch_async(dispatch_get_main_queue(), ^{
+            self->_textviewResult.text = result;
+            self->stateInp.isTranscribing = false;
+        });
+    });
 }

 //
-// Callback implmentation
+// Callback implementation
 //

 void AudioInputCallback(void * inUserData,
@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,

    if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
        NSLog(@"Too much audio data, ignoring");
+
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+            [vc stopCapturing];
+        });
+
        return;
    }

@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,

    // put the buffer back in the queue
    AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
+
+    if (stateInp->isRealtime) {
+        // dipatch onTranscribe() to the main thread
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+            [vc onTranscribe:nil];
+        });
+    }
 }

@end
--- a/whisper.cpp
+++ b/whisper.cpp
@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
    ctx->t_decode_us = 0;
 }

+const char * whisper_print_system_info(void) {
+    static std::string s;
+
+    s  = "";
+    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
+    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
+    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
+    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
+    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+
+    return s.c_str();
+}
+
 ////////////////////////////////////////////////////////////////////////////

 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2863,7 +2878,7 @@ int whisper_full_parallel(
        struct whisper_full_params params,
        const float * samples,
        int n_samples,
-        const int n_processors) {
+        int n_processors) {
    if (n_processors == 1) {
        return whisper_full(ctx, params, samples, n_samples);
    }
@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
    return ctx->result_all[i_segment].tokens[i_token].p;
 }

-const char * whisper_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-
-    return s.c_str();
-}
-
 // =================================================================================================

 //
--- a/whisper.h
+++ b/whisper.h
@ -72,16 +72,16 @@ extern "C" {
        whisper_token id;  // token id
        whisper_token tid; // forced timestamp token id

-        float p;     // probability of the token
-        float pt;    // probability of the timestamp token
-        float ptsum; // sum of probabilities of all timestamp tokens
+        float p;           // probability of the token
+        float pt;          // probability of the timestamp token
+        float ptsum;       // sum of probabilities of all timestamp tokens

        // token-level timestamp data
        // do not use if you haven't computed token-level timestamps
-        int64_t t0; // start time of the token
-        int64_t t1; //   end time of the token
+        int64_t t0;        // start time of the token
+        int64_t t1;        //   end time of the token

-        float vlen; // voice length of the token
+        float vlen;        // voice length of the token
    } whisper_token_data;

    // Allocates all memory needed for the model and loads the model from the given file.
@ -96,9 +96,9 @@ extern "C" {
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel(
            struct whisper_context * ctx,
-            const float * samples,
-            int n_samples,
-            int n_threads);
+                       const float * samples,
+                               int   n_samples,
+                               int   n_threads);

    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@ -106,9 +106,9 @@ extern "C" {
    // Returns 0 on success
    WHISPER_API int whisper_set_mel(
            struct whisper_context * ctx,
-            const float * data,
-            int n_len,
-            int n_mel);
+                       const float * data,
+                               int   n_len,
+                               int   n_mel);

    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@ -116,8 +116,8 @@ extern "C" {
    // Returns 0 on success
    WHISPER_API int whisper_encode(
            struct whisper_context * ctx,
-            int offset,
-            int n_threads);
+                               int   offset,
+                               int   n_threads);

    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
    // Make sure to call whisper_encode() first.
@ -126,10 +126,10 @@ extern "C" {
    // Returns 0 on success
    WHISPER_API int whisper_decode(
            struct whisper_context * ctx,
-            const whisper_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+               const whisper_token * tokens,
+                               int   n_tokens,
+                               int   n_past,
+                               int   n_threads);

    // Token sampling methods.
    // These are provided for convenience and can be used after each call to whisper_decode().
@ -169,6 +169,9 @@ extern "C" {
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

+    // Print system information
+    WHISPER_API const char * whisper_print_system_info(void);
+
    ////////////////////////////////////////////////////////////////////////////

    // Available sampling strategies
@ -187,12 +190,12 @@ extern "C" {

        int n_threads;
        int n_max_text_ctx;
-        int offset_ms;      // start offset in ms
-        int duration_ms;    // audio duration to process in ms
+        int offset_ms;          // start offset in ms
+        int duration_ms;        // audio duration to process in ms

        bool translate;
        bool no_context;
-        bool single_segment; // force single segment output (useful for streaming)
+        bool single_segment;    // force single segment output (useful for streaming)
        bool print_special;
        bool print_progress;
        bool print_realtime;
@ -206,8 +209,8 @@ extern "C" {
        int   max_tokens;       // max tokens per segment (0 = no limit)

        // [EXPERIMENTAL] speed-up techniques
-        bool speed_up;  // speed-up the audio by 2x using Phase Vocoder
-        int  audio_ctx; // overwrite the audio context size (0 = use default)
+        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
+        int  audio_ctx;         // overwrite the audio context size (0 = use default)

        // tokens to provide the whisper model as initial prompt
        // these are prepended to any existing text context from a previous call
@ -235,20 +238,20 @@ extern "C" {
    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
    // Uses the specified decoding strategy to obtain the text.
    WHISPER_API int whisper_full(
-            struct whisper_context * ctx,
-            struct whisper_full_params params,
-            const float * samples,
-            int n_samples);
+                struct whisper_context * ctx,
+            struct whisper_full_params   params,
+                           const float * samples,
+                                   int   n_samples);

    // Split the input audio in chunks and process each chunk separately using whisper_full()
    // It seems this approach can offer some speedup in some cases.
    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
    WHISPER_API int whisper_full_parallel(
-            struct whisper_context * ctx,
-            struct whisper_full_params params,
-            const float * samples,
-            int n_samples,
-            const int n_processors);
+                struct whisper_context * ctx,
+            struct whisper_full_params   params,
+                           const float * samples,
+                                   int   n_samples,
+                                   int   n_processors);

    // Number of generated text segments.
    // A segment can be a few words, a sentence, or even a paragraph.
@ -275,9 +278,6 @@ extern "C" {
    // Get the probability of the specified token in the specified segment.
    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

-    // Print system information
-    WHISPER_API const char * whisper_print_system_info(void);
-
 #ifdef __cplusplus
 }
 #endif