mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-20 05:07:52 +00:00
whisper.objc : add real-time processing (#97)
Similar to the "stream" app
This commit is contained in:
parent
c207eed431
commit
e266cb0723
@ -1,8 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
|
||||
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
|
||||
<device id="retina6_0" orientation="portrait" appearance="light"/>
|
||||
<dependencies>
|
||||
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
|
||||
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
|
||||
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
|
||||
<capability name="System colors in document resources" minToolsVersion="11.0"/>
|
||||
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
|
||||
@ -40,7 +40,7 @@
|
||||
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
|
||||
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
|
||||
<color key="textColor" systemColor="labelColor"/>
|
||||
<fontDescription key="fontDescription" type="system" pointSize="20"/>
|
||||
<fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
|
||||
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
|
||||
</textView>
|
||||
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
|
||||
@ -56,6 +56,18 @@
|
||||
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
|
||||
</connections>
|
||||
</button>
|
||||
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
|
||||
<rect key="frame" x="199" y="191" width="156" height="49"/>
|
||||
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
|
||||
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
|
||||
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
|
||||
<state key="normal" title="Real-time">
|
||||
<color key="titleColor" systemColor="labelColor"/>
|
||||
</state>
|
||||
<connections>
|
||||
<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
|
||||
</connections>
|
||||
</button>
|
||||
</subviews>
|
||||
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
|
||||
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
|
||||
@ -64,6 +76,7 @@
|
||||
</constraints>
|
||||
</view>
|
||||
<connections>
|
||||
<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
|
||||
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
|
||||
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
|
||||
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
|
||||
|
@ -20,6 +20,8 @@ typedef struct
|
||||
{
|
||||
int ggwaveId;
|
||||
bool isCapturing;
|
||||
bool isTranscribing;
|
||||
bool isRealtime;
|
||||
UILabel * labelReceived;
|
||||
|
||||
AudioQueueRef queue;
|
||||
@ -31,6 +33,8 @@ typedef struct
|
||||
float * audioBufferF32;
|
||||
|
||||
struct whisper_context * ctx;
|
||||
|
||||
void * vc;
|
||||
} StateInp;
|
||||
|
||||
@interface ViewController : UIViewController
|
||||
|
@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
|
||||
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
|
||||
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
|
||||
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
|
||||
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
|
||||
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;
|
||||
|
||||
@end
|
||||
@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
|
||||
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
|
||||
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
|
||||
}
|
||||
|
||||
stateInp.isTranscribing = false;
|
||||
stateInp.isRealtime = false;
|
||||
}
|
||||
|
||||
-(IBAction) stopCapturing {
|
||||
@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
|
||||
NSLog(@"Start capturing");
|
||||
|
||||
stateInp.n_samples = 0;
|
||||
stateInp.vc = (__bridge void *)(self);
|
||||
|
||||
OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
|
||||
AudioInputCallback,
|
||||
@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
|
||||
- (IBAction)onTranscribePrepare:(id)sender {
|
||||
_textviewResult.text = @"Processing - please wait ...";
|
||||
|
||||
if (stateInp.isCapturing) {
|
||||
// stop capturing
|
||||
[self stopCapturing];
|
||||
|
||||
return;
|
||||
if (stateInp.isRealtime) {
|
||||
[self onRealtime:(id)sender];
|
||||
}
|
||||
|
||||
if (stateInp.isCapturing) {
|
||||
[self stopCapturing];
|
||||
}
|
||||
}
|
||||
|
||||
- (IBAction)onRealtime:(id)sender {
|
||||
stateInp.isRealtime = !stateInp.isRealtime;
|
||||
|
||||
if (stateInp.isRealtime) {
|
||||
[_buttonRealtime setBackgroundColor:[UIColor greenColor]];
|
||||
} else {
|
||||
[_buttonRealtime setBackgroundColor:[UIColor grayColor]];
|
||||
}
|
||||
|
||||
NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
|
||||
}
|
||||
|
||||
- (IBAction)onTranscribe:(id)sender {
|
||||
NSLog(@"Processing %d samples", stateInp.n_samples);
|
||||
|
||||
// process captured audio
|
||||
// convert I16 to F32
|
||||
for (int i = 0; i < stateInp.n_samples; i++) {
|
||||
stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
|
||||
}
|
||||
|
||||
// run the model
|
||||
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
params.print_realtime = true;
|
||||
params.print_progress = false;
|
||||
params.print_timestamps = true;
|
||||
params.print_special = false;
|
||||
params.translate = false;
|
||||
params.language = "en";
|
||||
params.n_threads = 4;
|
||||
params.offset_ms = 0;
|
||||
|
||||
CFTimeInterval startTime = CACurrentMediaTime();
|
||||
|
||||
if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
|
||||
NSLog(@"Failed to run the model");
|
||||
_textviewResult.text = @"Failed to run the model";
|
||||
|
||||
if (stateInp.isTranscribing) {
|
||||
return;
|
||||
}
|
||||
|
||||
CFTimeInterval endTime = CACurrentMediaTime();
|
||||
NSLog(@"Processing %d samples", stateInp.n_samples);
|
||||
|
||||
// clear the text in the textview
|
||||
_textviewResult.text = @"";
|
||||
stateInp.isTranscribing = true;
|
||||
|
||||
int n_segments = whisper_full_n_segments(stateInp.ctx);
|
||||
for (int i = 0; i < n_segments; i++) {
|
||||
const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
|
||||
// dispatch the model to a background thread
|
||||
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
|
||||
// process captured audio
|
||||
// convert I16 to F32
|
||||
for (int i = 0; i < self->stateInp.n_samples; i++) {
|
||||
self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
|
||||
}
|
||||
|
||||
// append the text to the textview
|
||||
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
|
||||
}
|
||||
// run the model
|
||||
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
// internal model timing
|
||||
whisper_print_timings(stateInp.ctx);
|
||||
// get maximum number of threads on this device (max 8)
|
||||
const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
|
||||
|
||||
NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
|
||||
params.print_realtime = true;
|
||||
params.print_progress = false;
|
||||
params.print_timestamps = true;
|
||||
params.print_special = false;
|
||||
params.translate = false;
|
||||
params.language = "en";
|
||||
params.n_threads = max_threads;
|
||||
params.offset_ms = 0;
|
||||
params.single_segment = self->stateInp.isRealtime;
|
||||
|
||||
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
|
||||
CFTimeInterval startTime = CACurrentMediaTime();
|
||||
|
||||
whisper_reset_timings(self->stateInp.ctx);
|
||||
|
||||
if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
|
||||
NSLog(@"Failed to run the model");
|
||||
self->_textviewResult.text = @"Failed to run the model";
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
whisper_print_timings(self->stateInp.ctx);
|
||||
|
||||
CFTimeInterval endTime = CACurrentMediaTime();
|
||||
|
||||
NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
|
||||
|
||||
// result text
|
||||
NSString *result = @"";
|
||||
|
||||
int n_segments = whisper_full_n_segments(self->stateInp.ctx);
|
||||
for (int i = 0; i < n_segments; i++) {
|
||||
const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
|
||||
|
||||
// append the text to the result
|
||||
result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
|
||||
}
|
||||
|
||||
// append processing time
|
||||
result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
|
||||
|
||||
// dispatch the result to the main thread
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
self->_textviewResult.text = result;
|
||||
self->stateInp.isTranscribing = false;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
//
|
||||
// Callback implmentation
|
||||
// Callback implementation
|
||||
//
|
||||
|
||||
void AudioInputCallback(void * inUserData,
|
||||
@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,
|
||||
|
||||
if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
|
||||
NSLog(@"Too much audio data, ignoring");
|
||||
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
|
||||
[vc stopCapturing];
|
||||
});
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,
|
||||
|
||||
// put the buffer back in the queue
|
||||
AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
|
||||
|
||||
if (stateInp->isRealtime) {
|
||||
// dispatch onTranscribe() to the main thread
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
|
||||
[vc onTranscribe:nil];
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@end
|
||||
|
32
whisper.cpp
32
whisper.cpp
@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
|
||||
ctx->t_decode_us = 0;
|
||||
}
|
||||
|
||||
const char * whisper_print_system_info(void) {
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
||||
@ -2863,7 +2878,7 @@ int whisper_full_parallel(
|
||||
struct whisper_full_params params,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
const int n_processors) {
|
||||
int n_processors) {
|
||||
if (n_processors == 1) {
|
||||
return whisper_full(ctx, params, samples, n_samples);
|
||||
}
|
||||
@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
|
||||
return ctx->result_all[i_segment].tokens[i_token].p;
|
||||
}
|
||||
|
||||
const char * whisper_print_system_info(void) {
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
//
|
||||
|
70
whisper.h
70
whisper.h
@ -72,16 +72,16 @@ extern "C" {
|
||||
whisper_token id; // token id
|
||||
whisper_token tid; // forced timestamp token id
|
||||
|
||||
float p; // probability of the token
|
||||
float pt; // probability of the timestamp token
|
||||
float ptsum; // sum of probabilities of all timestamp tokens
|
||||
float p; // probability of the token
|
||||
float pt; // probability of the timestamp token
|
||||
float ptsum; // sum of probabilities of all timestamp tokens
|
||||
|
||||
// token-level timestamp data
|
||||
// do not use if you haven't computed token-level timestamps
|
||||
int64_t t0; // start time of the token
|
||||
int64_t t1; // end time of the token
|
||||
int64_t t0; // start time of the token
|
||||
int64_t t1; // end time of the token
|
||||
|
||||
float vlen; // voice length of the token
|
||||
float vlen; // voice length of the token
|
||||
} whisper_token_data;
|
||||
|
||||
// Allocates all memory needed for the model and loads the model from the given file.
|
||||
@ -96,9 +96,9 @@ extern "C" {
|
||||
// Returns 0 on success
|
||||
WHISPER_API int whisper_pcm_to_mel(
|
||||
struct whisper_context * ctx,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
int n_threads);
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
int n_threads);
|
||||
|
||||
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
|
||||
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
||||
@ -106,9 +106,9 @@ extern "C" {
|
||||
// Returns 0 on success
|
||||
WHISPER_API int whisper_set_mel(
|
||||
struct whisper_context * ctx,
|
||||
const float * data,
|
||||
int n_len,
|
||||
int n_mel);
|
||||
const float * data,
|
||||
int n_len,
|
||||
int n_mel);
|
||||
|
||||
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
|
||||
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
@ -116,8 +116,8 @@ extern "C" {
|
||||
// Returns 0 on success
|
||||
WHISPER_API int whisper_encode(
|
||||
struct whisper_context * ctx,
|
||||
int offset,
|
||||
int n_threads);
|
||||
int offset,
|
||||
int n_threads);
|
||||
|
||||
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
||||
// Make sure to call whisper_encode() first.
|
||||
@ -126,10 +126,10 @@ extern "C" {
|
||||
// Returns 0 on success
|
||||
WHISPER_API int whisper_decode(
|
||||
struct whisper_context * ctx,
|
||||
const whisper_token * tokens,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
const whisper_token * tokens,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
|
||||
// Token sampling methods.
|
||||
// These are provided for convenience and can be used after each call to whisper_decode().
|
||||
@ -169,6 +169,9 @@ extern "C" {
|
||||
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
||||
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
|
||||
|
||||
// Print system information
|
||||
WHISPER_API const char * whisper_print_system_info(void);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Available sampling strategies
|
||||
@ -187,12 +190,12 @@ extern "C" {
|
||||
|
||||
int n_threads;
|
||||
int n_max_text_ctx;
|
||||
int offset_ms; // start offset in ms
|
||||
int duration_ms; // audio duration to process in ms
|
||||
int offset_ms; // start offset in ms
|
||||
int duration_ms; // audio duration to process in ms
|
||||
|
||||
bool translate;
|
||||
bool no_context;
|
||||
bool single_segment; // force single segment output (useful for streaming)
|
||||
bool single_segment; // force single segment output (useful for streaming)
|
||||
bool print_special;
|
||||
bool print_progress;
|
||||
bool print_realtime;
|
||||
@ -206,8 +209,8 @@ extern "C" {
|
||||
int max_tokens; // max tokens per segment (0 = no limit)
|
||||
|
||||
// [EXPERIMENTAL] speed-up techniques
|
||||
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
||||
int audio_ctx; // overwrite the audio context size (0 = use default)
|
||||
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
||||
int audio_ctx; // overwrite the audio context size (0 = use default)
|
||||
|
||||
// tokens to provide the whisper model as initial prompt
|
||||
// these are prepended to any existing text context from a previous call
|
||||
@ -235,20 +238,20 @@ extern "C" {
|
||||
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
// Uses the specified decoding strategy to obtain the text.
|
||||
WHISPER_API int whisper_full(
|
||||
struct whisper_context * ctx,
|
||||
struct whisper_full_params params,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
struct whisper_context * ctx,
|
||||
struct whisper_full_params params,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using whisper_full()
|
||||
// It seems this approach can offer some speedup in some cases.
|
||||
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
WHISPER_API int whisper_full_parallel(
|
||||
struct whisper_context * ctx,
|
||||
struct whisper_full_params params,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
const int n_processors);
|
||||
struct whisper_context * ctx,
|
||||
struct whisper_full_params params,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
int n_processors);
|
||||
|
||||
// Number of generated text segments.
|
||||
// A segment can be a few words, a sentence, or even a paragraph.
|
||||
@ -275,9 +278,6 @@ extern "C" {
|
||||
// Get the probability of the specified token in the specified segment.
|
||||
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
||||
|
||||
// Print system information
|
||||
WHISPER_API const char * whisper_print_system_info(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user