wip ignore

2025-06-27 10:22:03 +00:00 · 2023-02-15 19:11:12 +02:00
71 changed files with 2994 additions and 8159 deletions
--- a/.github/workflows/bindings-ruby.yml
+++ b/.github/workflows/bindings-ruby.yml
@ -1,22 +0,0 @@
-name: Bindings Tests (Ruby)
-on:
-  push:
-    paths:
-      - bindings/ruby/**
-      - whisper.h
-  pull_request:
-    paths:
-      - bindings/ruby/**
-      - whisper.h
-
-jobs:
-  ubuntu-latest:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: ruby/setup-ruby@v1
-        with:
-          ruby-version: '3.0'
-      - uses: actions/checkout@v1
-      - run: |
-          cd bindings/ruby/ext
-          ruby extconf.rb && make
--- a/.github/workflows/bindings-go.yml
+++ b/.github/workflows/bindings-go.yml
@ -1,4 +1,4 @@
-name: Bindings Tests (Go)
+name: Bindings Tests
 on:
  push:
    paths:
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,5 @@
 *.o
 *.a
-*.mlmodel
-*.mlmodelc
 .cache/
 .vs/
 .vscode/
@ -12,7 +10,6 @@ build-em/
 build-debug/
 build-release/
 build-static/
-build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

@ -34,5 +31,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

 extra/bench-gg.txt
-
-*.mlmodel*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.2.1)
+project(whisper.cpp VERSION 1.2.0)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -54,8 +54,6 @@ if (APPLE)
    option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
    option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
-
-    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@ -88,33 +86,16 @@ endif()

 find_package(Threads REQUIRED)

-# on APPLE
-if (APPLE)
-    # include Accelerate framework
-    if (NOT WHISPER_NO_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
+# on APPLE - include Accelerate framework
+if (APPLE AND NOT WHISPER_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")

-        if (ACCELERATE_FRAMEWORK)
-            message(STATUS "Accelerate framework found")
-
-            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-        else()
-            message(WARNING "Accelerate framework not found")
-        endif()
-    endif()
-
-    if (WHISPER_COREML)
-        find_library(FOUNDATION_FRAMEWORK Foundation)
-        find_library(COREML_FRAMEWORK CoreML)
-
-        if (COREML_FRAMEWORK)
-            message(STATUS "CoreML framework found")
-
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
-        else()
-            message(WARNING "CoreML framework not found")
-        endif()
+        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
    endif()
 endif()

@ -191,9 +172,7 @@ else()
            if(NOT WHISPER_NO_FMA)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
            endif()
-            if(NOT WHISPER_NO_F16C)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-            endif()
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
        endif()
    endif()
 endif()
@ -202,33 +181,6 @@ if (WHISPER_PERF)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()

-#
-# whisper.coreml - Core ML support
-#
-
-if (WHISPER_COREML)
-    set(TARGET whisper.coreml)
-
-    add_library(${TARGET}
-        coreml/whisper-encoder.h
-        coreml/whisper-encoder.mm
-        coreml/whisper-encoder-impl.h
-        coreml/whisper-encoder-impl.m
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
-
-    set_target_properties(${TARGET} PROPERTIES
-        COMPILE_FLAGS "-fobjc-arc"
-        )
-endif()
-
 #
 # whisper - this is the main library of the project
 #
@ -248,10 +200,6 @@ target_include_directories(${TARGET} PUBLIC
    .
    )

-if (WHISPER_COREML)
-    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
-endif()
-
 if (MSVC)
    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

--- a/59
+++ b/59
@ -30,16 +30,10 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

-# ref: https://github.com/ggerganov/whisper.cpp/issues/37
-ifneq ($(wildcard /usr/include/musl/*),)
-	CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
-	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
-endif
-
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -138,10 +132,6 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
-ifdef WHISPER_COREML
-	CXXFLAGS += -DWHISPER_USE_COREML
-	LDFLAGS  += -framework Foundation -framework CoreML
-endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@ -151,8 +141,6 @@ ifdef WHISPER_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS += -mcpu=native
-	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, 2, 3
@ -194,23 +182,11 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o

-ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
-else
-whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
-	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
+libwhisper.a: ggml.o whisper.o
+	$(AR) rcs libwhisper.a ggml.o whisper.o

-whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
-	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
-
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
-endif
-
-libwhisper.a: ggml.o $(WHISPER_OBJ)
-	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
-
-libwhisper.so: ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
+libwhisper.so: ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
 	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@ -221,24 +197,21 @@ clean:

 CC_SDL=`sdl2-config --cflags --libs`

-SRC_COMMON = examples/common.cpp
-SRC_COMMON_SDL = examples/common-sdl.cpp
-
-main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp  examples/talk/gpt-2.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)

-bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

 #
 # Audio samples
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -433,19 +433,6 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 ---

-## Video comparison of different models
-
-Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
-
-```java
-./extra/bench-wts.sh samples/jfk.wav
-ffplay ./samples/jfk.wav.all.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
-
---
-
 ## Benchmarks

 In order to have an objective comparison of the performance of the inference across different system configurations,
@ -466,7 +453,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:

- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -476,17 +463,12 @@ in [models](models).

 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
-  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
+- [X] .NET:
  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
-  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
-  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
+- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

 ## Examples

--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -17,9 +17,9 @@ import (
 // CONSTANTS

 const (
-	srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
-	srcExt  = ".bin"                                                      // Filename extension
-	bufSize = 1024 * 64                                                   // Size of the buffer used for downloading the model
+	srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
+	srcExt  = ".bin"                                                               // Filename extension
+	bufSize = 1024 * 64                                                            // Size of the buffer used for downloading the model
 )

 var (
--- a/bindings/go/pkg/whisper/model.go
+++ b/bindings/go/pkg/whisper/model.go
@ -94,7 +94,6 @@ func (model *model) NewContext() (Context, error) {
 	params.SetPrintRealtime(false)
 	params.SetPrintTimestamps(false)
 	params.SetThreads(runtime.NumCPU())
-	params.SetNoContext(true)

 	// Return new context
 	return newContext(model, params)
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -20,7 +20,7 @@ extern bool callEncoderBegin(void* user_data);
 // Text segment callback
 // Called on every newly generated text segment
 // Use the whisper_full_...() functions to obtain the text segments
-static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
+static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        callNewSegment(user_data, n_new);
    }
@ -29,7 +29,7 @@ static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_s
 // Encoder begin callback
 // If not NULL, called before the encoder starts
 // If it returns false, the computation is aborted
-static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
+static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        return callEncoderBegin(user_data);
    }
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.2.1",
+  "version": "1.2.0",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/bindings/ruby/ext/.gitignore
+++ b/bindings/ruby/ext/.gitignore
@ -1,7 +0,0 @@
-Makefile
-ggml.c
-ggml.h
-whisper.bundle
-whisper.cpp
-whisper.h
-dr_wav.h
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -1,21 +0,0 @@
-require 'mkmf'
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
-
-
-# need to use c++ compiler flags
-$CXXFLAGS << ' -std=c++11'
-# Set to true when building binary gems
-if enable_config('static-stdlib', false)
-  $LDFLAGS << ' -static-libgcc -static-libstdc++'
-end
-
-if enable_config('march-tune-native', false)
-  $CFLAGS << ' -march=native -mtune=native'
-  $CXXFLAGS << ' -march=native -mtune=native'
-end
-
-create_makefile('whisper')
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@ -1,426 +0,0 @@
-#include <ruby.h>
-#include "ruby_whisper.h"
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-#include <cmath>
-#include <fstream>
-#include <cstdio>
-#include <string>
-#include <thread>
-#include <vector>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define BOOL_PARAMS_SETTER(self, prop, value) \
-  ruby_whisper_params *rwp; \
-  Data_Get_Struct(self, ruby_whisper_params, rwp); \
-  if (value == Qfalse || value == Qnil) { \
-    rwp->params.prop = false; \
-  } else { \
-    rwp->params.prop = true; \
-  } \
-  return value; \
-
-#define BOOL_PARAMS_GETTER(self,  prop) \
-  ruby_whisper_params *rwp; \
-  Data_Get_Struct(self, ruby_whisper_params, rwp); \
-  if (rwp->params.prop) { \
-    return Qtrue; \
-  } else { \
-    return Qfalse; \
-  }
-
-VALUE mWhisper;
-VALUE cContext;
-VALUE cParams;
-
-static void ruby_whisper_free(ruby_whisper *rw) {
-  if (rw->context) {
-    whisper_free(rw->context);
-    rw->context = NULL;
-  }
-}
-static void ruby_whisper_params_free(ruby_whisper_params *rwp) {
-}
-
-void rb_whisper_mark(ruby_whisper *rw) {
-  // call rb_gc_mark on any ruby references in rw
-}
-
-void rb_whisper_free(ruby_whisper *rw) {
-  ruby_whisper_free(rw);
-  free(rw);
-}
-
-void rb_whisper_params_mark(ruby_whisper_params *rwp) {
-}
-
-void rb_whisper_params_free(ruby_whisper_params *rwp) {
-  ruby_whisper_params_free(rwp);
-  free(rwp);
-}
-
-static VALUE ruby_whisper_allocate(VALUE klass) {
-  ruby_whisper *rw;
-  rw = ALLOC(ruby_whisper);
-  rw->context = NULL;
-  return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
-}
-
-static VALUE ruby_whisper_params_allocate(VALUE klass) {
-  ruby_whisper_params *rwp;
-  rwp = ALLOC(ruby_whisper_params);
-  rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-  return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
-}
-
-static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
-  ruby_whisper *rw;
-  VALUE whisper_model_file_path;
-
-  // TODO: we can support init from buffer here too maybe another ruby object to expose
-  rb_scan_args(argc, argv, "01", &whisper_model_file_path);
-  Data_Get_Struct(self, ruby_whisper, rw);
-
-  if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
-    rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
-  }
-  rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
-  if (rw->context == nullptr) {
-    rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
-  }
-  return self;
-}
-
-/*
- * transcribe a single file
- * can emit to a block results
- *
- **/
-static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
-  ruby_whisper *rw;
-  ruby_whisper_params *rwp;
-  VALUE wave_file_path, blk, params;
-
-  rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
-  Data_Get_Struct(self, ruby_whisper, rw);
-  Data_Get_Struct(params, ruby_whisper_params, rwp);
-
-  if (!rb_respond_to(wave_file_path, rb_intern("to_s"))) {
-    rb_raise(rb_eRuntimeError, "Expected file path to wave file");
-  }
-
-  std::string fname_inp = StringValueCStr(wave_file_path);
-
-  std::vector<float> pcmf32; // mono-channel F32 PCM
-  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-  // WAV input - this is directly from main.cpp example
-  {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-    if (fname_inp == "-") {
-      {
-        uint8_t buf[1024];
-        while (true) {
-          const size_t n = fread(buf, 1, sizeof(buf), stdin);
-          if (n == 0) {
-            break;
-          }
-          wav_data.insert(wav_data.end(), buf, buf + n);
-        }
-      }
-
-      if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open WAV file from stdin\n");
-        return self;
-      }
-
-      fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    } else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
-      fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
-      return self;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-      fprintf(stderr, "WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
-      return self;
-    }
-
-    if (rwp->diarize && wav.channels != 2 && rwp->params.print_timestamps == false) {
-      fprintf(stderr, "WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
-      return self;
-    }
-
-    if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
-      fprintf(stderr, "WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
-      return self;
-    }
-
-    if (wav.bitsPerSample != 16) {
-      fprintf(stderr, "WAV file '%s' must be 16-bit\n", fname_inp.c_str());
-      return self;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-      for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[i])/32768.0f;
-      }
-    } else {
-      for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-      }
-    }
-
-    if (rwp->diarize) {
-      // convert to stereo, float
-      pcmf32s.resize(2);
-
-      pcmf32s[0].resize(n);
-      pcmf32s[1].resize(n);
-      for (uint64_t i = 0; i < n; i++) {
-        pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-        pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-      }
-    }
-  }
-  {
-    static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-      bool is_aborted = *(bool*)user_data;
-      return !is_aborted;
-    };
-    rwp->params.encoder_begin_callback_user_data = &is_aborted;
-  }
-
-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
-    fprintf(stderr, "failed to process audio\n");
-    return self;
-  }
-  const int n_segments = whisper_full_n_segments(rw->context);
-  VALUE output = rb_str_new2("");
-  for (int i = 0; i < n_segments; ++i) {
-    const char * text = whisper_full_get_segment_text(rw->context, i);
-    output = rb_str_concat(output, rb_str_new2(text));
-  }
-  VALUE idCall = rb_intern("call");
-  if (blk != Qnil) {
-    rb_funcall(blk, idCall, 1, output);
-  }
-  return self;
-}
-
-/*
- * params.language = "auto" | "en", etc...
- */
-static VALUE ruby_whisper_params_set_language(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (value == Qfalse || value == Qnil) {
-    rwp->params.language = "auto";
-  } else {
-    rwp->params.language = StringValueCStr(value);
-  }
-  return value;
-}
-static VALUE ruby_whisper_params_get_language(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (rwp->params.language) {
-    return rb_str_new2(rwp->params.language);
-  } else {
-    return rb_str_new2("auto");
-  }
-}
-static VALUE ruby_whisper_params_set_translate(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, translate, value)
-}
-static VALUE ruby_whisper_params_get_translate(VALUE self) {
-  BOOL_PARAMS_GETTER(self, translate)
-}
-static VALUE ruby_whisper_params_set_no_context(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, no_context, value)
-}
-static VALUE ruby_whisper_params_get_no_context(VALUE self) {
-  BOOL_PARAMS_GETTER(self, no_context)
-}
-static VALUE ruby_whisper_params_set_single_segment(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, single_segment, value)
-}
-static VALUE ruby_whisper_params_get_single_segment(VALUE self) {
-  BOOL_PARAMS_GETTER(self, single_segment)
-}
-static VALUE ruby_whisper_params_set_print_special(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_special, value)
-}
-static VALUE ruby_whisper_params_get_print_special(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_special)
-}
-static VALUE ruby_whisper_params_set_print_progress(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_progress, value)
-}
-static VALUE ruby_whisper_params_get_print_progress(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_progress)
-}
-static VALUE ruby_whisper_params_set_print_realtime(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_realtime, value)
-}
-static VALUE ruby_whisper_params_get_print_realtime(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_realtime)
-}
-static VALUE ruby_whisper_params_set_print_timestamps(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_timestamps, value)
-}
-static VALUE ruby_whisper_params_get_print_timestamps(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_timestamps)
-}
-static VALUE ruby_whisper_params_set_suppress_blank(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, suppress_blank, value)
-}
-static VALUE ruby_whisper_params_get_suppress_blank(VALUE self) {
-  BOOL_PARAMS_GETTER(self, suppress_blank)
-}
-static VALUE ruby_whisper_params_set_suppress_non_speech_tokens(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, suppress_non_speech_tokens, value)
-}
-static VALUE ruby_whisper_params_get_suppress_non_speech_tokens(VALUE self) {
-  BOOL_PARAMS_GETTER(self, suppress_non_speech_tokens)
-}
-static VALUE ruby_whisper_params_get_token_timestamps(VALUE self) {
-  BOOL_PARAMS_GETTER(self, token_timestamps)
-}
-static VALUE ruby_whisper_params_set_token_timestamps(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, token_timestamps, value)
-}
-static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
-  BOOL_PARAMS_GETTER(self, split_on_word)
-}
-static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, split_on_word, value)
-}
-static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
-  BOOL_PARAMS_GETTER(self, speed_up)
-}
-static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, speed_up, value)
-}
-static VALUE ruby_whisper_params_get_diarize(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (rwp->diarize) {
-    return Qtrue;
-  } else {
-    return Qfalse;
-  }
-}
-static VALUE ruby_whisper_params_set_diarize(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (value == Qfalse || value == Qnil) {
-    rwp->diarize = false;
-  } else {
-    rwp->diarize = true;
-  } \
-  return value;
-}
-
-static VALUE ruby_whisper_params_get_offset(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  return INT2NUM(rwp->params.offset_ms);
-}
-static VALUE ruby_whisper_params_set_offset(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  rwp->params.offset_ms = NUM2INT(value);
-  return value;
-}
-static VALUE ruby_whisper_params_get_duration(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  return INT2NUM(rwp->params.duration_ms);
-}
-static VALUE ruby_whisper_params_set_duration(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  rwp->params.duration_ms = NUM2INT(value);
-  return value;
-}
-
-static VALUE ruby_whisper_params_get_max_text_tokens(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  return INT2NUM(rwp->params.n_max_text_ctx);
-}
-static VALUE ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  rwp->params.n_max_text_ctx = NUM2INT(value);
-  return value;
-}
-
-void Init_whisper() {
-  mWhisper = rb_define_module("Whisper");
-  cContext = rb_define_class_under(mWhisper, "Context", rb_cObject);
-  cParams  = rb_define_class_under(mWhisper, "Params", rb_cObject);
-
-  rb_define_alloc_func(cContext, ruby_whisper_allocate);
-  rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
-
-  rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
-
-  rb_define_alloc_func(cParams, ruby_whisper_params_allocate);
-
-  rb_define_method(cParams, "language=", ruby_whisper_params_set_language, 1);
-  rb_define_method(cParams, "language", ruby_whisper_params_get_language, 0);
-  rb_define_method(cParams, "translate=", ruby_whisper_params_set_translate, 1);
-  rb_define_method(cParams, "translate", ruby_whisper_params_get_translate, 0);
-  rb_define_method(cParams, "no_context=", ruby_whisper_params_set_no_context, 1);
-  rb_define_method(cParams, "no_context", ruby_whisper_params_get_no_context, 0);
-  rb_define_method(cParams, "single_segment=", ruby_whisper_params_set_single_segment, 1);
-  rb_define_method(cParams, "single_segment", ruby_whisper_params_get_single_segment, 0);
-  rb_define_method(cParams, "print_special", ruby_whisper_params_get_print_special, 0);
-  rb_define_method(cParams, "print_special=", ruby_whisper_params_set_print_special, 1);
-  rb_define_method(cParams, "print_progress", ruby_whisper_params_get_print_progress, 0);
-  rb_define_method(cParams, "print_progress=", ruby_whisper_params_set_print_progress, 1);
-  rb_define_method(cParams, "print_realtime", ruby_whisper_params_get_print_realtime, 0);
-  rb_define_method(cParams, "print_realtime=", ruby_whisper_params_set_print_realtime, 1);
-  rb_define_method(cParams, "print_timestamps", ruby_whisper_params_get_print_timestamps, 0);
-  rb_define_method(cParams, "print_timestamps=", ruby_whisper_params_set_print_timestamps, 1);
-  rb_define_method(cParams, "suppress_blank", ruby_whisper_params_get_suppress_blank, 0);
-  rb_define_method(cParams, "suppress_blank=", ruby_whisper_params_set_suppress_blank, 1);
-  rb_define_method(cParams, "suppress_non_speech_tokens", ruby_whisper_params_get_suppress_non_speech_tokens, 0);
-  rb_define_method(cParams, "suppress_non_speech_tokens=", ruby_whisper_params_set_suppress_non_speech_tokens, 1);
-  rb_define_method(cParams, "token_timestamps", ruby_whisper_params_get_token_timestamps, 0);
-  rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
-  rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
-  rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
-  rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
-  rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
-  rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
-  rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
-
-  rb_define_method(cParams, "offset", ruby_whisper_params_get_offset, 0);
-  rb_define_method(cParams, "offset=", ruby_whisper_params_set_offset, 1);
-  rb_define_method(cParams, "duration", ruby_whisper_params_get_duration, 0);
-  rb_define_method(cParams, "duration=", ruby_whisper_params_set_duration, 1);
-
-  rb_define_method(cParams, "max_text_tokens", ruby_whisper_params_get_max_text_tokens, 0);
-  rb_define_method(cParams, "max_text_tokens=", ruby_whisper_params_set_max_text_tokens, 1);
-}
-#ifdef __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ruby_whisper.h
+++ b/bindings/ruby/ext/ruby_whisper.h
@ -1,15 +0,0 @@
-#ifndef __RUBY_WHISPER_H
-#define __RUBY_WHISPER_H
-
-#include "whisper.h"
-
-typedef struct {
-  struct whisper_context *context;
-} ruby_whisper;
-
-typedef struct {
-  struct whisper_full_params params;
-  bool diarize;
-} ruby_whisper_params;
-
-#endif
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@ -1,138 +0,0 @@
-TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
-EXTDIR = File.join(TOPDIR, 'ext')
-#$LIBDIR = File.join(TOPDIR, 'lib')
-#$:.unshift(LIBDIR)
-$:.unshift(EXTDIR)
-
-require 'whisper'
-require 'test/unit'
-
-class TestWhisper < Test::Unit::TestCase
-  def setup
-    @params  = Whisper::Params.new
-  end
-
-  def test_language
-    @params.language = "en"
-    assert_equal @params.language, "en"
-    @params.language = "auto"
-    assert_equal @params.language, "auto"
-  end
-
-  def test_offset
-    @params.offset = 10_000
-    assert_equal @params.offset, 10_000
-    @params.offset = 0
-    assert_equal @params.offset, 0
-  end
-
-  def test_duration
-    @params.duration = 60_000
-    assert_equal @params.duration, 60_000
-    @params.duration = 0
-    assert_equal @params.duration, 0
-  end
-
-  def test_max_text_tokens
-    @params.max_text_tokens = 300
-    assert_equal @params.max_text_tokens, 300
-    @params.max_text_tokens = 0
-    assert_equal @params.max_text_tokens, 0
-  end
-
-  def test_translate
-    @params.translate = true
-    assert @params.translate
-    @params.translate = false
-    assert !@params.translate
-  end
-
-  def test_no_context
-    @params.no_context = true
-    assert @params.no_context
-    @params.no_context = false
-    assert !@params.no_context
-  end
-
-  def test_single_segment
-    @params.single_segment = true
-    assert @params.single_segment
-    @params.single_segment = false
-    assert !@params.single_segment
-  end
-
-  def test_print_special
-    @params.print_special = true
-    assert @params.print_special
-    @params.print_special = false
-    assert !@params.print_special
-  end
-
-  def test_print_progress
-    @params.print_progress = true
-    assert @params.print_progress
-    @params.print_progress = false
-    assert !@params.print_progress
-  end
-
-  def test_print_realtime
-    @params.print_realtime = true
-    assert @params.print_realtime
-    @params.print_realtime = false
-    assert !@params.print_realtime
-  end
-
-  def test_print_timestamps
-    @params.print_timestamps = true
-    assert @params.print_timestamps
-    @params.print_timestamps = false
-    assert !@params.print_timestamps
-  end
-
-  def test_suppress_blank
-    @params.suppress_blank = true
-    assert @params.suppress_blank
-    @params.suppress_blank = false
-    assert !@params.suppress_blank
-  end
-
-  def test_suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = true
-    assert @params.suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = false
-    assert !@params.suppress_non_speech_tokens
-  end
-
-  def test_token_timestamps
-    @params.token_timestamps = true
-    assert @params.token_timestamps
-    @params.token_timestamps = false
-    assert !@params.token_timestamps
-  end
-
-  def test_split_on_word
-    @params.split_on_word = true
-    assert @params.split_on_word
-    @params.split_on_word = false
-    assert !@params.split_on_word
-  end
-
-  def test_speed_up
-    @params.speed_up = true
-    assert @params.speed_up
-    @params.speed_up = false
-    assert !@params.speed_up
-  end
-
-  def test_whisper
-    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
-    params  = Whisper::Params.new
-    params.print_timestamps = false
-
-    jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
-    @whisper.transcribe(jfk, params) {|text|
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, text
-    }
-  end
-
-end
--- a/coreml/whisper-encoder-impl.h
+++ b/coreml/whisper-encoder-impl.h
@ -1,142 +0,0 @@
-//
-// CoremlEncoder.h
-//
-// This file was automatically generated and should not be edited.
-//
-
-#import <Foundation/Foundation.h>
-#import <CoreML/CoreML.h>
-#include <stdint.h>
-#include <os/log.h>
-
-NS_ASSUME_NONNULL_BEGIN
-
-
-/// Model Prediction Input Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
-
-/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Model Prediction Output Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
-
-/// output as multidimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * output;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Class for model loading and prediction
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoder : NSObject
-@property (readonly, nonatomic, nullable) MLModel * model;
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle;
-
-/**
-    Initialize CoremlEncoder instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-*/
- (nullable instancetype)init;
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Construct CoremlEncoder instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
-
-/**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the convenience interface
-    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Batch prediction
-    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<CoremlEncoderOutput *>
-*/
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-@end
-
-NS_ASSUME_NONNULL_END
--- a/coreml/whisper-encoder-impl.m
+++ b/coreml/whisper-encoder-impl.m
@ -1,197 +0,0 @@
-//
-// CoremlEncoder.m
-//
-// This file was automatically generated and should not be edited.
-//
-
-#if !__has_feature(objc_arc)
-#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
-#endif
-
-#import "whisper-encoder-impl.h"
-
-@implementation CoremlEncoderInput
-
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
-    self = [super init];
-    if (self) {
-        _melSegment = melSegment;
-    }
-    return self;
-}
-
- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"melSegment"]];
-}
-
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"melSegment"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
-    }
-    return nil;
-}
-
-@end
-
-@implementation CoremlEncoderOutput
-
- (instancetype)initWithOutput:(MLMultiArray *)output {
-    self = [super init];
-    if (self) {
-        _output = output;
-    }
-    return self;
-}
-
- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"output"]];
-}
-
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"output"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.output];
-    }
-    return nil;
-}
-
-@end
-
-@implementation CoremlEncoder
-
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
-    return [NSURL fileURLWithPath:assetPath];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
- (instancetype)initWithMLModel:(MLModel *)model {
-    self = [super init];
-    if (!self) { return nil; }
-    _model = model;
-    if (_model == nil) { return nil; }
-    return self;
-}
-
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-*/
- (nullable instancetype)init {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
-}
-
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Construct CoremlEncoder instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
-    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
-              configuration:configuration
-          completionHandler:handler];
-}
-
-
-/**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
-    [MLModel loadContentsOfURL:modelURL
-                 configuration:configuration
-             completionHandler:^(MLModel *model, NSError *error) {
-        if (model != nil) {
-            CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
-            handler(typedModel, nil);
-        } else {
-            handler(nil, error);
-        }
-    }];
-}
-
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
-}
-
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
-    if (!outFeatures) { return nil; }
-    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
-}
-
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
-    return [self predictionFromFeatures:input_ error:error];
-}
-
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
-    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
-    if (!outBatch) { return nil; }
-    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
-    for (NSInteger i = 0; i < outBatch.count; i++) {
-        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
-        [results addObject:result];
-    }
-    return results;
-}
-
-@end
--- a/coreml/whisper-encoder.h
+++ b/coreml/whisper-encoder.h
@ -1,22 +0,0 @@
-// Wrapper of the Core ML Whisper Encoder model
-//
-// Code is derived from the work of Github user @wangchou
-// ref: https://github.com/wangchou/callCoreMLFromCpp
-
-#if __cplusplus
-extern "C" {
-#endif
-
-struct whisper_coreml_context;
-
-struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
-void whisper_coreml_free(struct whisper_coreml_context * ctx);
-
-void whisper_coreml_encode(
-        const whisper_coreml_context * ctx,
-                               float * mel,
-                               float * out);
-
-#if __cplusplus
-}
-#endif
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -1,61 +0,0 @@
-#import "coreml/whisper-encoder.h"
-#import "coreml/whisper-encoder-impl.h"
-
-#import <CoreML/CoreML.h>
-
-#include <stdlib.h>
-
-#if __cplusplus
-extern "C" {
-#endif
-
-struct whisper_coreml_context {
-    const void * data;
-};
-
-struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
-    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
-
-    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
-
-    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
-
-    if (data == NULL) {
-        return NULL;
-    }
-
-    whisper_coreml_context * ctx = new whisper_coreml_context;
-
-    ctx->data = data;
-
-    return ctx;
-}
-
-void whisper_coreml_free(struct whisper_coreml_context * ctx) {
-    CFRelease(ctx->data);
-    delete ctx;
-}
-
-void whisper_coreml_encode(
-        const whisper_coreml_context * ctx,
-                               float * mel,
-                               float * out) {
-    MLMultiArray * inMultiArray = [
-        [MLMultiArray alloc] initWithDataPointer: mel
-                                           shape: @[@1, @80, @3000]
-                                        dataType: MLMultiArrayDataTypeFloat32
-                                         strides: @[@(240000), @(3000), @1]
-                                     deallocator: nil
-                                           error: nil
-    ];
-
-    CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
-
-    MLMultiArray * outMA = outCoreML.output;
-
-    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
-}
-
-#if __cplusplus
-}
-#endif
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,37 +14,6 @@ if (WHISPER_SUPPORT_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()

-# common
-
-set(TARGET common)
-
-add_library(${TARGET} STATIC
-    common.h
-    common.cpp
-    )
-
-include(DefaultTargetOptions)
-
-set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-if (WHISPER_SUPPORT_SDL2)
-    # common-sdl
-
-    set(TARGET common-sdl)
-
-    add_library(${TARGET} STATIC
-        common-sdl.h
-        common-sdl.cpp
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
-
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@ -63,5 +32,4 @@ else()
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(talk)
-    add_subdirectory(talk.llama)
 endif()
--- a/examples/addon.node/CMakeLists.txt
+++ b/examples/addon.node/CMakeLists.txt
@ -23,7 +23,7 @@ string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
 target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
 #==================================================================

-target_link_libraries(${TARGET} ${CMAKE_JS_LIB} common whisper ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})

 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
    # Generate node.lib
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -1,13 +1,15 @@
-#include "napi.h"
-#include "common.h"
-
-#include "whisper.h"
-
+#include <cstdint>
 #include <string>
 #include <thread>
 #include <vector>
 #include <cmath>
-#include <cstdint>
+
+#include "napi.h"
+
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
+#include "whisper.h"

 struct whisper_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
@ -42,7 +44,7 @@ struct whisper_params {
    std::string model    = "../../ggml-large.bin";

    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
+    std::vector<std::string> fname_outp = {};
 };

 struct whisper_print_user_data {
@ -72,7 +74,7 @@ int timestamp_to_sample(int64_t t, int n_samples) {
    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
 }

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -141,6 +143,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
 }

 int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
+
    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        return 2;
@ -178,14 +181,91 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
+        const auto fname_outp = f < (int)params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];

        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
+        // WAV input
+        {
+            drwav wav;
+            std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+            if (fname_inp == "-") {
+                {
+                    uint8_t buf[1024];
+                    while (true)
+                    {
+                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                        if (n == 0) {
+                            break;
+                        }
+                        wav_data.insert(wav_data.end(), buf, buf + n);
+                    }
+                }
+
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
+                    return 4;
+                }
+
+                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+            }
+            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
+                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
+                return 5;
+            }
+
+           if (wav.channels != 1 && wav.channels != 2) {
+               fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
+               return 6;
+           }
+
+           if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
+               fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
+               return 6;
+           }
+
+           if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
+               fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
+               return 8;
+           }
+
+           if (wav.bitsPerSample != 16) {
+               fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str());
+               return 9;
+           }
+
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+            std::vector<int16_t> pcm16;
+            pcm16.resize(n*wav.channels);
+            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+            drwav_uninit(&wav);
+
+            // convert to mono, float
+            pcmf32.resize(n);
+            if (wav.channels == 1) {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[i])/32768.0f;
+                }
+            } else {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+                }
+            }
+
+            if (params.diarize) {
+                // convert to stereo, float
+                pcmf32s.resize(2);
+
+                pcmf32s[0].resize(n);
+                pcmf32s[1].resize(n);
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+                }
+            }
        }

        // print system information
@ -260,7 +340,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -292,64 +372,60 @ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    return 0;
 }

-class Worker : public Napi::AsyncWorker {
- public:
-  Worker(Napi::Function& callback, whisper_params params)
-      : Napi::AsyncWorker(callback), params(params) {}
-
-  void Execute() override {
-    run(params, result);
-  }
-
-  void OnOK() override {
-    Napi::HandleScope scope(Env());
-    Napi::Object res = Napi::Array::New(Env(), result.size());
-    for (uint64_t i = 0; i < result.size(); ++i) {
-      Napi::Object tmp = Napi::Array::New(Env(), 3);
-      for (uint64_t j = 0; j < 3; ++j) {
-        tmp[j] = Napi::String::New(Env(), result[i][j]);
-      }
-      res[i] = tmp;
+// N-API entry point: runs the transcription synchronously and returns the rows.
+//
+// info[0] must be an object with the string properties "language", "model" and
+// "fname_inp". The return value is an array holding one [t0, t1, content]
+// string triple per row that run() appended to `result`. A JS TypeError is
+// raised when info[0] is missing or is not an object.
+Napi::Object whisper(const Napi::CallbackInfo& info) {
+    Napi::Env env = info.Env();
+    if (info.Length() <= 0 || !info[0].IsObject()) {
+        Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
+        // ThrowAsJavaScriptException() only marks the exception as pending; it
+        // does not unwind, so bail out before info[0] is dereferenced below.
+        return Napi::Object();
    }
-    Callback().Call({Env().Null(), res});
-  }
+    whisper_params params;
+    std::vector<std::vector<std::string>> result;

- private:
-  whisper_params params;
-  std::vector<std::vector<std::string>> result;
-};
+    Napi::Object whisper_params = info[0].As<Napi::Object>();
+    std::string language = whisper_params.Get("language").As<Napi::String>();
+    std::string model = whisper_params.Get("model").As<Napi::String>();
+    std::string input = whisper_params.Get("fname_inp").As<Napi::String>();

+    params.language = language;
+    params.model = model;
+    params.fname_inp.emplace_back(input);

+    // run model
+    run(params, result);

-Napi::Value whisper(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  if (info.Length() <= 0 || !info[0].IsObject()) {
-    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
-  }
-  whisper_params params;
+    fprintf(stderr, "RESULT:\n");
+    for (const auto & sentence : result) {
+        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
+                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
+    }

-  Napi::Object whisper_params = info[0].As<Napi::Object>();
-  std::string language = whisper_params.Get("language").As<Napi::String>();
-  std::string model = whisper_params.Get("model").As<Napi::String>();
-  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
+    Napi::Object res = Napi::Array::New(env, result.size());
+    for (uint64_t i = 0; i < result.size(); ++i) {
+        Napi::Object tmp = Napi::Array::New(env, 3);
+        for (uint64_t j = 0; j < 3; ++j) {
+            tmp[j] = Napi::String::New(env, result[i][j]);
+        }
+        res[i] = tmp;
+    }

-  params.language = language;
-  params.model = model;
-  params.fname_inp.emplace_back(input);
-
-  Napi::Function callback = info[1].As<Napi::Function>();
-  Worker* worker = new Worker(callback, params);
-  worker->Queue();
-  return env.Undefined();
+    return res;
 }


 Napi::Object Init(Napi::Env env, Napi::Object exports) {
-  exports.Set(
-      Napi::String::New(env, "whisper"),
-      Napi::Function::New(env, whisper)
-  );
-  return exports;
+    exports.Set(
+            Napi::String::New(env, "whisper"),
+            Napi::Function::New(env, whisper)
+    );
+    return exports;
 }

 NODE_API_MODULE(whisper, Init);
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -1,36 +1,27 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../build/Release/whisper-addon"
-));
-const { promisify } = require("util");
-
-const whisperAsync = promisify(whisper);
+const path = require('path');
+const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));

 const whisperParams = {
-  language: "en",
-  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
-  fname_inp: "../../samples/jfk.wav",
+    language: 'en',
+    model: path.join(__dirname, '../../models/ggml-base.en.bin'),
+    fname_inp: '',
 };

+// Collect `--key=value` command-line arguments into an object; arguments that
+// do not start with "--" are ignored.
+// NOTE(review): split("=") produces more than two parts when the value itself
+// contains "=", and Object.fromEntries only uses the first two of them -
+// confirm that values containing "=" are not expected here.
 const arguments = process.argv.slice(2);
 const params = Object.fromEntries(
-  arguments.reduce((pre, item) => {
-    if (item.startsWith("--")) {
-      return [...pre, item.slice(2).split("=")];
-    }
-    return pre;
-  }, [])
+    arguments.reduce((pre, item) => {
+        if (item.startsWith("--")) {
+            return [...pre, item.slice(2).split("=")];
+        }
+        return pre;
+    }, []),
 );

+// Apply command-line overrides, but only for keys that already exist in
+// whisperParams; unknown keys are silently dropped.
 for (const key in params) {
-  if (whisperParams.hasOwnProperty(key)) {
-    whisperParams[key] = params[key];
-  }
+    if (whisperParams.hasOwnProperty(key)) {
+        whisperParams[key] = params[key];
+    }
 }

-console.log("whisperParams =", whisperParams);
-
-whisperAsync(whisperParams).then((result) => {
-  console.log(`Result from whisper: ${result}`);
-});
+console.log('whisperParams =', whisperParams);
+console.log(whisper(whisperParams));
--- a/examples/chess/CMakeLists.txt
+++ b/examples/chess/CMakeLists.txt
@ -0,0 +1,10 @@
+# chess: example that takes chess moves via voice input.
+# Only configured when SDL2 support is enabled, because chess.cpp includes the SDL headers.
+if (WHISPER_SUPPORT_SDL2)
+    set(TARGET chess)
+
+    add_executable(${TARGET}
+        chess.cpp
+        )
+
+    # resolved through CMAKE_MODULE_PATH (the project's cmake/ directory)
+    include(DefaultTargetOptions)
+
+    target_include_directories(${TARGET} PRIVATE
+        ${SDL2_INCLUDE_DIRS}
+        )
+
+    target_link_libraries(${TARGET} PRIVATE
+        common
+        whisper
+        ${SDL2_LIBRARIES}
+        ${CMAKE_THREAD_LIBS_INIT}
+        )
+endif ()
--- a/examples/chess/chess.cpp
+++ b/examples/chess/chess.cpp
@ -0,0 +1,634 @@
+// Input chess moves via voice
+//
+
+#include "common.h"
+#include "whisper.h"
+
+#include <SDL.h>
+#include <SDL_audio.h>
+
+#include <atomic>
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+#include <fstream>
+#include <mutex>
+
+// Format a timestamp given in units of 10 ms as "MM:SS.mmm".
+//
+//  500 -> 00:05.000
+//  550 -> 00:05.500
+// 6000 -> 01:00.000
+std::string to_timestamp(int64_t t) {
+    int64_t sec = t/100;
+    // the remainder is in hundredths of a second - scale it to milliseconds so
+    // that it matches the 3-digit field of the format string below
+    int64_t msec = (t - sec*100)*10;
+    int64_t min = sec/60;
+    sec = sec - min*60;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
+
+    return std::string(buf);
+}
+
+// command-line parameters
+//
+// Each field holds its default value here and is overwritten by the matching
+// option in whisper_params_parse(); main() additionally adjusts some of them
+// after parsing.
+struct whisper_params {
+    // -t, --threads: number of threads to use during computation (at most 4 by default)
+    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    // --step: audio step size in milliseconds
+    int32_t step_ms    = 3000;
+    // --length: audio length in milliseconds
+    int32_t length_ms  = 10000;
+    // --keep: audio to keep from the previous step, in milliseconds
+    int32_t keep_ms    = 200;
+    // -c, --capture: capture device ID (negative selects the default device)
+    int32_t capture_id = -1;
+    // -mt, --max-tokens: maximum number of tokens per audio chunk
+    int32_t max_tokens = 32;
+    // -ac, --audio-ctx: audio context size (0 - all)
+    int32_t audio_ctx  = 0;
+
+    // -vth, --vad-thold: voice activity detection threshold
+    float vad_thold    = 0.6f;
+    // -fth, --freq-thold: high-pass frequency cutoff
+    float freq_thold   = 100.0f;
+
+    // -tr, --translate: translate from source language to english
+    bool translate     = false;
+    // -ps, --print-special: print special tokens
+    bool print_special = false;
+    // cleared by -kc, --keep-context
+    bool no_context    = true;
+    // no command-line option; assigned in main()
+    bool no_timestamps = false;
+
+    // -l, --language: spoken language
+    std::string language  = "en";
+    // -m, --model: model path
+    std::string model     = "models/ggml-base.en.bin";
+    // -f, --file: input WAV file path
+    std::string fname_inp;
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command line into `params`.
+//
+// Prints the usage text and terminates the process on -h/--help, on an unknown
+// argument, or when an option that requires a value is the last argument.
+// Returns true whenever it returns at all.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        // value following the current option; without this check a trailing option
+        // would read argv[argc] (a null pointer) and hand it to std::stoi / std::string
+        const auto next = [&]() -> const char * {
+            if (i + 1 >= argc) {
+                fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+                whisper_print_usage(argc, argv, params);
+                exit(1);
+            }
+            return argv[++i];
+        };
+
+        if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(next()); }
+        else if (                 arg == "--step")          { params.step_ms       = std::stoi(next()); }
+        else if (                 arg == "--length")        { params.length_ms     = std::stoi(next()); }
+        else if (                 arg == "--keep")          { params.keep_ms       = std::stoi(next()); }
+        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(next()); }
+        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(next()); }
+        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(next()); }
+        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(next()); }
+        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(next()); }
+        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
+        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
+        else if (arg == "-kc"  || arg == "--keep-context")  { params.no_context    = false; }
+        else if (arg == "-l"   || arg == "--language")      { params.language      = next(); }
+        else if (arg == "-m"   || arg == "--model")         { params.model         = next(); }
+        else if (arg == "-f"   || arg == "--file")          { params.fname_inp     = next(); }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Write the option summary to stderr, showing the values currently held in
+// `params` inside the square brackets. argv[0] is used as the program name.
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
+    FILE * out = stderr;
+
+    const auto flag = [](bool on) -> const char * { return on ? "true" : "false"; };
+
+    fprintf(out, "\nusage: %s [options]\n\noptions:\n", argv[0]);
+
+    fprintf(out, "  -h,       --help          [default] show this help message and exit\n");
+
+    // integer options
+    fprintf(out, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(out, "            --step N        [%-7d] audio step size in milliseconds\n",             params.step_ms);
+    fprintf(out, "            --length N      [%-7d] audio length in milliseconds\n",                params.length_ms);
+    fprintf(out, "            --keep N        [%-7d] audio to keep from previous step in ms\n",      params.keep_ms);
+    fprintf(out, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
+    fprintf(out, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
+    fprintf(out, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
+
+    // floating-point options
+    fprintf(out, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
+    fprintf(out, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
+
+    // switches (keep-context is stored inverted, as no_context)
+    fprintf(out, "  -tr,      --translate     [%-7s] translate from source language to english\n",   flag(params.translate));
+    fprintf(out, "  -ps,      --print-special [%-7s] print special tokens\n",                        flag(params.print_special));
+    fprintf(out, "  -kc,      --keep-context  [%-7s] keep context between audio chunks\n",           flag(!params.no_context));
+
+    // string options
+    fprintf(out, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
+    fprintf(out, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                  params.model.c_str());
+    fprintf(out, "  -f FNAME, --file FNAME    [%-7s] input WAV file path\n",                         params.fname_inp.c_str());
+
+    fprintf(out, "\n");
+}
+
+//
+// SDL Audio capture
+//
+
+// Captures audio from an SDL capture device into a fixed-size circular buffer
+// of float samples that can be read back with get().
+class audio_async {
+public:
+    // len_ms: length of the circular buffer, in milliseconds
+    audio_async(int len_ms);
+    // closes the capture device if one was opened
+    ~audio_async();
+
+    // initialize SDL audio, open the capture device and size the circular buffer
+    // capture_id < 0 selects the default capture device
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    // stop capturing; returns false if there is no device or capture is already paused
+    bool pause();
+    // discard the buffered audio; returns false if there is no device or capture is not running
+    bool clear();
+
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // ms <= 0 requests the full buffer length
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    // SDL id of the opened capture device; 0 means no device
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    int m_len_ms = 0;
+    // sample rate obtained from SDL in init()
+    int m_sample_rate = 0;
+
+    // true between resume() and pause()
+    std::atomic_bool m_running;
+    // guards m_audio, m_audio_pos and m_audio_len
+    std::mutex       m_mutex;
+
+    // circular sample buffer
+    std::vector<float> m_audio;
+    // copy of the most recent chunk delivered to callback()
+    std::vector<float> m_audio_new;
+    // next write position in m_audio
+    size_t             m_audio_pos = 0;
+    // number of valid samples currently held in m_audio
+    size_t             m_audio_len = 0;
+};
+
+// Remember the requested buffer length (in ms); capture starts out stopped.
+audio_async::audio_async(int len_ms) : m_len_ms(len_ms), m_running(false) {
+}
+
+// Release the SDL capture device, if one was opened.
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initialize SDL audio and open a capture device.
+//
+// capture_id  - index of the capture device to open; a negative value opens the default device
+// sample_rate - requested sample rate; the circular buffer is sized from the rate SDL reports back
+//
+// Returns false if SDL cannot be initialized or no capture device can be opened.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // list the available capture devices so the user can pick a capture_id
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    // request mono 32-bit float samples; SDL hands each captured chunk to callback()
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    // NOTE(review): SDL_GetAudioDeviceName() is passed straight to "%s" - confirm it
+    // cannot return a null pointer for an out-of-range capture_id
+    if (capture_id >= 0) {
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        m_dev_id_in = 0;
+
+        return false;
+    } else {
+        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+                capture_spec_requested.format);
+        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+                capture_spec_requested.channels);
+        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+    }
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // room for m_len_ms milliseconds of samples at the obtained rate
+    m_audio.resize((m_sample_rate*m_len_ms)/1000);
+
+    return true;
+}
+
+// Unpause the capture device and mark capture as running.
+// Logs and returns false when no device is open or capture is already running.
+bool audio_async::resume() {
+    const bool have_device = m_dev_id_in != 0;
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    const bool already_running = m_running;
+    if (already_running) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+    m_running = true;
+
+    return true;
+}
+
+// Pause the capture device and mark capture as stopped.
+// Logs and returns false when no device is open or capture is already paused.
+bool audio_async::pause() {
+    const bool have_device = m_dev_id_in != 0;
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    const bool currently_running = m_running;
+    if (!currently_running) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+    m_running = false;
+
+    return true;
+}
+
+// Drop all buffered audio by resetting the write position and the valid length.
+// Logs and returns false when no device is open or capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the SDL callback writes these fields under the same mutex
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+//
+// Appends the `len` bytes at `stream`, interpreted as float samples (the format
+// requested in init()), to the circular buffer. Does nothing while capture is
+// not running or before the buffer has been sized.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    // nothing to store into yet; this also avoids the modulo by zero below
+    if (len <= 0 || m_audio.empty()) {
+        return;
+    }
+
+    size_t n_samples = len / sizeof(float);
+
+    // a chunk larger than the whole buffer would overrun it - keep only the newest samples
+    if (n_samples > m_audio.size()) {
+        stream   += (n_samples - m_audio.size()) * sizeof(float);
+        n_samples = m_audio.size();
+    }
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (m_audio_pos + n_samples > m_audio.size()) {
+            // the chunk wraps around the end of the buffer: copy it in two parts
+            const size_t n0 = m_audio.size() - m_audio_pos;
+
+            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
+            // `stream` is a byte pointer: the second part starts n0 floats in, not n0 bytes
+            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+            m_audio_len = m_audio.size();
+        } else {
+            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
+        }
+    }
+}
+
+// Copy the most recent `ms` milliseconds of captured audio into `result`,
+// oldest sample first. A non-positive `ms` requests the full buffer length.
+// Fewer samples are returned when less audio has been captured so far.
+// When no device is open or capture is not running, an error is logged and
+// `result` is left untouched.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (ms <= 0) {
+            ms = m_len_ms;
+        }
+
+        size_t n_samples = (m_sample_rate * ms) / 1000;
+        if (n_samples > m_audio_len) {
+            n_samples = m_audio_len;
+        }
+
+        result.resize(n_samples);
+
+        // nothing captured yet: avoid indexing into empty vectors below
+        if (n_samples == 0) {
+            return;
+        }
+
+        // start of the requested window inside the ring buffer; adding m_audio.size()
+        // before subtracting keeps the arithmetic unsigned (n_samples <= m_audio.size())
+        const size_t s0 = (m_audio_pos + m_audio.size() - n_samples) % m_audio.size();
+
+        if (s0 + n_samples > m_audio.size()) {
+            // the window wraps around the end of the buffer: copy it in two parts
+            const size_t n0 = m_audio.size() - s0;
+
+            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
+            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+        } else {
+            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
+        }
+    }
+}
+
+///////////////////////////
+
+// Entry point: capture audio from an SDL device and transcribe it with whisper,
+// either in fixed steps (step_ms > 0) or on detected voice activity (step_ms <= 0).
+//
+// Returns 0 on normal termination, 1 when argument parsing or audio
+// initialization fails, 2 when the model cannot be loaded and 6 when
+// inference fails.
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
+    params.length_ms = std::max(params.length_ms, params.step_ms);
+
+    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
+
+    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
+
+    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
+
+    params.no_timestamps  = !use_vad;
+    params.no_context    |= use_vad;
+    params.max_tokens     = 0;
+
+    // init audio
+
+    audio_async audio(params.length_ms);
+    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
+        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
+        return 1;
+    }
+
+    audio.resume();
+
+    // whisper init
+
+    if (whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params);
+        exit(0);
+    }
+
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+
+    // every whisper_* call below dereferences the context - stop here if the model did not load
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: failed to initialize whisper context from '%s'\n", __func__, params.model.c_str());
+        return 2;
+    }
+
+    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
+    std::vector<float> pcmf32_old;
+    std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
+
+    std::vector<whisper_token> prompt_tokens;
+
+    // print some info about the processing
+    {
+        fprintf(stderr, "\n");
+        if (!whisper_is_multilingual(ctx)) {
+            if (params.language != "en" || params.translate) {
+                params.language = "en";
+                params.translate = false;
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                __func__,
+                n_samples_step,
+                float(n_samples_step)/WHISPER_SAMPLE_RATE,
+                float(n_samples_len )/WHISPER_SAMPLE_RATE,
+                float(n_samples_keep)/WHISPER_SAMPLE_RATE,
+                params.n_threads,
+                params.language.c_str(),
+                params.translate ? "translate" : "transcribe",
+                params.no_timestamps ? 0 : 1);
+
+        if (!use_vad) {
+            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
+        } else {
+            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
+        }
+
+        fprintf(stderr, "\n");
+    }
+
+    int n_iter = 0;
+
+    bool is_running = true;
+
+    printf("[Start speaking]");
+    fflush(stdout);
+
+          auto t_last  = std::chrono::high_resolution_clock::now();
+    const auto t_start = t_last;
+
+    // main audio loop
+    while (is_running) {
+        // handle Ctrl + C
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }
+
+            if (!is_running) {
+                break;
+            }
+        }
+
+        // process new audio
+
+        if (!use_vad) {
+            // wait until a full step of new audio is available
+            while (true) {
+                audio.get(params.step_ms, pcmf32_new);
+
+                if ((int) pcmf32_new.size() > 2*n_samples_step) {
+                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
+                    audio.clear();
+                    continue;
+                }
+
+                if ((int) pcmf32_new.size() >= n_samples_step) {
+                    audio.clear();
+                    break;
+                }
+
+                SDL_Delay(1);
+            }
+
+            const int n_samples_new = pcmf32_new.size();
+
+            // take up to params.length_ms audio from previous iteration
+            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
+
+            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
+
+            pcmf32.resize(n_samples_new + n_samples_take);
+
+            for (int i = 0; i < n_samples_take; i++) {
+                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+            }
+
+            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
+
+            pcmf32_old = pcmf32;
+        } else {
+            const auto t_now  = std::chrono::high_resolution_clock::now();
+            const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
+
+            // check for voice activity at most once every 2 seconds
+            if (t_diff < 2000) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+                continue;
+            }
+
+            audio.get(2000, pcmf32_new);
+
+            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
+                audio.get(params.length_ms, pcmf32);
+            } else {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+                continue;
+            }
+
+            t_last = t_now;
+        }
+
+        // run the inference
+        {
+            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+            wparams.print_progress   = false;
+            wparams.print_special    = params.print_special;
+            wparams.print_realtime   = false;
+            wparams.print_timestamps = !params.no_timestamps;
+            wparams.translate        = params.translate;
+            wparams.no_context       = true;
+            wparams.single_segment   = !use_vad;
+            wparams.max_tokens       = params.max_tokens;
+            wparams.language         = params.language.c_str();
+            wparams.n_threads        = params.n_threads;
+
+            wparams.audio_ctx        = params.audio_ctx;
+
+            // disable temperature fallback
+            wparams.temperature_inc  = -1.0f;
+
+            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
+
+            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                // release the context on this early exit as well
+                whisper_free(ctx);
+                return 6;
+            }
+
+            // print result;
+            {
+                if (!use_vad) {
+                    printf("\33[2K\r");
+
+                    // print long empty line to clear the previous line
+                    printf("%s", std::string(100, ' ').c_str());
+
+                    printf("\33[2K\r");
+                } else {
+                    const int64_t t1 = (t_last - t_start).count()/1000000;
+                    const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
+
+                    printf("\n");
+                    printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
+                    printf("\n");
+                }
+
+                const int n_segments = whisper_full_n_segments(ctx);
+                for (int i = 0; i < n_segments; ++i) {
+                    const char * text = whisper_full_get_segment_text(ctx, i);
+
+                    if (params.no_timestamps) {
+                        printf("%s", text);
+                        fflush(stdout);
+                    } else {
+                        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+                        printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                    }
+                }
+
+                if (use_vad){
+                    printf("\n");
+                    printf("### Transcription %d END\n", n_iter);
+                }
+            }
+
+            ++n_iter;
+
+            if (!use_vad && (n_iter % n_new_line) == 0) {
+                printf("\n");
+
+                // keep part of the audio for next iteration to try to mitigate word boundary issues
+                pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
+
+                // Add tokens of the last full length segment as the prompt
+                if (!params.no_context) {
+                    prompt_tokens.clear();
+
+                    const int n_segments = whisper_full_n_segments(ctx);
+                    for (int i = 0; i < n_segments; ++i) {
+                        const int token_count = whisper_full_n_tokens(ctx, i);
+                        for (int j = 0; j < token_count; ++j) {
+                            prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    audio.pause();
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
--- a/examples/command.wasm/CMakeLists.txt
+++ b/examples/command.wasm/CMakeLists.txt
@ -11,7 +11,6 @@ add_executable(${TARGET}
 include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE
-    common
    whisper
    )

--- a/examples/command.wasm/emscripten.cpp
+++ b/examples/command.wasm/emscripten.cpp
@ -1,5 +1,4 @@
 #include "ggml.h"
-#include "common.h"
 #include "whisper.h"

 #include <emscripten.h>
@ -28,6 +27,24 @@ std::string g_transcribed   = "";

 std::vector<float> g_pcmf32;

+// Return a copy of `s` with leading and trailing whitespace removed.
+static std::string trim(const std::string & s) {
+    const std::regex edges("^\\s+|\\s+$");
+
+    std::string stripped = std::regex_replace(s, edges, "");
+
+    return stripped;
+}
+
+// Filter `data` in place with a first-order recurrence derived from `cutoff`
+// and `sample_rate` (both in the same frequency units). The first sample is
+// left unchanged; an empty vector is a no-op.
+static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    // data[0] below must not be read from an empty vector
+    if (data.empty()) {
+        return;
+    }
+
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    // NOTE(review): data[i - 1] has already been overwritten with the filtered value
+    // by the previous iteration, so this is not the textbook y[i-1] + x[i] - x[i-1]
+    // form - kept as is because callers' thresholds depend on it; confirm intent
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
 // compute similarity between two strings using Levenshtein distance
 static float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
@ -58,6 +75,44 @@ void command_set_status(const std::string & status) {
    g_status = status;
 }

+// Energy-based voice activity check on `pcmf32`.
+//
+// When freq_thold > 0 the samples are first high-pass filtered in place. The mean
+// absolute amplitude of the last `last_ms` milliseconds is then compared against
+// the mean over the whole buffer. Returns false when the buffer is not longer than
+// `last_ms` or when the tail energy exceeds vad_thold * overall energy, and true
+// otherwise.
+bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int total = pcmf32.size();
+    const int tail  = (sample_rate * last_ms) / 1000;
+
+    // not enough samples - assume no speech
+    if (tail >= total) {
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all  = 0.0f;
+    float energy_last = 0.0f;
+
+    // index of the first sample that belongs to the tail window
+    const size_t tail_begin = total - tail;
+
+    for (size_t i = 0; i < (size_t) total; i++) {
+        const float mag = fabsf(pcmf32[i]);
+
+        energy_all += mag;
+
+        if (i >= tail_begin) {
+            energy_last += mag;
+        }
+    }
+
+    energy_all  /= total;
+    energy_last /= tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    const bool tail_is_loud = energy_last > vad_thold*energy_all;
+
+    return !tail_is_loud;
+}
+
 std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

@ -100,7 +155,7 @@ void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
    const int64_t n_samples = (ms * sample_rate) / 1000;

    int64_t n_take = 0;
-    if (n_samples > (int) g_pcmf32.size()) {
+    if (g_pcmf32.size() < n_samples) {
        n_take = g_pcmf32.size();
    } else {
        n_take = n_samples;
@ -132,6 +187,7 @@ void command_main(size_t index) {

    printf("command: using %d threads\n", wparams.n_threads);

+    bool is_running   = true;
    bool have_prompt  = false;
    bool ask_prompt   = true;
    bool print_energy = false;
@ -177,7 +233,7 @@ void command_main(size_t index) {
        {
            command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
+            if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                command_set_status("Speech detected! Processing ...");

--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -5,5 +5,6 @@ if (WHISPER_SUPPORT_SDL2)

    include(DefaultTargetOptions)

-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -6,10 +6,11 @@
 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //

-#include "common.h"
-#include "common-sdl.h"
 #include "whisper.h"

+#include <SDL.h>
+#include <SDL_audio.h>
+
 #include <sstream>
 #include <cassert>
 #include <cstdio>
@ -109,6 +110,309 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

+//
+// SDL Audio capture
+//
+
+class audio_async {
+public:
+    audio_async(int len_ms);
+    ~audio_async();
+
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms seconds of audio in a circular buffer
+    bool resume();
+    bool pause();
+    bool clear();
+
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    int m_len_ms = 0;
+    int m_sample_rate = 0;
+
+    bool       m_running = false;
+    std::mutex m_mutex;
+
+    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
+    size_t             m_audio_pos = 0;
+    size_t             m_audio_len = 0;
+};
+
+audio_async::audio_async(int len_ms) {
+    m_len_ms = len_ms;
+}
+
+audio_async::~audio_async() {
+    if (m_dev_id_in) {
+        SDL_CloseAudioDevice(m_dev_id_in);
+    }
+}
+
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    if (capture_id >= 0) {
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        m_dev_id_in = 0;
+
+        return false;
+    } else {
+        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+                capture_spec_requested.format);
+        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+                capture_spec_requested.channels);
+        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+    }
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    m_audio.resize((m_sample_rate*m_len_ms)/1000);
+
+    return true;
+}
+
+bool audio_async::resume() {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    if (m_running) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+
+    m_running = true;
+
+    return true;
+}
+
+bool audio_async::pause() {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+
+    m_running = false;
+
+    return true;
+}
+
+bool audio_async::clear() {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        m_audio_pos = 0;
+        m_audio_len = 0;
+    }
+
+    return true;
+}
+
+// callback to be called by SDL
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    const size_t n_samples = len / sizeof(float);
+
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (m_audio_pos + n_samples > m_audio.size()) {
+            const size_t n0 = m_audio.size() - m_audio_pos;
+
+            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
+            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+            m_audio_len = m_audio.size();
+        } else {
+            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
+        }
+    }
+}
+
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (ms <= 0) {
+            ms = m_len_ms;
+        }
+
+        size_t n_samples = (m_sample_rate * ms) / 1000;
+        if (n_samples > m_audio_len) {
+            n_samples = m_audio_len;
+        }
+
+        result.resize(n_samples);
+
+        int s0 = m_audio_pos - n_samples;
+        if (s0 < 0) {
+            s0 += m_audio.size();
+        }
+
+        if (s0 + n_samples > m_audio.size()) {
+            const size_t n0 = m_audio.size() - s0;
+
+            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
+            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+        } else {
+            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
+        }
+    }
+}
+
+///////////////////////////
+
+std::string trim(const std::string & s) {
+    std::regex e("^\\s+|\\s+$");
+    return std::regex_replace(s, e, "");
+}
+
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_samples      = pcmf32.size();
+    const int n_samples_last = (sample_rate * last_ms) / 1000;
+
+    if (n_samples_last >= n_samples) {
+        // not enough samples - assume no speech
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all  = 0.0f;
+    float energy_last = 0.0f;
+
+    for (int i = 0; i < n_samples; i++) {
+        energy_all += fabsf(pcmf32[i]);
+
+        if (i >= n_samples - n_samples_last) {
+            energy_last += fabsf(pcmf32[i]);
+        }
+    }
+
+    energy_all  /= n_samples;
+    energy_last /= n_samples_last;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    if (energy_last > vad_thold*energy_all) {
+        return false;
+    }
+
+    return true;
+}
+
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

@ -198,7 +502,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {

    std::string line;
    while (std::getline(ifs, line)) {
-        line = ::trim(line);
+        line = trim(line);
        if (line.empty()) {
            continue;
        }
@ -222,6 +526,23 @@ std::vector<std::string> get_words(const std::string &txt) {
    return words;
 }

+// returns true if no exit event was received
+bool process_sdl_events() {
+    SDL_Event event;
+    while (SDL_PollEvent(&event)) {
+        switch (event.type) {
+            case SDL_QUIT:
+                {
+                    return false;
+                } break;
+            default:
+                break;
+        }
+    }
+
+    return true;
+}
+
 // command-list mode
 // guide the transcription to match the most likely command from a provided list
 int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
@ -313,14 +634,14 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        is_running = process_sdl_events();

        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        audio.get(2000, pcmf32_cur);

-        if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+        if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
            fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

            const auto t_start = std::chrono::high_resolution_clock::now();
@ -454,7 +775,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        is_running = process_sdl_events();

        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -470,7 +791,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
        {
            audio.get(2000, pcmf32_cur);

-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                int64_t t_ms = 0;
@ -533,7 +854,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        is_running = process_sdl_events();

        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -549,7 +870,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
        {
            audio.get(2000, pcmf32_cur);

-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
+            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                int64_t t_ms = 0;
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -1,226 +0,0 @@
-#include "common-sdl.h"
-
-audio_async::audio_async(int len_ms) {
-    m_len_ms = len_ms;
-
-    m_running = false;
-}
-
-audio_async::~audio_async() {
-    if (m_dev_id_in) {
-        SDL_CloseAudioDevice(m_dev_id_in);
-    }
-}
-
-bool audio_async::init(int capture_id, int sample_rate) {
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-
-    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        return false;
-    }
-
-    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-
-    {
-        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-        for (int i = 0; i < nDevices; i++) {
-            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
-        }
-    }
-
-    SDL_AudioSpec capture_spec_requested;
-    SDL_AudioSpec capture_spec_obtained;
-
-    SDL_zero(capture_spec_requested);
-    SDL_zero(capture_spec_obtained);
-
-    capture_spec_requested.freq     = sample_rate;
-    capture_spec_requested.format   = AUDIO_F32;
-    capture_spec_requested.channels = 1;
-    capture_spec_requested.samples  = 1024;
-    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
-        audio_async * audio = (audio_async *) userdata;
-        audio->callback(stream, len);
-    };
-    capture_spec_requested.userdata = this;
-
-    if (capture_id >= 0) {
-        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
-        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    } else {
-        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    }
-
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-        m_dev_id_in = 0;
-
-        return false;
-    } else {
-        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
-        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
-        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
-                capture_spec_requested.format);
-        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
-                capture_spec_requested.channels);
-        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
-    }
-
-    m_sample_rate = capture_spec_obtained.freq;
-
-    m_audio.resize((m_sample_rate*m_len_ms)/1000);
-
-    return true;
-}
-
-bool audio_async::resume() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
-        return false;
-    }
-
-    if (m_running) {
-        fprintf(stderr, "%s: already running!\n", __func__);
-        return false;
-    }
-
-    SDL_PauseAudioDevice(m_dev_id_in, 0);
-
-    m_running = true;
-
-    return true;
-}
-
-bool audio_async::pause() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
-        return false;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: already paused!\n", __func__);
-        return false;
-    }
-
-    SDL_PauseAudioDevice(m_dev_id_in, 1);
-
-    m_running = false;
-
-    return true;
-}
-
-bool audio_async::clear() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
-        return false;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return false;
-    }
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        m_audio_pos = 0;
-        m_audio_len = 0;
-    }
-
-    return true;
-}
-
-// callback to be called by SDL
-void audio_async::callback(uint8_t * stream, int len) {
-    if (!m_running) {
-        return;
-    }
-
-    const size_t n_samples = len / sizeof(float);
-
-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
-
-    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        if (m_audio_pos + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - m_audio_pos;
-
-            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
-
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = m_audio.size();
-        } else {
-            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
-
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
-        }
-    }
-}
-
-void audio_async::get(int ms, std::vector<float> & result) {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
-        return;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return;
-    }
-
-    result.clear();
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-
-        result.resize(n_samples);
-
-        int s0 = m_audio_pos - n_samples;
-        if (s0 < 0) {
-            s0 += m_audio.size();
-        }
-
-        if (s0 + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - s0;
-
-            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-        } else {
-            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
-        }
-    }
-}
-
-bool sdl_poll_events() {
-    SDL_Event event;
-    while (SDL_PollEvent(&event)) {
-        switch (event.type) {
-            case SDL_QUIT:
-                {
-                    return false;
-                } break;
-            default:
-                break;
-        }
-    }
-
-    return true;
-}
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -1,50 +0,0 @@
-#pragma once
-
-#include <SDL.h>
-#include <SDL_audio.h>
-
-#include <atomic>
-#include <cstdint>
-#include <vector>
-#include <mutex>
-
-//
-// SDL Audio capture
-//
-
-class audio_async {
-public:
-    audio_async(int len_ms);
-    ~audio_async();
-
-    bool init(int capture_id, int sample_rate);
-
-    // start capturing audio via the provided SDL callback
-    // keep last len_ms seconds of audio in a circular buffer
-    bool resume();
-    bool pause();
-    bool clear();
-
-    // callback to be called by SDL
-    void callback(uint8_t * stream, int len);
-
-    // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
-
-private:
-    SDL_AudioDeviceID m_dev_id_in = 0;
-
-    int m_len_ms = 0;
-    int m_sample_rate = 0;
-
-    std::atomic_bool m_running;
-    std::mutex       m_mutex;
-
-    std::vector<float> m_audio;
-    std::vector<float> m_audio_new;
-    size_t             m_audio_pos = 0;
-    size_t             m_audio_len = 0;
-};
-
-// Return false if need to quit
-bool sdl_poll_events();
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -1,162 +0,0 @@
-#include "common.h"
-
-// third-party utilities
-// use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-
-#include <cmath>
-#include <regex>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-std::string trim(const std::string & s) {
-    std::regex e("^\\s+|\\s+$");
-    return std::regex_replace(s, e, "");
-}
-
-std::string replace(const std::string & s, const std::string & from, const std::string & to) {
-    std::string result = s;
-    size_t pos = 0;
-    while ((pos = result.find(from, pos)) != std::string::npos) {
-        result.replace(pos, from.length(), to);
-        pos += to.length();
-    }
-    return result;
-}
-
-bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-    if (fname == "-") {
-        {
-            uint8_t buf[1024];
-            while (true)
-            {
-                const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                if (n == 0) {
-                    break;
-                }
-                wav_data.insert(wav_data.end(), buf, buf + n);
-            }
-        }
-
-        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from stdin\n");
-            return false;
-        }
-
-        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    }
-    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
-        return false;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (stereo && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
-        return false;
-    }
-
-    if (wav.bitsPerSample != 16) {
-        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
-        return false;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (stereo) {
-        // convert to stereo, float
-        pcmf32s.resize(2);
-
-        pcmf32s[0].resize(n);
-        pcmf32s[1].resize(n);
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-        }
-    }
-
-    return true;
-}
-
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-
-    float y = data[0];
-
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-
-    for (int i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-
-    return true;
-}
--- a/examples/common.h
+++ b/examples/common.h
@ -1,40 +0,0 @@
-#pragma once
-
-// needs to match WHISPER_SAMPLE_RATE
-#define COMMON_SAMPLE_RATE 16000
-
-#include <vector>
-#include <string>
-
-std::string trim(const std::string & s);
-
-std::string replace(
-        const std::string & s,
-        const std::string & from,
-        const std::string & to);
-
-// Read WAV audio file and store the PCM data into pcmf32
-// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
-// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
-bool read_wav(
-        const std::string & fname,
-        std::vector<float> & pcmf32,
-        std::vector<std::vector<float>> & pcmf32s,
-        bool stereo);
-
-// Apply a high-pass frequency filter to PCM audio
-// Suppresses frequencies below cutoff Hz
-void high_pass_filter(
-        std::vector<float> & data,
-        float cutoff,
-        float sample_rate);
-
-// Basic voice activity detection (VAD) using audio energy adaptive threshold
-bool vad_simple(
-        std::vector<float> & pcmf32,
-        int   sample_rate,
-        int   last_ms,
-        float vad_thold,
-        float freq_thold,
-        bool  verbose);
-
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)

 include(DefaultTargetOptions)

-target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -31,7 +31,6 @@ options:
  -osrt,     --output-srt        [false  ] output result in a srt file
  -owts,     --output-words      [false  ] output script for generating karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
-  -oj,       --output-json       [false  ] output result in a JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -1,7 +1,10 @@
-#include "common.h"
-
 #include "whisper.h"

+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
 #include <cmath>
 #include <fstream>
 #include <cstdio>
@ -73,7 +76,6 @@ struct whisper_params {
    bool output_srt     = false;
    bool output_wts     = false;
    bool output_csv     = false;
-    bool output_jsn     = false;
    bool print_special  = false;
    bool print_colors   = false;
    bool print_progress = false;
@ -81,11 +83,10 @@ struct whisper_params {

    std::string language = "en";
    std::string prompt;
-    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
    std::string model    = "models/ggml-base.en.bin";

    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
+    std::vector<std::string> fname_outp = {};
 };

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -94,11 +95,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

-        if (arg == "-"){
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
        if (arg[0] != '-') {
            params.fname_inp.push_back(arg);
            continue;
@ -129,10 +125,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
-        else if (arg == "-fp"   || arg == "--font-path")      { params.font_path      = argv[++i]; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
-        else if (arg == "-oj"   || arg == "--output-json")    { params.output_jsn     = true; }
-        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
+        else if (arg == "-of"   || arg == "--output-file")    { params.fname_outp.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
        else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
@ -178,9 +172,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
@ -199,7 +191,7 @@ struct whisper_print_user_data {
    const std::vector<std::vector<float>> * pcmf32s;
 };

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -358,157 +350,28 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,text\n";
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text    << "\"\n";
+        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text    << "\"\n";
    }

    return true;
 }

-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "},\n");
-    };
-
-    auto start_obj = [&](const char *name = nullptr) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end = false) {
-        start_value(name);
-        fout << "\"" << val << (end ? "\"\n" : "\",\n");
-    };
-
-    auto end_value = [&](bool end = false) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end = false) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj();
-        value_s("systeminfo", whisper_print_system_info());
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx));
-            value_b("multilingual", whisper_is_multilingual(ctx));
-            value_i("vocab", whisper_model_n_vocab(ctx));
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx));
-                value_i("state", whisper_model_n_audio_state(ctx));
-                value_i("head", whisper_model_n_audio_head(ctx));
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj();
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx));
-                value_i("state", whisper_model_n_text_state(ctx));
-                value_i("head", whisper_model_n_text_head(ctx));
-                value_i("leyer", whisper_model_n_text_layer(ctx), true);
-            end_obj();
-            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
-        end_obj();
-        start_obj("params");
-            value_s("model", params.model.c_str());
-            value_s("language", params.language.c_str());
-            value_b("translate", params.translate, true);
-        end_obj();
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj();
-        start_arr("transcription");
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                start_obj();
-                    start_obj("timestanps");
-                        value_s("from", to_timestamp(t0, true).c_str());
-                        value_s("to", to_timestamp(t1, true).c_str(), true);
-                    end_obj();
-                    start_obj("offsets");
-                        value_i("from", t0 * 10);
-                        value_i("to", t1 * 10, true);
-                    end_obj();
-                    value_s("text", text, true);
-                end_obj(i == (n_segments - 1));
-            }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
-
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
    std::ofstream fout(fname);

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

-    static const char * font = params.font_path.c_str();
-
-    std::ifstream fin(font);
-    if (!fin.is_open()) {
-        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
-        return false;
-    }
+    // TODO: become parameter
+    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";

    fout << "#!/bin/bash" << "\n";
    fout << "\n";
@ -657,14 +520,91 @@ int main(int argc, char ** argv) {

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
+		const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];

-        std::vector<float> pcmf32;               // mono-channel F32 PCM
+        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
+        // WAV input
+        {
+            drwav wav;
+            std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+            if (fname_inp == "-") {
+                {
+                    uint8_t buf[1024];
+                    while (true)
+                    {
+                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                        if (n == 0) {
+                            break;
+                        }
+                        wav_data.insert(wav_data.end(), buf, buf + n);
+                    }
+                }
+
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
+                    return 4;
+                }
+
+                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+            }
+            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
+                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
+                return 5;
+            }
+
+            if (wav.channels != 1 && wav.channels != 2) {
+                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
+                return 6;
+            }
+
+            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
+                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
+                return 6;
+            }
+
+            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
+                fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
+                return 8;
+            }
+
+            if (wav.bitsPerSample != 16) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
+                return 9;
+            }
+
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+            std::vector<int16_t> pcm16;
+            pcm16.resize(n*wav.channels);
+            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+            drwav_uninit(&wav);
+
+            // convert to mono, float
+            pcmf32.resize(n);
+            if (wav.channels == 1) {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[i])/32768.0f;
+                }
+            } else {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+                }
+            }
+
+            if (params.diarize) {
+                // convert to stereo, float
+                pcmf32s.resize(2);
+
+                pcmf32s[0].resize(n);
+                pcmf32s[1].resize(n);
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+                }
+            }
        }

        // print system information
@ -742,7 +682,7 @@ int main(int argc, char ** argv) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -761,39 +701,34 @@ int main(int argc, char ** argv) {

            // output to text file
            if (params.output_txt) {
-                const auto fname_txt = fname_out + ".txt";
+                const auto fname_txt = fname_outp + ".txt";
                output_txt(ctx, fname_txt.c_str());
            }

            // output to VTT file
            if (params.output_vtt) {
-                const auto fname_vtt = fname_out + ".vtt";
+                const auto fname_vtt = fname_outp + ".vtt";
                output_vtt(ctx, fname_vtt.c_str());
            }

            // output to SRT file
            if (params.output_srt) {
-                const auto fname_srt = fname_out + ".srt";
+                const auto fname_srt = fname_outp + ".srt";
                output_srt(ctx, fname_srt.c_str(), params);
            }

            // output to WTS file
            if (params.output_wts) {
-                const auto fname_wts = fname_out + ".wts";
+                const auto fname_wts = fname_outp + ".wts";
                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }

-            // output to CSV file
+	    // output to CSV file
            if (params.output_csv) {
-                const auto fname_csv = fname_out + ".csv";
+                const auto fname_csv = fname_outp + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }

-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params);
-            }
        }
    }

--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@ -5,5 +5,6 @@ if (WHISPER_SUPPORT_SDL2)

    include(DefaultTargetOptions)

-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -3,16 +3,19 @@
 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //

-#include "common.h"
-#include "common-sdl.h"
 #include "whisper.h"

+#include <SDL.h>
+#include <SDL_audio.h>
+
+#include <atomic>
 #include <cassert>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
 #include <fstream>
+#include <mutex>

 //  500 -> 00:05.000
 // 6000 -> 01:00.000
@ -113,6 +116,306 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

+//
+// SDL Audio capture
+//
+
+// Asynchronous audio capture on top of an SDL capture device.
+// The SDL audio callback appends incoming float samples to a fixed-size
+// circular buffer (m_audio); get() copies the most recent samples out of it.
+class audio_async {
+public:
+    // len_ms: length of the circular buffer, in milliseconds
+    audio_async(int len_ms);
+    // closes the capture device if one was opened
+    ~audio_async();
+
+    // initialize SDL audio and open a capture device
+    // capture_id < 0 opens the default capture device
+    // returns false if SDL or the device could not be initialized
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    // stop capturing (the device stays open)
+    bool pause();
+    // discard everything captured so far
+    bool clear();
+
+    // callback to be called by SDL
+    // stream holds len bytes of float samples
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // copies the most recent ms milliseconds into audio; ms <= 0 means the whole buffer
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    // SDL id of the opened capture device, 0 while no device is open
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    // buffer length in milliseconds and sample rate obtained from SDL
+    int m_len_ms = 0;
+    int m_sample_rate = 0;
+
+    // m_running: true between resume() and pause()
+    // m_mutex:   guards m_audio, m_audio_pos and m_audio_len
+    std::atomic_bool m_running;
+    std::mutex       m_mutex;
+
+    // m_audio:     the circular buffer, sized in init()
+    // m_audio_new: copy of the chunk most recently delivered to callback()
+    // m_audio_pos: index in m_audio where the next sample will be written
+    // m_audio_len: number of valid samples in m_audio (at most m_audio.size())
+    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
+    size_t             m_audio_pos = 0;
+    size_t             m_audio_len = 0;
+};
+
+// Store the requested buffer length (in milliseconds); capture is not running yet.
+audio_async::audio_async(int len_ms)
+    : m_len_ms(len_ms),
+      m_running(false) {
+}
+
+// Close the SDL capture device, if one was opened by init().
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        // no device was ever opened - nothing to release
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initialize SDL audio and open a capture device.
+//   capture_id  - index of the capture device to open; a negative value opens the default device
+//   sample_rate - requested sample rate in Hz
+// On success the circular buffer is sized to hold m_len_ms milliseconds of audio at the
+// obtained sample rate and true is returned. Returns false if SDL could not be initialized
+// or no capture device could be opened.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // SDL_GetAudioDeviceName() returns NULL on error (e.g. an out-of-range index);
+    // substitute a placeholder so the name is always safe to print with "%s"
+    const auto printable_name = [](const char * name) {
+        return name ? name : "(unknown)";
+    };
+
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, printable_name(SDL_GetAudioDeviceName(i, SDL_TRUE)));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    // SDL only knows a C callback + userdata pointer: forward to the member function
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    if (capture_id >= 0) {
+        // the raw (possibly NULL) name is still what gets handed to SDL, exactly as before
+        const char * name = SDL_GetAudioDeviceName(capture_id, SDL_TRUE);
+
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, printable_name(name));
+        m_dev_id_in = SDL_OpenAudioDevice(name, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        return false;
+    }
+
+    fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+    fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+    fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+            capture_spec_requested.format);
+    fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+            capture_spec_requested.channels);
+    fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // widen before multiplying: rate * milliseconds overflows a 32-bit int for long buffers
+    m_audio.resize((static_cast<long long>(m_sample_rate)*m_len_ms)/1000);
+
+    return true;
+}
+
+// Un-pause the capture device so SDL starts delivering audio to callback().
+// Returns false (after logging) if no device is open or capture is already running.
+bool audio_async::resume() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    if (m_running.load()) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    // pause_on = 0 -> device is un-paused
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+    m_running.store(true);
+
+    return true;
+}
+
+// Pause the capture device; the device stays open and can be resumed later.
+// Returns false (after logging) if no device is open or capture is not running.
+bool audio_async::pause() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    if (!m_running.load()) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    // pause_on = 1 -> device is paused
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+    m_running.store(false);
+
+    return true;
+}
+
+// Forget all audio captured so far by resetting the circular buffer bookkeeping.
+// Returns false (after logging) if no device is open or capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (!m_running.load()) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the SDL callback updates the same fields, so reset them under the lock
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+// Appends the float samples contained in the `len` bytes at `stream` to the circular
+// buffer, overwriting the oldest samples once the buffer is full.
+// Does nothing while capture is not running.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    if (len <= 0) {
+        return;
+    }
+
+    const size_t n_total = (size_t) len / sizeof(float);
+    if (n_total == 0) {
+        return;
+    }
+
+    // take a typed copy of the chunk before grabbing the lock
+    m_audio_new.resize(n_total);
+    memcpy(m_audio_new.data(), stream, n_total * sizeof(float));
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        const size_t capacity = m_audio.size();
+        if (capacity == 0) {
+            // buffer was never sized (or sized to zero) - nowhere to store samples
+            return;
+        }
+
+        // a chunk larger than the whole buffer can only contribute its most recent samples
+        const size_t  n_samples = std::min(n_total, capacity);
+        const float * src       = m_audio_new.data() + (n_total - n_samples);
+
+        // part that fits between the write position and the end of the buffer
+        const size_t n0 = std::min(n_samples, capacity - m_audio_pos);
+        memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
+
+        // remainder wraps around to the start of the buffer
+        // src is a float pointer, so the offset is n0 samples (not n0 bytes)
+        if (n_samples > n0) {
+            memcpy(&m_audio[0], src + n0, (n_samples - n0) * sizeof(float));
+        }
+
+        m_audio_pos = (m_audio_pos + n_samples) % capacity;
+        m_audio_len = std::min(m_audio_len + n_samples, capacity);
+    }
+}
+
+// Copy the most recent `ms` milliseconds of captured audio into `result`.
+//   ms     - amount of audio to fetch; ms <= 0 requests the full buffer length (m_len_ms)
+//   result - receives the samples, oldest first; it holds fewer samples than requested
+//            when less audio has been captured so far
+// Logs and leaves `result` untouched if no device is open or capture is not running.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (ms <= 0) {
+            ms = m_len_ms;
+        }
+
+        // widen before multiplying: rate * milliseconds overflows a 32-bit int for long requests
+        const long long n_wanted = (static_cast<long long>(m_sample_rate) * ms) / 1000;
+
+        // never hand out more than has been captured
+        // (a negative request is clamped the same way, as it was before)
+        size_t n_samples = m_audio_len;
+        if (n_wanted >= 0 && static_cast<unsigned long long>(n_wanted) < n_samples) {
+            n_samples = static_cast<size_t>(n_wanted);
+        }
+
+        result.resize(n_samples);
+
+        if (n_samples == 0) {
+            // nothing to copy; also avoids indexing an empty buffer below
+            return;
+        }
+
+        // n_samples <= m_audio_len <= capacity, so the sum below cannot underflow
+        const size_t capacity = m_audio.size();
+        const size_t s0       = (m_audio_pos + capacity - n_samples) % capacity;
+
+        // part between the start index and the end of the buffer
+        const size_t n_to_end = capacity - s0;
+        const size_t n0       = n_samples < n_to_end ? n_samples : n_to_end;
+
+        memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
+
+        // remainder wraps around to the start of the buffer
+        if (n_samples > n0) {
+            memcpy(result.data() + n0, &m_audio[0], (n_samples - n0) * sizeof(float));
+        }
+    }
+}
+
+///////////////////////////
+
+// Filter `data` in place with a first-order recursive filter.
+//   data        - samples to filter; the first sample is left unchanged
+//   cutoff      - cutoff frequency in Hz
+//   sample_rate - sample rate of `data` in Hz
+// An empty `data` is left untouched.
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    if (data.empty()) {
+        // nothing to filter, and data[0] below would be out of bounds
+        return;
+    }
+
+    // spelled out instead of M_PI, which is not part of standard C++
+    // (e.g. MSVC only defines it with _USE_MATH_DEFINES)
+    const double pi = 3.14159265358979323846;
+
+    const float rc = 1.0f / (2.0f * pi * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        // NOTE: from i = 2 on, data[i - 1] has already been overwritten with the filtered value
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+// Simple energy-based check comparing the tail of the buffer against the whole buffer.
+//   pcmf32      - samples to inspect; filtered IN PLACE by high_pass_filter() when freq_thold > 0
+//   sample_rate - sample rate of pcmf32 in Hz
+//   last_ms     - length of the tail window, in milliseconds
+//   vad_thold   - the tail counts as loud when its mean exceeds vad_thold times the overall mean
+//   freq_thold  - cutoff passed to high_pass_filter(); values <= 0 skip the filtering
+//   verbose     - print the computed energies to stderr
+// Returns true when the mean absolute amplitude of the last `last_ms` is not greater than
+// vad_thold times the mean over the whole buffer. Returns false otherwise, and also when
+// the buffer is not longer than the tail window.
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_total = pcmf32.size();
+    const int n_tail  = (sample_rate * last_ms) / 1000;
+
+    // not enough samples - assume no speech
+    if (n_tail >= n_total) {
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    // index of the first sample that belongs to the tail window
+    const int tail_begin = n_total - n_tail;
+
+    float sum_all  = 0.0f;
+    float sum_tail = 0.0f;
+
+    for (int idx = 0; idx < n_total; ++idx) {
+        const float magnitude = fabsf(pcmf32[idx]);
+
+        sum_all += magnitude;
+        if (idx >= tail_begin) {
+            sum_tail += magnitude;
+        }
+    }
+
+    const float mean_all  = sum_all  / n_total;
+    const float mean_tail = sum_tail / n_tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, mean_all, mean_tail, vad_thold, freq_thold);
+    }
+
+    return !(mean_tail > vad_thold*mean_all);
+}
+
 int main(int argc, char ** argv) {
    whisper_params params;

@ -123,10 +426,10 @@ int main(int argc, char ** argv) {
    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);

-    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
-    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
+    const int n_samples_step = (params.step_ms  *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (params.keep_ms  *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (30000           *1e-3)*WHISPER_SAMPLE_RATE;

    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD

@ -214,7 +517,23 @@ int main(int argc, char ** argv) {
    // main audio loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }
+
+            if (!is_running) {
+                break;
+            }
+        }

        if (!is_running) {
            break;
@ -237,7 +556,7 @@ int main(int argc, char ** argv) {
                    break;
                }

-                std::this_thread::sleep_for(std::chrono::milliseconds(1));
+                SDL_Delay(1);
            }

            const int n_samples_new = pcmf32_new.size();
@ -268,7 +587,7 @@ int main(int argc, char ** argv) {

            audio.get(2000, pcmf32_new);

-            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
+            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                audio.get(params.length_ms, pcmf32);
            } else {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -288,6 +607,7 @@ int main(int argc, char ** argv) {
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
+            wparams.no_context       = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
--- a/examples/talk.llama/.gitignore
+++ b/examples/talk.llama/.gitignore
@ -1,2 +0,0 @@
-eleven-labs.py
-audio.mp3
--- a/examples/talk.llama/CMakeLists.txt
+++ b/examples/talk.llama/CMakeLists.txt
@ -1,12 +0,0 @@
-if (WHISPER_SUPPORT_SDL2)
-    # talk.llama
-    set(TARGET talk-llama)
-
-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk-llama.cpp llama.cpp)
-
-    include(DefaultTargetOptions)
-
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-endif ()
--- a/examples/talk.llama/README.md
+++ b/examples/talk.llama/README.md
@ -1,2 +0,0 @@
-# talk.llama
-
--- a/examples/talk.llama/llama.cpp
+++ b/examples/talk.llama/llama.cpp
--- a/examples/talk.llama/llama.h
+++ b/examples/talk.llama/llama.h
@ -1,153 +0,0 @@
-#ifndef LLAMA_H
-#define LLAMA_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef LLAMA_SHARED
-#    ifdef _WIN32
-#        ifdef LLAMA_BUILD
-#            define LLAMA_API __declspec(dllexport)
-#        else
-#            define LLAMA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAMA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAMA_API
-#endif
-
-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct llama_context;
-
-    typedef int llama_token;
-
-    typedef struct llama_token_data {
-        llama_token id;  // token id
-
-        float p;     // probability of the token
-        float plog;  // log probability of the token
-
-    } llama_token_data;
-
-    typedef void (*llama_progress_callback)(double progress, void *ctx);
-
-    struct llama_context_params {
-        int n_ctx;   // text context
-        int n_parts; // -1 for default
-        int seed;    // RNG seed, 0 for random
-
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-    };
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params);
-
-    // Frees all allocated memory
-    LLAMA_API void llama_free(struct llama_context * ctx);
-
-    // TODO: not great API - very likely to change
-    // Returns 0 on success
-    LLAMA_API int llama_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-                   int   itype,
-                   int   qk);
-
-    // Run the llama inference to obtain the logits and probabilities for the next token.
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    LLAMA_API int llama_eval(
-            struct llama_context * ctx,
-               const llama_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-                      const char * text,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-
-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-       struct llama_context * ctx,
-          const llama_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                     double   top_p,
-                     double   temp,
-                     double   repeat_penalty);
-
-    // Performance information
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/examples/talk.llama/speak.sh
+++ b/examples/talk.llama/speak.sh
@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Usage:
-#  speak.sh <voice_id> <text-to-speak>
-
-# espeak
-# Mac OS: brew install espeak
-# Linux: apt-get install espeak
-#
-#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
-
-# for Mac
-say "$2"
-
-# Eleven Labs
-#
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2" >/dev/null 2>&1
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
--- a/examples/talk.llama/talk-llama.cpp
+++ b/examples/talk.llama/talk-llama.cpp
@ -1,511 +0,0 @@
-// Talk with AI
-//
-
-#include "common.h"
-#include "common-sdl.h"
-#include "whisper.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cstdio>
-#include <fstream>
-#include <regex>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int)add_bos);
-    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
-// command-line parameters
-struct whisper_params {
-    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t voice_ms   = 10000;
-    int32_t capture_id = -1;
-    int32_t max_tokens = 32;
-    int32_t audio_ctx  = 0;
-
-    float vad_thold    = 0.6f;
-    float freq_thold   = 100.0f;
-
-    bool speed_up      = false;
-    bool translate     = false;
-    bool print_special = false;
-    bool print_energy  = false;
-    bool no_timestamps = true;
-
-    std::string person      = "Santa";
-    std::string language    = "en";
-    std::string model_wsp   = "models/ggml-base.en.bin";
-    std::string model_llama = "models/ggml-llama-7B.bin";
-    std::string speak       = "./examples/talk/speak.sh";
-    std::string fname_out;
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
-        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms      = std::stoi(argv[++i]); }
-        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
-        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
-        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
-        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
-        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
-        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
-        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
-        else if (arg == "-p"   || arg == "--person")        { params.person        = argv[++i]; }
-        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
-        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
-        else if (arg == "-ml"  || arg == "--model-llama")   { params.model_llama   = argv[++i]; }
-        else if (arg == "-s"   || arg == "--speak")         { params.speak         = argv[++i]; }
-        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n",              params.voice_ms);
-    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
-    fprintf(stderr, "  -p NAME,  --person NAME   [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
-    fprintf(stderr, "  -mg FILE, --model-llama   [%-7s] llama model file\n",                            params.model_llama.c_str());
-    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
-    const auto t_start = std::chrono::high_resolution_clock::now();
-
-    prob = 0.0f;
-    t_ms = 0;
-
-    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-    wparams.print_progress   = false;
-    wparams.print_special    = params.print_special;
-    wparams.print_realtime   = false;
-    wparams.print_timestamps = !params.no_timestamps;
-    wparams.translate        = params.translate;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.max_tokens       = params.max_tokens;
-    wparams.language         = params.language.c_str();
-    wparams.n_threads        = params.n_threads;
-
-    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
-
-    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-        return "";
-    }
-
-    int prob_n = 0;
-    std::string result;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-
-        result += text;
-
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        for (int j = 0; j < n_tokens; ++j) {
-            const auto token = whisper_full_get_token_data(ctx, i, j);
-
-            prob += token.p;
-            ++prob_n;
-        }
-    }
-
-    if (prob_n > 0) {
-        prob /= prob_n;
-    }
-
-    const auto t_end = std::chrono::high_resolution_clock::now();
-    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
-
-    return result;
-}
-
-// need to have leading ' '
-//const std::string k_prompt = R"( Transcript of a dialog, where {1} interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer {1}'s requests immediately and with precision.
-//
-//{0}: Hello, Bob.
-//{1}: Hello {0}. How may I help you today?
-//{0}:)";
-
-const std::string k_prompt = R"( Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
-{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{1} answers responds with short and concise answers.
-
-{0}{4} Hello, {1}!
-{1}{4} Hello {0}! How may I help you today?
-{0}{4} What time is it?
-{1}{4} It is {2} o'clock.
-{0}{4} What year is it?
-{1}{4} We are in {3}.
-{0}{4} What is a cat?
-{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
-{0}{4} Name a color.
-{1}{4} Blue
-{0}{4})";
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
-
-    // llama init
-
-    auto lparams = llama_context_default_params();
-
-    lparams.n_ctx      = 512;
-    lparams.n_parts    = 2; // TODO fix
-    lparams.seed       = 1; // TODO fix
-    lparams.f16_kv     = true;
-
-    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
-
-    // print some info about the processing
-    {
-        fprintf(stderr, "\n");
-        if (!whisper_is_multilingual(ctx_wsp)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
-                __func__,
-                params.n_threads,
-                params.language.c_str(),
-                params.translate ? "translate" : "transcribe",
-                params.no_timestamps ? 0 : 1);
-
-        fprintf(stderr, "\n");
-    }
-
-
-    // init audio
-
-    audio_async audio(30*1000);
-    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        return 1;
-    }
-
-    audio.resume();
-
-    int n_iter = 0;
-
-    bool is_running  = true;
-    bool force_speak = false;
-
-    float prob0 = 0.0f;
-
-    const std::string chat_symb = ":";
-    const std::string bot_name = "LLAMA";
-
-    std::vector<float> pcmf32_cur;
-    std::vector<float> pcmf32_prompt;
-
-    std::string prompt_org = k_prompt;
-    prompt_org = ::replace(prompt_org, "{0}", params.person);
-    prompt_org = ::replace(prompt_org, "{1}", bot_name);
-
-    {
-        // get time string
-        std::string time_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%H:%M", now);
-            time_str = buf;
-        }
-        prompt_org = ::replace(prompt_org, "{2}", time_str);
-    }
-
-    {
-        // get year string
-        std::string year_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%Y", now);
-            year_str = buf;
-        }
-        prompt_org = ::replace(prompt_org, "{3}", year_str);
-    }
-
-    prompt_org = ::replace(prompt_org, "{4}", chat_symb);
-
-    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_org, true);
-
-    const int n_ctx = llama_n_ctx(ctx_llama);
-
-    printf("\n");
-    printf("%s : initializing - please wait ...\n", __func__);
-
-    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
-        return 1;
-    }
-
-    //fprintf(stdout, "\n");
-    //fprintf(stdout, "%s", prompt_org.c_str());
-    //fflush(stdout);
-
-    printf("%s : done! start speaking in the microphone\n", __func__);
-    printf("\n");
-    printf("%s%s", params.person.c_str(), chat_symb.c_str());
-    fflush(stdout);
-
-    audio.clear();
-
-    const int n_keep = embd_inp.size();
-    const int voice_id = 2;
-
-    int n_past = n_keep;
-    int n_prev = 64; // TODO arg
-
-    std::vector<llama_token> embd;
-
-    std::vector<std::string> antiprompts = {
-        params.person + chat_symb,
-    };
-
-    // main loop
-    while (is_running) {
-        // handle Ctrl + C
-        is_running = sdl_poll_events();
-
-        if (!is_running) {
-            break;
-        }
-
-        // delay
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-        int64_t t_ms = 0;
-
-        {
-            audio.get(2000, pcmf32_cur);
-
-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
-                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
-
-                audio.get(params.voice_ms, pcmf32_cur);
-
-                std::string text_heard;
-
-                if (!force_speak) {
-                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\[.*?\\]");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\(.*?\\)");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
-                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-
-                // take first line
-                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
-
-                // remove leading and trailing whitespace
-                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
-                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-
-                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
-
-                if (text_heard.empty() || tokens.empty() || force_speak) {
-                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
-                    audio.clear();
-
-                    continue;
-                }
-
-                force_speak = false;
-
-                text_heard.insert(0, 1, ' ');
-                text_heard += "\n" + bot_name + chat_symb;
-                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
-                fflush(stdout);
-
-                embd = ::llama_tokenize(ctx_llama, text_heard, false);
-
-                // text inference
-                bool done = false;
-                std::string text_to_speak;
-                while (true) {
-                    // predict
-                    if (embd.size() > 0) {
-                        if (n_past + (int) embd.size() > n_ctx) {
-                            n_past = n_keep;
-
-                            // insert n_left/2 tokens at the start of embd from last_n_tokens
-                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
-
-                            //printf("\n---\n");
-                            //printf("resetting: '");
-                            //for (int i = 0; i < (int) embd.size(); i++) {
-                            //    printf("%s", llama_token_to_str(ctx_llama, embd[i]));
-                            //}
-                            //printf("'\n");
-                            //printf("\n---\n");
-                        }
-
-                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
-                            fprintf(stderr, "%s : failed to eval\n", __func__);
-                            return 1;
-                        }
-                    }
-
-                    //printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
-
-                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
-                    n_past += embd.size();
-                    embd.clear();
-
-                    if (done) break;
-
-                    {
-                        // out of user input, sample next token
-                        const float top_k          = 5;
-                        const float top_p          = 0.80f;
-                        const float temp           = 0.30f;
-                        const float repeat_penalty = 1.1764f;
-
-                        const int repeat_last_n    = 256;
-
-                        llama_token id = 0;
-
-                        {
-                            //auto logits = llama_get_logits(ctx_llama);
-                            //logits[llama_token_eos()] = 0;
-
-                            id = llama_sample_top_p_top_k(ctx_llama,
-                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
-                        }
-
-                        if (id != llama_token_eos()) {
-                            // add it to the context
-                            embd.push_back(id);
-
-                            text_to_speak += llama_token_to_str(ctx_llama, id);
-
-                            printf("%s", llama_token_to_str(ctx_llama, id));
-                        } else {
-                            // TODO
-                            printf("EOS TOKEN - SHOULD NOT HAPPEN\n");
-                            exit(0);
-                        }
-                    }
-
-                    {
-                        std::string last_output;
-                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-                            last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
-                        }
-                        last_output += llama_token_to_str(ctx_llama, embd[0]);
-
-                        for (std::string & antiprompt : antiprompts) {
-                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                                done = true;
-                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
-                                fflush(stdout);
-                                break;
-                            }
-                        }
-                    }
-
-                    is_running = sdl_poll_events();
-
-                    if (!is_running) {
-                        break;
-                    }
-                }
-
-                text_to_speak = ::replace(text_to_speak, "\"", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
-
-                audio.clear();
-
-                ++n_iter;
-            }
-        }
-    }
-
-    audio.pause();
-
-    whisper_print_timings(ctx_wsp);
-    whisper_free(ctx_wsp);
-
-    return 0;
-}
--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@ -7,7 +7,7 @@ if (WHISPER_SUPPORT_SDL2)

    # TODO: this is temporary
    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)

    include(DefaultTargetOptions)

--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```

 ## TTS
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -1,14 +1,16 @@
 // Talk with AI
 //

-#include "common.h"
-#include "common-sdl.h"
 #include "whisper.h"
 #include "gpt-2.h"

+#include <SDL.h>
+#include <SDL_audio.h>
+
 #include <cassert>
 #include <cstdio>
 #include <fstream>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@ -103,6 +105,320 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

+//
+// SDL Audio capture
+//
+
+// Asynchronous audio capture on top of SDL: samples delivered to callback()
+// are stored in a circular buffer and read back with get().
+class audio_async {
+public:
+    // len_ms: length of the circular buffer, in milliseconds of audio
+    audio_async(int len_ms);
+    // closes the capture device if one was opened
+    ~audio_async();
+
+    // initialize SDL audio and open capture device capture_id
+    // (a negative capture_id opens the default capture device)
+    // returns false on failure
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    // stop capturing; returns false if no device is open or capture is not running
+    bool pause();
+    // discard the buffered audio; returns false if no device is open or capture is not running
+    bool clear();
+
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // ms: how much of the most recent audio to return (ms <= 0: the whole buffer length)
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    // SDL id of the opened capture device, 0 while no device is open
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    // buffer length in milliseconds, as passed to the constructor
+    int m_len_ms = 0;
+    // sample rate obtained from the device in init()
+    int m_sample_rate = 0;
+
+    // true between a successful resume() and the next pause()
+    bool       m_running = false;
+    // guards m_audio, m_audio_pos and m_audio_len
+    std::mutex m_mutex;
+
+    // circular buffer storage, sized in init()
+    std::vector<float> m_audio;
+    // copy of the samples received in the latest callback
+    std::vector<float> m_audio_new;
+    // index in m_audio where the next samples are written
+    size_t             m_audio_pos = 0;
+    // number of valid samples currently held in m_audio
+    size_t             m_audio_len = 0;
+};
+
+// remember the requested buffer length; the device itself is opened later by init()
+audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
+}
+
+// release the capture device, if init() managed to open one
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initialize the SDL audio subsystem and open a capture device.
+//   capture_id  - index of the capture device to open; a negative value opens the default device
+//   sample_rate - requested sample rate
+// Returns false if SDL could not be initialized or no capture device could be opened.
+// On success the circular buffer is sized to hold m_len_ms of audio at the obtained sample rate.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // list the available capture devices
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            // the name can be NULL on error; never hand NULL to a "%s" conversion
+            const char * name = SDL_GetAudioDeviceName(i, SDL_TRUE);
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, name ? name : "(unknown)");
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    // forward the C callback to the member function of the instance stored in userdata
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    if (capture_id >= 0) {
+        // SDL_GetAudioDeviceName() returns NULL for an invalid index. Passing NULL to "%s" is
+        // undefined behavior, so print a placeholder instead. A NULL name handed to
+        // SDL_OpenAudioDevice() still requests the default device, exactly as before.
+        const char * name = SDL_GetAudioDeviceName(capture_id, SDL_TRUE);
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, name ? name : "(unknown)");
+        m_dev_id_in = SDL_OpenAudioDevice(name, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+
+        return false;
+    }
+
+    fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+    fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+    fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+            capture_spec_requested.format);
+    fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+            capture_spec_requested.channels);
+    fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+    fprintf(stderr, "\n");
+
+    // use the rate the device actually provides, which is what callback() will deliver
+    m_sample_rate = capture_spec_obtained.freq;
+
+    m_audio.resize((m_sample_rate*m_len_ms)/1000);
+
+    return true;
+}
+
+// Unpause the capture device so that SDL starts delivering audio to callback().
+// Returns false when no device is open or capture is already running.
+bool audio_async::resume() {
+    const bool have_device = m_dev_id_in != 0;
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    const bool already_running = m_running;
+    if (already_running) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    // pause_on == 0 starts the device
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+    m_running = true;
+
+    return true;
+}
+
+// Pause the capture device.
+// Returns false when no device is open or capture is not running.
+bool audio_async::pause() {
+    const bool have_device = m_dev_id_in != 0;
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    const bool is_paused = !m_running;
+    if (is_paused) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    // pause_on == 1 stops the device
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+    m_running = false;
+
+    return true;
+}
+
+// Forget all buffered audio by resetting the write position and the valid length.
+// Returns false when no device is open or capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the SDL callback updates the same fields, so reset them under the mutex
+    const std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+// Appends the float samples contained in the len bytes at stream to the circular
+// buffer, wrapping around at the end of the buffer and overwriting the oldest samples.
+// Does nothing while capture is not running.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    size_t n_samples = len / sizeof(float);
+
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        // a zero-sized buffer cannot hold anything (and would make the modulo below divide by zero)
+        if (m_audio.empty()) {
+            return;
+        }
+
+        // a chunk larger than the whole buffer would overflow it: keep only its newest samples
+        if (n_samples > m_audio.size()) {
+            stream    += (n_samples - m_audio.size()) * sizeof(float);
+            n_samples  = m_audio.size();
+        }
+
+        if (m_audio_pos + n_samples > m_audio.size()) {
+            // the chunk does not fit before the end of the buffer: split it in two
+            const size_t n0 = m_audio.size() - m_audio_pos;
+
+            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
+            // stream is a byte pointer, so the offset of the second part must be given in bytes
+            // (indexing with &stream[n0] skipped only n0 bytes instead of n0 samples)
+            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+
+            m_audio_len = m_audio.size();
+        } else {
+            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
+
+            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
+        }
+
+        m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+    }
+}
+
+// Copy the most recent ms milliseconds of captured audio into result, oldest sample first.
+// A non-positive ms selects the full buffer length. Fewer samples are returned when less
+// audio has been buffered. result is left untouched when no device is open or capture
+// is not running.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    // the SDL callback writes the buffer concurrently, so read it under the mutex
+    const std::lock_guard<std::mutex> guard(m_mutex);
+
+    const int    span_ms = ms > 0 ? ms : m_len_ms;
+    const size_t n_want  = (m_sample_rate * span_ms) / 1000;
+    const size_t n_take  = n_want > m_audio_len ? m_audio_len : n_want;
+
+    result.resize(n_take);
+
+    // index of the oldest requested sample, wrapped back into the buffer
+    int first = m_audio_pos - n_take;
+    if (first < 0) {
+        first += m_audio.size();
+    }
+
+    // samples that can be read before the end of the buffer; the rest wraps to the front
+    const size_t n_tail = std::min(n_take, m_audio.size() - first);
+
+    memcpy(result.data(), &m_audio[first], n_tail * sizeof(float));
+    if (n_tail < n_take) {
+        memcpy(&result[n_tail], &m_audio[0], (n_take - n_tail) * sizeof(float));
+    }
+}
+
+///////////////////////////
+
+// return a copy of s without its leading and trailing whitespace
+std::string trim(const std::string & s) {
+    const std::regex edge_ws("^\\s+|\\s+$");
+
+    std::string stripped = std::regex_replace(s, edge_ws, "");
+    return stripped;
+}
+
+// Return a copy of s in which every occurrence of from is replaced by to.
+// Occurrences are located left to right; text inserted by a replacement is not scanned again.
+// An empty from matches nothing, so s is returned unchanged.
+std::string replace(const std::string & s, const std::string & from, const std::string & to) {
+    std::string result = s;
+
+    // find("") succeeds at every position, so without this guard the loop below never terminates
+    if (from.empty()) {
+        return result;
+    }
+
+    size_t pos = 0;
+    while ((pos = result.find(from, pos)) != std::string::npos) {
+        result.replace(pos, from.length(), to);
+        // continue after the inserted text so that it is not matched again
+        pos += to.length();
+    }
+    return result;
+}
+
+// Filter data in place with the recurrence y = alpha * (y + data[i] - data[i - 1]).
+//   cutoff      - cutoff frequency, in the same unit as sample_rate
+//   sample_rate - sampling rate of data
+// data[0] is not modified and seeds the filter state. An empty vector is left untouched.
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    // nothing to filter; also avoids reading data[0] of an empty vector
+    if (data.empty()) {
+        return;
+    }
+
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    // NOTE(review): the textbook RC high-pass coefficient is rc / (rc + dt); the value below
+    // is kept as is to preserve the existing behavior - confirm which one is intended
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+// Simple energy-based voice activity check.
+// Compares the mean absolute amplitude of the last last_ms milliseconds of pcmf32 with the
+// mean absolute amplitude of the whole buffer. Returns true when the energy of the last
+// part is not greater than vad_thold times the overall energy, and false otherwise or when
+// the buffer does not hold more than last_ms of samples.
+// When freq_thold > 0 the samples in pcmf32 are high-pass filtered in place first.
+// If verbose is set, the computed energies are printed to stderr.
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_total = pcmf32.size();
+    const int n_tail  = (sample_rate * last_ms) / 1000;
+
+    // not enough samples - assume no speech
+    if (n_tail >= n_total) {
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    // mean absolute amplitude of the whole buffer
+    float energy_all = 0.0f;
+    for (int i = 0; i < n_total; ++i) {
+        energy_all += fabsf(pcmf32[i]);
+    }
+    energy_all /= n_total;
+
+    // mean absolute amplitude of the last n_tail samples
+    float energy_last = 0.0f;
+    for (int i = n_total - n_tail; i < n_total; ++i) {
+        energy_last += fabsf(pcmf32[i]);
+    }
+    energy_last /= n_tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    const bool tail_is_loud = energy_last > vad_thold*energy_all;
+
+    return !tail_is_loud;
+}
+
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

@ -241,10 +557,22 @@ int main(int argc, char ** argv) {
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }

-        if (!is_running) {
-            break;
+            if (!is_running) {
+                break;
+            }
        }

        // delay
@ -255,7 +583,7 @@ int main(int argc, char ** argv) {
        {
            audio.get(2000, pcmf32_cur);

-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
+            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                audio.get(params.voice_ms, pcmf32_cur);
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.

-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
@ -2,7 +2,6 @@ package com.whispercppdemo.ui.main

 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
-import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable
@ -20,7 +19,6 @@ fun MainScreen(viewModel: MainScreenViewModel) {
        canTranscribe = viewModel.canTranscribe,
        isRecording = viewModel.isRecording,
        messageLog = viewModel.dataLog,
-        onBenchmarkTapped = viewModel::benchmark,
        onTranscribeSampleTapped = viewModel::transcribeSample,
        onRecordTapped = viewModel::toggleRecord
    )
@ -32,7 +30,6 @@ private fun MainScreen(
    canTranscribe: Boolean,
    isRecording: Boolean,
    messageLog: String,
-    onBenchmarkTapped: () -> Unit,
    onTranscribeSampleTapped: () -> Unit,
    onRecordTapped: () -> Unit
 ) {
@ -48,11 +45,8 @@ private fun MainScreen(
                .padding(innerPadding)
                .padding(16.dp)
        ) {
-            Column(verticalArrangement = Arrangement.SpaceBetween) {
-                Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
-                    BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
-                    TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
-                }
+            Row(horizontalArrangement = Arrangement.SpaceBetween) {
+                TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
                RecordButton(
                    enabled = canTranscribe,
                    isRecording = isRecording,
@ -66,16 +60,7 @@ private fun MainScreen(

@Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
-        Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
-    }
-}
-
-@Composable
-private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
-    Button(onClick = onClick, enabled = enabled) {
-        Text("Benchmark")
-    }
+    Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
 }

@Composable
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -41,15 +41,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {

    init {
        viewModelScope.launch {
-            printSystemInfo()
            loadData()
        }
    }

-    private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
-    }
-
    private suspend fun loadData() {
        printMessage("Loading data...\n")
        try {
@ -86,29 +81,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
    }

-    fun benchmark() = viewModelScope.launch {
-        runBenchmark(6)
-    }
-
    fun transcribeSample() = viewModelScope.launch {
        transcribeAudio(getFirstSample())
    }

-    private suspend fun runBenchmark(nthreads: Int) {
-        if (!canTranscribe) {
-            return
-        }
-
-        canTranscribe = false
-
-        printMessage("Running benchmark. This will take minutes...\n")
-        whisperContext?.benchMemory(nthreads)?.let{ printMessage(it) }
-        printMessage("\n")
-        whisperContext?.benchGgmlMulMat(nthreads)?.let{ printMessage(it) }
-
-        canTranscribe = true
-    }
-
    private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
        samplesPath.listFiles()!!.first()
    }
@ -138,14 +114,11 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        canTranscribe = false

        try {
-            printMessage("Reading wave samples... ")
+            printMessage("Reading wave samples...\n")
            val data = readAudioSamples(file)
-            printMessage("${data.size / (16000 / 1000)} ms\n")
            printMessage("Transcribing data...\n")
-            val start = System.currentTimeMillis()
            val text = whisperContext?.transcribeData(data)
-            val elapsed = System.currentTimeMillis() - start
-            printMessage("Done ($elapsed ms): $text\n")
+            printMessage("Done: $text\n")
        } catch (e: Exception) {
            Log.w(LOG_TAG, e)
            printMessage("${e.localizedMessage}\n")
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -27,14 +27,6 @@ class WhisperContext private constructor(private var ptr: Long) {
        }
    }

-    suspend fun benchMemory(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchMemcpy(nthreads)
-    }
-
-    suspend fun benchGgmlMulMat(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchGgmlMulMat(nthreads)
-    }
-
    suspend fun release() = withContext(scope.coroutineContext) {
        if (ptr != 0L) {
            WhisperLib.freeContext(ptr)
@ -74,10 +66,6 @@ class WhisperContext private constructor(private var ptr: Long) {
            }
            return WhisperContext(ptr)
        }
-
-        fun getSystemInfo(): String {
-            return WhisperLib.getSystemInfo()
-        }
    }
 }

@ -86,7 +74,6 @@ private class WhisperLib {
        init {
            Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
            var loadVfpv4 = false
-            var loadV8fp16 = false
            if (isArmEabiV7a()) {
                // armeabi-v7a needs runtime detection support
                val cpuInfo = cpuInfo()
@ -97,24 +84,11 @@ private class WhisperLib {
                        loadVfpv4 = true
                    }
                }
-            } else if (isArmEabiV8a()) {
-                // ARMv8.2a needs runtime detection support
-                val cpuInfo = cpuInfo()
-                cpuInfo?.let {
-                    Log.d(LOG_TAG, "CPU info: $cpuInfo")
-                    if (cpuInfo.contains("fphp")) {
-                        Log.d(LOG_TAG, "CPU supports fp16 arithmetic")
-                        loadV8fp16 = true
-                    }
-                }
            }

            if (loadVfpv4) {
                Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
                System.loadLibrary("whisper_vfpv4")
-            } else if (loadV8fp16) {
-                Log.d(LOG_TAG, "Loading libwhisper_v8fp16_va.so")
-                System.loadLibrary("whisper_v8fp16_va")
            } else {
                Log.d(LOG_TAG, "Loading libwhisper.so")
                System.loadLibrary("whisper")
@ -129,9 +103,6 @@ private class WhisperLib {
        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
        external fun getTextSegmentCount(contextPtr: Long): Int
        external fun getTextSegment(contextPtr: Long, index: Int): String
-        external fun getSystemInfo(): String
-        external fun benchMemcpy(nthread: Int): String
-        external fun benchGgmlMulMat(nthread: Int): String
    }
 }

@ -139,10 +110,6 @@ private fun isArmEabiV7a(): Boolean {
    return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
 }

-private fun isArmEabiV8a(): Boolean {
-    return Build.SUPPORTED_ABIS[0].equals("arm64-v8a")
-}
-
 private fun cpuInfo(): String? {
    return try {
        File("/proc/cpuinfo").inputStream().bufferedReader().use {
--- a/examples/whisper.android/app/src/main/jni/whisper/Android.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Android.mk
@ -12,15 +12,4 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
 	LOCAL_CFLAGS += -mfpu=neon-vfpv4
 	include $(BUILD_SHARED_LIBRARY)
-endif
-
-ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-	include $(CLEAR_VARS)
-	LOCAL_MODULE    := libwhisper_v8fp16_va
-	include $(LOCAL_PATH)/Whisper.mk
-	# Allow building NEON FMA code.
-	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
-	LOCAL_CFLAGS += -march=armv8.2-a+fp16
-	include $(BUILD_SHARED_LIBRARY)
-endif
-
+endif
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -6,7 +6,6 @@
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
-#include "ggml.h"

 #define UNUSED(x) (void)(x)
 #define TAG "JNI"
@ -214,30 +213,4 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
    const char *text = whisper_full_get_segment_text(context, index);
    jstring string = (*env)->NewStringUTF(env, text);
    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
-        JNIEnv *env, jobject thiz
-) {
-    UNUSED(thiz);
-    const char *sysinfo = whisper_print_system_info();
-    jstring string = (*env)->NewStringUTF(env, sysinfo);
-    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
-                                                                      jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
-                                                                          jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
-}
+}
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
-
-In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -296,10 +296,6 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -7,9 +7,8 @@ To use:
 2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
 4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
-5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
+5. Select the "release" build configuration under "Run", then deploy and run to your device.

 [^1]: I recommend the tiny, base or small models for running on an iOS device.
-[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.

 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -430,10 +430,6 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;
--- a/examples/yt-wsp.sh
+++ b/examples/yt-wsp.sh
@ -67,6 +67,23 @@ msg() {
    echo >&2 -e "${1-}"
 }

+################################################################################
+# create a temporary directory to work in
+# set the temp_dir and temp_filename variables
+################################################################################
+temp_dir="$(mktemp -d ${SCRIPT_DIR}/tmp.XXXXXX)";
+temp_filename="${temp_dir}/yt-dlp-filename";
+
+################################################################################
+# for now we only take one argument
+# TODO: a for loop
+################################################################################
+source_url="${1}"
+
+
+title_name="";
+
+
 cleanup() {
    local -r clean_me="${1}";

@ -128,20 +145,6 @@ fi

 check_requirements;

-################################################################################
-# create a temporary directory to work in
-# set the temp_dir and temp_filename variables
-################################################################################
-temp_dir="$(mktemp -d ${SCRIPT_DIR}/tmp.XXXXXX)";
-temp_filename="${temp_dir}/yt-dlp-filename";
-
-################################################################################
-# for now we only take one argument
-# TODO: a for loop
-################################################################################
-source_url="${1}"
-title_name="";
-
 msg "Downloading VOD...";

 ################################################################################
@ -196,6 +199,6 @@ ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
    -c:s mov_text \
    -y "${title_name}-res.mp4";

-#cleanup "${temp_dir}";
+cleanup "${temp_dir}";

 msg "Done! Your finished file is ready: ${title_name}-res.mp4";
--- a/extra/bench-wts.sh
+++ b/extra/bench-wts.sh
@ -1,70 +0,0 @@
-# Benchmark word-level timestamps for different models
-#
-# This script takes two arguments
-# - an audio file
-# - [optional] path to a font file
-
-# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
-
-if [ -z "$1" ]; then
-    echo "Usage: $0 <audio file> [font file]"
-    exit 1
-fi
-
-#TODO: Make this a command line parameter
-#models="base small large"
-#models="tiny.en tiny base.en base small.en small medium.en medium large-v1 large"
-models="tiny.en base.en small.en medium.en large"
-
-DURATION=$(ffprobe -i $1 -show_entries format=duration -v quiet -of csv="p=0")
-DURATION=$(printf "%.2f" $DURATION)
-echo "Input file duration: ${DURATION}s"
-
-for model in $models; do
-    echo "Running $model"
-    COMMAND="./main -m models/ggml-$model.bin -owts -f $1 -of $1.$model"
-
-    if [ ! -z "$2" ]; then
-        COMMAND="$COMMAND -fp $2"
-    fi
-    #TODO: Surface errors better
-    # TIMEFMT is for zsh, TIMEFORMAT is for bash
-    EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time $COMMAND >/dev/null 2>&1; } 2>&1)
-
-    # Slightly different formats between zsh and bash
-    if [ "${EXECTIME: -1}" == "s" ]; then
-        EXECTIME=${EXECTIME::-1}
-    fi
-
-    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
-    RATIO=$(printf "%.2f" $RATIO)
-
-    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"
-
-    # If the file already exists, delete it
-    if [ -f $1.mp4 ]; then
-        rm $1.mp4
-    fi
-
-    bash $1.$model.wts >/dev/null 2>&1
-    mv $1.mp4 $1.$model.mp4
-
-    ffmpeg -y -f lavfi -i color=c=black:s=1200x50:d=$DURATION -vf "drawtext=fontfile=$2:fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey" $1.$model.info.mp4 >/dev/null 2>&1
-done
-
-COMMAND="ffmpeg -y"
-for model in $models; do
-    COMMAND="$COMMAND -i $1.$model.info.mp4 -i $1.$model.mp4"
-done
-COMMAND="$COMMAND -filter_complex \""
-COUNT=0
-for model in $models; do
-    COMMAND="$COMMAND[${COUNT}:v][$(($COUNT+1)):v]"
-    COUNT=$((COUNT+2))
-done
-COMMAND="$COMMAND vstack=inputs=${COUNT}[v]\" -map \"[v]\" -map 1:a $1.all.mp4 >/dev/null 2>&1"
-
-echo $COMMAND
-
-# Run the command
-eval $COMMAND
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -198,8 +198,6 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@ -228,9 +226,7 @@ enum ggml_op {
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
-    GGML_OP_SILU,
    GGML_OP_NORM, // normalize
-    GGML_OP_RMS_NORM,

    GGML_OP_MUL_MAT,

@ -330,10 +326,7 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);

-int    ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
-
+size_t ggml_type_size   (enum ggml_type type);
 size_t ggml_element_size(const struct ggml_tensor * tensor);

 struct ggml_context * ggml_init(struct ggml_init_params params);
@ -343,9 +336,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
@ -476,20 +466,12 @@ struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
@ -744,13 +726,6 @@ enum ggml_opt_result ggml_opt(
        struct ggml_opt_params params,
        struct ggml_tensor * f);

-//
-// quantization
-//
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-
 //
 // system info
 //
--- a/models/README.md
+++ b/models/README.md
@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
 the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
 script to download the already converted models. Currently, they are hosted on the following locations:

- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:

 A third option to obtain the model files is to download them from Hugging Face:

-https://huggingface.co/ggerganov/whisper.cpp/tree/main
+https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main

 ## Available models

--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@ -79,11 +79,11 @@ dir_model   = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out     = sys.argv[3]

-with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
+with open(dir_model + "/vocab.json", "r") as f:
    encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
+with open(dir_model + "/added_tokens.json", "r") as f:
    encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r", encoding="utf8") as f:
+with open(dir_model + "/config.json", "r") as f:
    hparams = json.load(f)

 model = WhisperForConditionalGeneration.from_pretrained(dir_model)
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@ -1,82 +0,0 @@
-#!/bin/bash
-
-# This script downloads Whisper model files that have already been converted to Core ML format.
-# This way you don't have to convert them yourself.
-
-src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
-pfx="resolve/main/ggml"
-
-# get the path of this script
-function get_script_path() {
-    if [ -x "$(command -v realpath)" ]; then
-        echo "$(dirname $(realpath $0))"
-    else
-        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
-        echo "$ret"
-    fi
-}
-
-models_path="$(get_script_path)"
-
-# Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download Core ML model
-
-printf "Downloading Core ML model $model from '$src' ...\n"
-
-cd $models_path
-
-if [ -f "ggml-$model.mlmodel" ]; then
-    printf "Model $model already exists. Skipping download.\n"
-    exit 0
-fi
-
-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
-elif [ -x "$(command -v curl)" ]; then
-    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
-
-
-if [ $? -ne 0 ]; then
-    printf "Failed to download Core ML model $model \n"
-    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
-    exit 1
-fi
-
-printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
-printf "Run the following command to compile it:\n\n"
-printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
-printf "You can now use it like this:\n\n"
-printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
-printf "\n"
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
  goto :eof
 )

-PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
+PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"

 if %ERRORLEVEL% neq 0 (
  echo Failed to download ggml model %model%
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -6,7 +6,7 @@
 #src="https://ggml.ggerganov.com"
 #pfx="ggml-model-whisper"

-src="https://huggingface.co/ggerganov/whisper.cpp"
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"

 # get the path of this script
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -66,7 +66,6 @@ extern "C" {
    //

    struct whisper_context;
-    struct whisper_state;

    typedef int whisper_token;

@ -102,20 +101,11 @@ extern "C" {
    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);

-    // These are the same as the above, but the internal state of the context is not allocated automatically
-    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
-
-    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
-
-    // Frees all allocated memory
-    WHISPER_API void whisper_free      (struct whisper_context * ctx);
-    WHISPER_API void whisper_free_state(struct whisper_state * state);
+    // Frees all memory allocated by the model.
+    WHISPER_API void whisper_free(struct whisper_context * ctx);

    // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
+    // The resulting spectrogram is stored inside the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel(
            struct whisper_context * ctx,
@ -123,30 +113,17 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);

-    WHISPER_API int whisper_pcm_to_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * samples,
-                               int   n_samples,
-                               int   n_threads);
-
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
+    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. 
+    // The resulting spectrogram is stored inside the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
+        struct whisper_context* ctx,
+        const float* samples,
+        int   n_samples,
+        int   n_threads);

-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);

-    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
+    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
    // Returns 0 on success
@ -156,14 +133,7 @@ extern "C" {
                               int   n_len,
                               int   n_mel);

-    WHISPER_API int whisper_set_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * data,
-                               int   n_len,
-                               int   n_mel);
-
-    // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
+    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
    // offset can be used to specify the offset of the first frame in the spectrogram.
    // Returns 0 on success
@ -172,12 +142,6 @@ extern "C" {
                               int   offset,
                               int   n_threads);

-    WHISPER_API int whisper_encode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset,
-                               int   n_threads);
-
    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
    // Make sure to call whisper_encode() first.
    // tokens + n_tokens is the provided context for the decoder.
@ -191,14 +155,6 @@ extern "C" {
                               int   n_past,
                               int   n_threads);

-    WHISPER_API int whisper_decode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-               const whisper_token * tokens,
-                               int   n_tokens,
-                               int   n_past,
-                               int   n_threads);
-
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -234,44 +190,20 @@ extern "C" {
                               int   n_threads,
                             float * lang_probs);

-    WHISPER_API int whisper_lang_auto_detect_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset_ms,
-                               int   n_threads,
-                             float * lang_probs);
-
-    WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
-    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
-    WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
-    WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
-
-    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_len          (struct whisper_context * ctx); // mel length
+    WHISPER_API int whisper_n_vocab        (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_text_ctx     (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_audio_ctx    (struct whisper_context * ctx);
+    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);

    // Token logits obtained from the last call to whisper_decode()
    // The logits for the last token are stored in the last row
    // Rows: n_tokens
    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
-    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
+    WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
-    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
-

    // Special tokens
    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
@ -286,7 +218,7 @@ extern "C" {
    WHISPER_API whisper_token whisper_token_translate (void);
    WHISPER_API whisper_token whisper_token_transcribe(void);

-    // Performance information from the default state.
+    // Performance information
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

@ -304,23 +236,12 @@ extern "C" {
    // Text segment callback
    // Called on every newly generated text segment
    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);

    // Encoder begin callback
    // If not NULL, called before the encoder starts
    // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
-
-    // Logits filter callback
-    // Can be used to modify the logits before sampling
-    // If not NULL, called after applying temperature to logits
-    typedef void (*whisper_logits_filter_callback)(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-          const whisper_token_data * tokens,
-                               int   n_tokens,
-                             float * logits,
-                              void * user_data);
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

    // Parameters for the whisper_full() function
    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
@ -394,16 +315,11 @@ extern "C" {
        // called each time before the encoder starts
        whisper_encoder_begin_callback encoder_begin_callback;
        void * encoder_begin_callback_user_data;
-
-        // called by each decoder to filter obtained logits
-        whisper_logits_filter_callback logits_filter_callback;
-        void * logits_filter_callback_user_data;
    };

    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);

    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
-    // Not thread safe for same context
    // Uses the specified decoding strategy to obtain the text.
    WHISPER_API int whisper_full(
                struct whisper_context * ctx,
@ -411,16 +327,7 @@ extern "C" {
                           const float * samples,
                                   int   n_samples);

-    WHISPER_API int whisper_full_with_state(
-                struct whisper_context * ctx,
-                  struct whisper_state * state,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples);
-
-    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
-    // Result is stored in the default state of the context
-    // Not thread safe if executed in parallel on the same context.
+    // Split the input audio in chunks and process each chunk separately using whisper_full()
    // It seems this approach can offer some speedup in some cases.
    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
    WHISPER_API int whisper_full_parallel(
@ -430,56 +337,40 @@ extern "C" {
                                   int   n_samples,
                                   int   n_processors);

-    // Number of generated text segments
+    // Number of generated text segments.
    // A segment can be a few words, a sentence, or even a paragraph.
-    WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
-    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
+    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);

-    // Language id associated with the context's default state
+    // Language id associated with the current context
    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);

-    // Language id associated with the provided state
-    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
+    // Get the start and end time of the specified segment.
+    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);

-    // Get the start and end time of the specified segment
-    WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
+    // Get the text of the specified segment.
+    WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);

-    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
+    // Get number of tokens in the specified segment.
+    WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);

-    // Get the text of the specified segment
-    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
+    // Get the token text or the token id of the specified token in the specified segment.
+    WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);

-    // Get number of tokens in the specified segment
-    WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
-
-    // Get the token text of the specified token in the specified segment
-    WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
-
-    WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    // Get token data for the specified token in the specified segment
+    // Get token data for the specified token in the specified segment.
    // This contains probabilities, timestamps, etc.
-    WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
+    WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);

-    // Get the probability of the specified token in the specified segment
-    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
+    // Get the probability of the specified token in the specified segment.
+    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

    ////////////////////////////////////////////////////////////////////////////

    // Temporary helpers needed for exposing ggml interface

    WHISPER_API int whisper_bench_memcpy(int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);

 #ifdef __cplusplus
 }