command : work in progress, improve guided decoding

2025-06-25 09:31:44 +00:00 · 2023-02-19 19:39:05 +02:00
50 changed files with 1155 additions and 6914 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,5 @@
 *.o
 *.a
-*.mlmodel
-*.mlmodelc
 .cache/
 .vs/
 .vscode/
@ -12,7 +10,6 @@ build-em/
 build-debug/
 build-release/
 build-static/
-build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

@ -34,5 +31,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

 extra/bench-gg.txt
-
-*.mlmodel*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.2.1)
+project(whisper.cpp VERSION 1.2.0)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -54,8 +54,6 @@ if (APPLE)
    option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
    option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
-
-    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@ -88,33 +86,16 @@ endif()

 find_package(Threads REQUIRED)

-# on APPLE
-if (APPLE)
-    # include Accelerate framework
-    if (NOT WHISPER_NO_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
+# on APPLE - include Accelerate framework
+if (APPLE AND NOT WHISPER_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")

-        if (ACCELERATE_FRAMEWORK)
-            message(STATUS "Accelerate framework found")
-
-            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-        else()
-            message(WARNING "Accelerate framework not found")
-        endif()
-    endif()
-
-    if (WHISPER_COREML)
-        find_library(FOUNDATION_FRAMEWORK Foundation)
-        find_library(COREML_FRAMEWORK CoreML)
-
-        if (COREML_FRAMEWORK)
-            message(STATUS "CoreML framework found")
-
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
-        else()
-            message(WARNING "CoreML framework not found")
-        endif()
+        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
    endif()
 endif()

@ -191,9 +172,7 @@ else()
            if(NOT WHISPER_NO_FMA)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
            endif()
-            if(NOT WHISPER_NO_F16C)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-            endif()
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
        endif()
    endif()
 endif()
@ -202,33 +181,6 @@ if (WHISPER_PERF)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()

-#
-# whisper.coreml - Core ML support
-#
-
-if (WHISPER_COREML)
-    set(TARGET whisper.coreml)
-
-    add_library(${TARGET}
-        coreml/whisper-encoder.h
-        coreml/whisper-encoder.mm
-        coreml/whisper-encoder-impl.h
-        coreml/whisper-encoder-impl.m
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
-
-    set_target_properties(${TARGET} PROPERTIES
-        COMPILE_FLAGS "-fobjc-arc"
-        )
-endif()
-
 #
 # whisper - this is the main library of the project
 #
@ -248,10 +200,6 @@ target_include_directories(${TARGET} PUBLIC
    .
    )

-if (WHISPER_COREML)
-    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
-endif()
-
 if (MSVC)
    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

--- a/Makefile
+++ b/Makefile
@ -30,16 +30,10 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

-# ref: https://github.com/ggerganov/whisper.cpp/issues/37
-ifneq ($(wildcard /usr/include/musl/*),)
-	CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
-	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
-endif
-
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -138,10 +132,6 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
-ifdef WHISPER_COREML
-	CXXFLAGS += -DWHISPER_USE_COREML
-	LDFLAGS  += -framework Foundation -framework CoreML
-endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@ -151,8 +141,6 @@ ifdef WHISPER_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS += -mcpu=native
-	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, 2, 3
@ -194,23 +182,11 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o

-ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
-else
-whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
-	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
+libwhisper.a: ggml.o whisper.o
+	$(AR) rcs libwhisper.a ggml.o whisper.o

-whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
-	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
-
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
-endif
-
-libwhisper.a: ggml.o $(WHISPER_OBJ)
-	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
-
-libwhisper.so: ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
+libwhisper.so: ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
 	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@ -224,21 +200,21 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

-main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)

-bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

 #
 # Audio samples
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -433,19 +433,6 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 ---

-## Video comparison of different models
-
-Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
-
-```java
-./extra/bench-wts.sh samples/jfk.wav
-ffplay ./samples/jfk.wav.all.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
-
----
-
 ## Benchmarks

 In order to have an objective comparison of the performance of the inference across different system configurations,
@ -466,7 +453,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:

-- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -476,17 +463,13 @@ in [models](models).

 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
-  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
 - [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
-- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
-  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
-  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
-- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
+- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

 ## Examples

--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -17,9 +17,9 @@ import (
 // CONSTANTS

 const (
-	srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
-	srcExt  = ".bin"                                                      // Filename extension
-	bufSize = 1024 * 64                                                   // Size of the buffer used for downloading the model
+	srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
+	srcExt  = ".bin"                                                               // Filename extension
+	bufSize = 1024 * 64                                                            // Size of the buffer used for downloading the model
 )

 var (
--- a/bindings/go/pkg/whisper/model.go
+++ b/bindings/go/pkg/whisper/model.go
@ -94,7 +94,6 @@ func (model *model) NewContext() (Context, error) {
 	params.SetPrintRealtime(false)
 	params.SetPrintTimestamps(false)
 	params.SetThreads(runtime.NumCPU())
-	params.SetNoContext(true)

 	// Return new context
 	return newContext(model, params)
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -20,7 +20,7 @@ extern bool callEncoderBegin(void* user_data);
 // Text segment callback
 // Called on every newly generated text segment
 // Use the whisper_full_...() functions to obtain the text segments
-static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
+static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        callNewSegment(user_data, n_new);
    }
@ -29,7 +29,7 @@ static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_s
 // Encoder begin callback
 // If not NULL, called before the encoder starts
 // If it returns false, the computation is aborted
-static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
+static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        return callEncoderBegin(user_data);
    }
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.2.1",
+  "version": "1.2.0",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@ -199,7 +199,7 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
  {
    static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
      bool is_aborted = *(bool*)user_data;
      return !is_aborted;
    };
--- a/coreml/whisper-encoder-impl.h
+++ b/coreml/whisper-encoder-impl.h
@ -1,142 +0,0 @@
-//
-// CoremlEncoder.h
-//
-// This file was automatically generated and should not be edited.
-//
-
-#import <Foundation/Foundation.h>
-#import <CoreML/CoreML.h>
-#include <stdint.h>
-#include <os/log.h>
-
-NS_ASSUME_NONNULL_BEGIN
-
-
-/// Model Prediction Input Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
-
-/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
-- (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Model Prediction Output Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
-
-/// output as multidimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * output;
-- (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Class for model loading and prediction
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoder : NSObject
-@property (readonly, nonatomic, nullable) MLModel * model;
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle;
-
-/**
-    Initialize CoremlEncoder instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
-- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-*/
-- (nullable instancetype)init;
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Construct CoremlEncoder instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
-
-/**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the convenience interface
-    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
-- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Batch prediction
-    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<CoremlEncoderOutput *>
-*/
-- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-@end
-
-NS_ASSUME_NONNULL_END
--- a/coreml/whisper-encoder-impl.m
+++ b/coreml/whisper-encoder-impl.m
@ -1,197 +0,0 @@
-//
-// CoremlEncoder.m
-//
-// This file was automatically generated and should not be edited.
-//
-
-#if !__has_feature(objc_arc)
-#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
-#endif
-
-#import "whisper-encoder-impl.h"
-
-@implementation CoremlEncoderInput
-
-- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
-    self = [super init];
-    if (self) {
-        _melSegment = melSegment;
-    }
-    return self;
-}
-
-- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"melSegment"]];
-}
-
-- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"melSegment"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
-    }
-    return nil;
-}
-
-@end
-
-@implementation CoremlEncoderOutput
-
-- (instancetype)initWithOutput:(MLMultiArray *)output {
-    self = [super init];
-    if (self) {
-        _output = output;
-    }
-    return self;
-}
-
-- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"output"]];
-}
-
-- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"output"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.output];
-    }
-    return nil;
-}
-
-@end
-
-@implementation CoremlEncoder
-
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
-    return [NSURL fileURLWithPath:assetPath];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
-- (instancetype)initWithMLModel:(MLModel *)model {
-    self = [super init];
-    if (!self) { return nil; }
-    _model = model;
-    if (_model == nil) { return nil; }
-    return self;
-}
-
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-*/
-- (nullable instancetype)init {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
-}
-
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Construct CoremlEncoder instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
-    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
-              configuration:configuration
-          completionHandler:handler];
-}
-
-
-/**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
-    [MLModel loadContentsOfURL:modelURL
-                 configuration:configuration
-             completionHandler:^(MLModel *model, NSError *error) {
-        if (model != nil) {
-            CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
-            handler(typedModel, nil);
-        } else {
-            handler(nil, error);
-        }
-    }];
-}
-
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
-}
-
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
-    if (!outFeatures) { return nil; }
-    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
-}
-
-- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
-    return [self predictionFromFeatures:input_ error:error];
-}
-
-- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
-    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
-    if (!outBatch) { return nil; }
-    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
-    for (NSInteger i = 0; i < outBatch.count; i++) {
-        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
-        [results addObject:result];
-    }
-    return results;
-}
-
-@end
--- a/coreml/whisper-encoder.h
+++ b/coreml/whisper-encoder.h
@ -1,22 +0,0 @@
-// Wrapper of the Core ML Whisper Encoder model
-//
-// Code is derived from the work of Github user @wangchou
-// ref: https://github.com/wangchou/callCoreMLFromCpp
-
-#if __cplusplus
-extern "C" {
-#endif
-
-struct whisper_coreml_context;
-
-struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
-void whisper_coreml_free(struct whisper_coreml_context * ctx);
-
-void whisper_coreml_encode(
-        const whisper_coreml_context * ctx,
-                               float * mel,
-                               float * out);
-
-#if __cplusplus
-}
-#endif
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -1,61 +0,0 @@
-#import "coreml/whisper-encoder.h"
-#import "coreml/whisper-encoder-impl.h"
-
-#import <CoreML/CoreML.h>
-
-#include <stdlib.h>
-
-#if __cplusplus
-extern "C" {
-#endif
-
-struct whisper_coreml_context {
-    const void * data;
-};
-
-struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
-    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
-
-    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
-
-    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
-
-    if (data == NULL) {
-        return NULL;
-    }
-
-    whisper_coreml_context * ctx = new whisper_coreml_context;
-
-    ctx->data = data;
-
-    return ctx;
-}
-
-void whisper_coreml_free(struct whisper_coreml_context * ctx) {
-    CFRelease(ctx->data);
-    delete ctx;
-}
-
-void whisper_coreml_encode(
-        const whisper_coreml_context * ctx,
-                               float * mel,
-                               float * out) {
-    MLMultiArray * inMultiArray = [
-        [MLMultiArray alloc] initWithDataPointer: mel
-                                           shape: @[@1, @80, @3000]
-                                        dataType: MLMultiArrayDataTypeFloat32
-                                         strides: @[@(240000), @(3000), @1]
-                                     deallocator: nil
-                                           error: nil
-    ];
-
-    CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
-
-    MLMultiArray * outMA = outCoreML.output;
-
-    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
-}
-
-#if __cplusplus
-}
-#endif
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -63,5 +63,4 @@ else()
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(talk)
-    add_subdirectory(talk.llama)
 endif()
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -72,7 +72,7 @@ int timestamp_to_sample(int64_t t, int n_samples) {
    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
 }

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
+void whisper_print_segment(struct whisper_context * ctx, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -250,7 +250,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

            // this callback is called on each new segment
            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
+                wparams.new_segment_callback           = whisper_print_segment;
                wparams.new_segment_callback_user_data = &user_data;
            }

@ -260,7 +260,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -292,64 +292,51 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    return 0;
 }

-class Worker : public Napi::AsyncWorker {
- public:
-  Worker(Napi::Function& callback, whisper_params params)
-      : Napi::AsyncWorker(callback), params(params) {}
-
-  void Execute() override {
-    run(params, result);
-  }
-
-  void OnOK() override {
-    Napi::HandleScope scope(Env());
-    Napi::Object res = Napi::Array::New(Env(), result.size());
-    for (uint64_t i = 0; i < result.size(); ++i) {
-      Napi::Object tmp = Napi::Array::New(Env(), 3);
-      for (uint64_t j = 0; j < 3; ++j) {
-        tmp[j] = Napi::String::New(Env(), result[i][j]);
-      }
-      res[i] = tmp;
+// Synchronous entry point exposed to JavaScript as `whisper(params)`.
+//
+// Expects info[0] to be an object with the string properties "language", "model"
+// and "fname_inp". Runs the model on the calling thread via run() and returns an
+// array with one [t0, t1, content] string triple per transcribed segment.
+// If the argument is missing or is not an object, a TypeError is raised on the
+// JavaScript side and an empty value is returned.
+Napi::Object whisper(const Napi::CallbackInfo& info) {
+    Napi::Env env = info.Env();
+    if (info.Length() <= 0 || !info[0].IsObject()) {
+        Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
+        // ThrowAsJavaScriptException() only marks the exception as pending, so bail out
+        // explicitly instead of going on to read properties from an invalid argument
+        return Napi::Object();
    }
-    Callback().Call({Env().Null(), res});
-  }
+    whisper_params params;
+    std::vector<std::vector<std::string>> result;

- private:
-  whisper_params params;
-  std::vector<std::vector<std::string>> result;
-};
+    Napi::Object whisper_params = info[0].As<Napi::Object>();
+    std::string language = whisper_params.Get("language").As<Napi::String>();
+    std::string model = whisper_params.Get("model").As<Napi::String>();
+    std::string input = whisper_params.Get("fname_inp").As<Napi::String>();

+    params.language = language;
+    params.model = model;
+    params.fname_inp.emplace_back(input);

+    // run model
+    run(params, result);

-Napi::Value whisper(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  if (info.Length() <= 0 || !info[0].IsObject()) {
-    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
-  }
-  whisper_params params;
+    // debug: dump every [t0, t1, content] triple to stderr
+    fprintf(stderr, "RESULT:\n");
+    for (auto sentence:result) {
+        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
+                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
+    }

-  Napi::Object whisper_params = info[0].As<Napi::Object>();
-  std::string language = whisper_params.Get("language").As<Napi::String>();
-  std::string model = whisper_params.Get("model").As<Napi::String>();
-  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
+    // convert the result into a JavaScript array of 3-element string arrays
+    Napi::Object res = Napi::Array::New(env, result.size());
+    for (uint64_t i = 0; i < result.size(); ++i) {
+        Napi::Object tmp = Napi::Array::New(env, 3);
+        for (uint64_t j = 0; j < 3; ++j) {
+            tmp[j] = Napi::String::New(env, result[i][j]);
+        }
+        res[i] = tmp;
+    }

-  params.language = language;
-  params.model = model;
-  params.fname_inp.emplace_back(input);
-
-  Napi::Function callback = info[1].As<Napi::Function>();
-  Worker* worker = new Worker(callback, params);
-  worker->Queue();
-  return env.Undefined();
+    return res;
 }


 Napi::Object Init(Napi::Env env, Napi::Object exports) {
-  exports.Set(
-      Napi::String::New(env, "whisper"),
-      Napi::Function::New(env, whisper)
-  );
-  return exports;
+    exports.Set(
+            Napi::String::New(env, "whisper"),
+            Napi::Function::New(env, whisper)
+    );
+    return exports;
 }

 NODE_API_MODULE(whisper, Init);
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -1,36 +1,27 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../build/Release/whisper-addon"
-));
-const { promisify } = require("util");
-
-const whisperAsync = promisify(whisper);
+const path = require('path');
+const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));

 const whisperParams = {
-  language: "en",
-  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
-  fname_inp: "../../samples/jfk.wav",
+    language: 'en',
+    model: path.join(__dirname, '../../models/ggml-base.en.bin'),
+    fname_inp: '',
 };

 const arguments = process.argv.slice(2);
 const params = Object.fromEntries(
-  arguments.reduce((pre, item) => {
-    if (item.startsWith("--")) {
-      return [...pre, item.slice(2).split("=")];
-    }
-    return pre;
-  }, [])
+    arguments.reduce((pre, item) => {
+        if (item.startsWith("--")) {
+            return [...pre, item.slice(2).split("=")];
+        }
+        return pre;
+    }, []),
 );

 for (const key in params) {
-  if (whisperParams.hasOwnProperty(key)) {
-    whisperParams[key] = params[key];
-  }
+    if (whisperParams.hasOwnProperty(key)) {
+        whisperParams[key] = params[key];
+    }
 }

-console.log("whisperParams =", whisperParams);
-
-whisperAsync(whisperParams).then((result) => {
-  console.log(`Result from whisper: ${result}`);
-});
+console.log('whisperParams =', whisperParams);
+console.log(whisper(whisperParams));
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -109,6 +109,73 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

+// User data handed to whisper_logits_filter() through
+// wparams.logits_filter_callback_user_data. Both pointers are non-owning.
+struct whisper_logits_filter_user_data {
+    // the command strings; not read by whisper_logits_filter() itself
+    std::vector<std::string>                * allowed_commands;
+    // one token sequence per entry; the filter indexes it as allowed_tokens[command][position]
+    // NOTE(review): presumably allowed_tokens[i] is the tokenization of allowed_commands[i] - confirm at the call site
+    std::vector<std::vector<whisper_token>> * allowed_tokens;
+};
+
+// Logits filter callback that restricts decoding to a fixed set of token sequences.
+//
+//   ctx       - whisper context, used for token <-> string conversion and whisper_token_beg()
+//   tokens    - tokens passed in by the caller (n_tokens entries)
+//   n_tokens  - number of entries in tokens; nothing is filtered when it is 0
+//   logits    - logits indexed by token id, modified in place
+//   user_data - pointer to a whisper_logits_filter_user_data
+//
+// For every allowed sequence that has at least n_tokens entries, the token at
+// position n_tokens - 1 is collected into a pool. If the pool is not empty, every
+// logit below whisper_token_beg() is set to -INFINITY and the pooled tokens get
+// their original logits back.
+//
+// NOTE(review): the pool is built from the position alone - it is not checked that
+//               the tokens decoded so far match the prefix of the allowed sequence
+void whisper_logits_filter(
+            struct whisper_context * ctx,
+          const whisper_token_data * tokens,
+                               int   n_tokens,
+                             float * logits,
+                              void * user_data){
+    const auto & allowed_tokens = *((whisper_logits_filter_user_data *) user_data)->allowed_tokens;
+
+    // debug: dump the tokens received so far
+    // logits is indexed by token id, so look the value up through tokens[i].id -
+    // indexing with the sequence position i would print an unrelated entry
+    printf("n_tokens = %d\n", n_tokens);
+    for (int i = 0; i < n_tokens; i++) {
+        printf(" - '%s' (%.2f)\n", whisper_token_to_str(ctx, tokens[i].id), logits[tokens[i].id]);
+    }
+
+    if (n_tokens == 0) {
+        return;
+    }
+
+    // candidate tokens for the current position together with their unfiltered logits
+    std::vector<std::pair<whisper_token, float>> pool;
+    for (int i = 0; i < (int) allowed_tokens.size(); i++) {
+        const int n = (int) allowed_tokens[i].size();
+        if (n_tokens > n) {
+            // this sequence is too short to provide a token at the current position
+            continue;
+        }
+
+        const whisper_token id = allowed_tokens[i][n_tokens - 1];
+        pool.push_back({ id, logits[id] });
+    }
+
+    if (pool.empty()) {
+        // no sequence reaches this position - leave the logits untouched
+        return;
+    }
+
+    printf("applying logits filter, pool size = %d\n", (int) pool.size());
+
+    const int ibeg = whisper_token_beg(ctx);
+
+    // the two sums are only reported below; they do not influence the filtered logits
+    double sum_all = 0.0;
+    for (int i = 0; i < ibeg; ++i) {
+        if (logits[i] == -INFINITY) {
+            continue;
+        }
+        sum_all += logits[i];
+    }
+
+    double sum_pool = 0.0;
+    for (int i = 0; i < (int) pool.size(); ++i) {
+        sum_pool += pool[i].second;
+    }
+
+    printf("sum_all = %.2f, sum_pool = %.2f\n", sum_all, sum_pool);
+
+    // mask every token id below ibeg ...
+    for (int i = 0; i < ibeg; ++i) {
+        logits[i] = -INFINITY;
+    }
+
+    // ... and restore the saved logits of the pooled candidates
+    for (int i = 0; i < (int) pool.size(); ++i) {
+        //logits[pool[i].first] = pool[i].second / sum_pool * sum_all;
+        logits[pool[i].first] = pool[i].second;
+        printf(" - '%s' (%.2f)\n", whisper_token_to_str(ctx, pool[i].first), logits[pool[i].first]);
+    }
+}
+
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

@ -131,6 +198,8 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
    wparams.audio_ctx        = params.audio_ctx;
    wparams.speed_up         = params.speed_up;

+    wparams.temperature_inc  = -1.0f;
+
    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }
@ -334,22 +403,31 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
            wparams.translate        = params.translate;
            wparams.no_context       = true;
            wparams.single_segment   = true;
-            wparams.max_tokens       = 1;
+            //wparams.max_tokens       = 1;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;

            wparams.audio_ctx        = params.audio_ctx;
            wparams.speed_up         = params.speed_up;

+            wparams.temperature_inc  = -1.0f;
+
            wparams.prompt_tokens    = k_tokens.data();
            wparams.prompt_n_tokens  = k_tokens.size();

+            whisper_logits_filter_user_data user_data = { &allowed_commands, &allowed_tokens };
+
+            wparams.logits_filter_callback           = whisper_logits_filter;
+            wparams.logits_filter_callback_user_data = &user_data;
+
            // run the transformer and a single decoding pass
            if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
                fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
                break;
            }

+            fprintf(stdout, "%s: text - '%s'\n", __func__, whisper_full_get_segment_text(ctx, 0));
+
            // estimate command probability
            // NOTE: not optimal
            {
@ -436,7 +514,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const

 // always-prompt mode
 // transcribe the voice into text after valid prompt
-int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
+int process_always_prompt(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
    bool is_running = true;
    bool ask_prompt = true;

@ -496,7 +574,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
                const float sim = similarity(prompt, k_prompt);

                //debug
-                //fprintf(stdout, "command size: %i\n", command_length);
+                //fprintf(stdout, "command size: %d, sim: %f\n", (int) command.size(), sim);

                if ((sim > 0.7f) && (command.size() > 0)) {
                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
@ -676,7 +754,7 @@ int main(int argc, char ** argv) {
    if (!params.commands.empty()) {
        ret_val = process_command_list(ctx, audio, params);
    } else if (!params.prompt.empty()) {
-        ret_val = always_prompt_transcription(ctx, audio, params);
+        ret_val = process_always_prompt(ctx, audio, params);
    } else {
        ret_val = process_general_transcription(ctx, audio, params);
    }
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -1,13 +1,13 @@
 #pragma once

-#include <SDL.h>
-#include <SDL_audio.h>
-
 #include <atomic>
 #include <cstdint>
 #include <vector>
 #include <mutex>

+#include <SDL.h>
+#include <SDL_audio.h>
+
 //
 // SDL Audio capture
 //
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -31,7 +31,6 @@ options:
  -osrt,     --output-srt        [false  ] output result in a srt file
  -owts,     --output-words      [false  ] output script for generating karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
-  -oj,       --output-json       [false  ] output result in a JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -73,7 +73,6 @@ struct whisper_params {
    bool output_srt     = false;
    bool output_wts     = false;
    bool output_csv     = false;
-    bool output_jsn     = false;
    bool print_special  = false;
    bool print_colors   = false;
    bool print_progress = false;
@ -81,7 +80,6 @@ struct whisper_params {

    std::string language = "en";
    std::string prompt;
-    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
    std::string model    = "models/ggml-base.en.bin";

    std::vector<std::string> fname_inp = {};
@ -129,9 +127,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
-        else if (arg == "-fp"   || arg == "--font-path")      { params.font_path      = argv[++i]; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
-        else if (arg == "-oj"   || arg == "--output-json")    { params.output_jsn     = true; }
        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
@ -178,9 +174,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
@ -199,7 +193,7 @@ struct whisper_print_user_data {
    const std::vector<std::vector<float>> * pcmf32s;
 };

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
+void whisper_print_segment(struct whisper_context * ctx, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -358,157 +352,28 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,text\n";
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text    << "\"\n";
+        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text    << "\"\n";
    }

    return true;
 }

-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "},\n");
-    };
-
-    auto start_obj = [&](const char *name = nullptr) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end = false) {
-        start_value(name);
-        fout << "\"" << val << (end ? "\"\n" : "\",\n");
-    };
-
-    auto end_value = [&](bool end = false) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end = false) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj();
-        value_s("systeminfo", whisper_print_system_info());
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx));
-            value_b("multilingual", whisper_is_multilingual(ctx));
-            value_i("vocab", whisper_model_n_vocab(ctx));
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx));
-                value_i("state", whisper_model_n_audio_state(ctx));
-                value_i("head", whisper_model_n_audio_head(ctx));
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj();
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx));
-                value_i("state", whisper_model_n_text_state(ctx));
-                value_i("head", whisper_model_n_text_head(ctx));
-                value_i("leyer", whisper_model_n_text_layer(ctx), true);
-            end_obj();
-            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
-        end_obj();
-        start_obj("params");
-            value_s("model", params.model.c_str());
-            value_s("language", params.language.c_str());
-            value_b("translate", params.translate, true);
-        end_obj();
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj();
-        start_arr("transcription");
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                start_obj();
-                    start_obj("timestanps");
-                        value_s("from", to_timestamp(t0, true).c_str());
-                        value_s("to", to_timestamp(t1, true).c_str(), true);
-                    end_obj();
-                    start_obj("offsets");
-                        value_i("from", t0 * 10);
-                        value_i("to", t1 * 10, true);
-                    end_obj();
-                    value_s("text", text, true);
-                end_obj(i == (n_segments - 1));
-            }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
-
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
    std::ofstream fout(fname);

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

-    static const char * font = params.font_path.c_str();
-
-    std::ifstream fin(font);
-    if (!fin.is_open()) {
-        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
-        return false;
-    }
+    // TODO: make the font path a command-line parameter
+    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";

    fout << "#!/bin/bash" << "\n";
    fout << "\n";
@ -732,7 +597,7 @@ int main(int argc, char ** argv) {

            // this callback is called on each new segment
            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
+                wparams.new_segment_callback           = whisper_print_segment;
                wparams.new_segment_callback_user_data = &user_data;
            }

@ -742,7 +607,7 @@ int main(int argc, char ** argv) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -788,12 +653,6 @@ int main(int argc, char ** argv) {
                const auto fname_csv = fname_out + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }
-
-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params);
-            }
        }
    }

--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -288,6 +288,7 @@ int main(int argc, char ** argv) {
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
+            wparams.no_context       = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
--- a/examples/talk.llama/.gitignore
+++ b/examples/talk.llama/.gitignore
@ -1,2 +0,0 @@
-eleven-labs.py
-audio.mp3
--- a/examples/talk.llama/CMakeLists.txt
+++ b/examples/talk.llama/CMakeLists.txt
@ -1,12 +0,0 @@
-if (WHISPER_SUPPORT_SDL2)
-    # talk.llama
-    set(TARGET talk-llama)
-
-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk-llama.cpp llama.cpp)
-
-    include(DefaultTargetOptions)
-
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-endif ()
--- a/examples/talk.llama/README.md
+++ b/examples/talk.llama/README.md
@ -1,2 +0,0 @@
-# talk.llama
-
--- a/examples/talk.llama/llama.cpp
+++ b/examples/talk.llama/llama.cpp
--- a/examples/talk.llama/llama.h
+++ b/examples/talk.llama/llama.h
@ -1,153 +0,0 @@
-#ifndef LLAMA_H
-#define LLAMA_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef LLAMA_SHARED
-#    ifdef _WIN32
-#        ifdef LLAMA_BUILD
-#            define LLAMA_API __declspec(dllexport)
-#        else
-#            define LLAMA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAMA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAMA_API
-#endif
-
-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct llama_context;
-
-    typedef int llama_token;
-
-    typedef struct llama_token_data {
-        llama_token id;  // token id
-
-        float p;     // probability of the token
-        float plog;  // log probability of the token
-
-    } llama_token_data;
-
-    typedef void (*llama_progress_callback)(double progress, void *ctx);
-
-    struct llama_context_params {
-        int n_ctx;   // text context
-        int n_parts; // -1 for default
-        int seed;    // RNG seed, 0 for random
-
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-    };
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params);
-
-    // Frees all allocated memory
-    LLAMA_API void llama_free(struct llama_context * ctx);
-
-    // TODO: not great API - very likely to change
-    // Returns 0 on success
-    LLAMA_API int llama_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-                   int   itype,
-                   int   qk);
-
-    // Run the llama inference to obtain the logits and probabilities for the next token.
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    LLAMA_API int llama_eval(
-            struct llama_context * ctx,
-               const llama_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-                      const char * text,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-
-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-       struct llama_context * ctx,
-          const llama_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                     double   top_p,
-                     double   temp,
-                     double   repeat_penalty);
-
-    // Performance information
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/examples/talk.llama/speak.sh
+++ b/examples/talk.llama/speak.sh
@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Usage:
-#  speak.sh <voice_id> <text-to-speak>
-
-# espeak
-# Mac OS: brew install espeak
-# Linux: apt-get install espeak
-#
-#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
-
-# for Mac
-say "$2"
-
-# Eleven Labs
-#
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2" >/dev/null 2>&1
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
--- a/examples/talk.llama/talk-llama.cpp
+++ b/examples/talk.llama/talk-llama.cpp
@ -1,511 +0,0 @@
-// Talk with AI
-//
-
-#include "common.h"
-#include "common-sdl.h"
-#include "whisper.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cstdio>
-#include <fstream>
-#include <regex>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int)add_bos);
-    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
-// command-line parameters
-struct whisper_params {
-    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t voice_ms   = 10000;
-    int32_t capture_id = -1;
-    int32_t max_tokens = 32;
-    int32_t audio_ctx  = 0;
-
-    float vad_thold    = 0.6f;
-    float freq_thold   = 100.0f;
-
-    bool speed_up      = false;
-    bool translate     = false;
-    bool print_special = false;
-    bool print_energy  = false;
-    bool no_timestamps = true;
-
-    std::string person      = "Santa";
-    std::string language    = "en";
-    std::string model_wsp   = "models/ggml-base.en.bin";
-    std::string model_llama = "models/ggml-llama-7B.bin";
-    std::string speak       = "./examples/talk/speak.sh";
-    std::string fname_out;
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
-        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms      = std::stoi(argv[++i]); }
-        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
-        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
-        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
-        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
-        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
-        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
-        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
-        else if (arg == "-p"   || arg == "--person")        { params.person        = argv[++i]; }
-        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
-        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
-        else if (arg == "-ml"  || arg == "--model-llama")   { params.model_llama   = argv[++i]; }
-        else if (arg == "-s"   || arg == "--speak")         { params.speak         = argv[++i]; }
-        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n",              params.voice_ms);
-    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
-    fprintf(stderr, "  -p NAME,  --person NAME   [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
-    fprintf(stderr, "  -mg FILE, --model-llama   [%-7s] llama model file\n",                            params.model_llama.c_str());
-    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
-    const auto t_start = std::chrono::high_resolution_clock::now();
-
-    prob = 0.0f;
-    t_ms = 0;
-
-    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-    wparams.print_progress   = false;
-    wparams.print_special    = params.print_special;
-    wparams.print_realtime   = false;
-    wparams.print_timestamps = !params.no_timestamps;
-    wparams.translate        = params.translate;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.max_tokens       = params.max_tokens;
-    wparams.language         = params.language.c_str();
-    wparams.n_threads        = params.n_threads;
-
-    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
-
-    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-        return "";
-    }
-
-    int prob_n = 0;
-    std::string result;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-
-        result += text;
-
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        for (int j = 0; j < n_tokens; ++j) {
-            const auto token = whisper_full_get_token_data(ctx, i, j);
-
-            prob += token.p;
-            ++prob_n;
-        }
-    }
-
-    if (prob_n > 0) {
-        prob /= prob_n;
-    }
-
-    const auto t_end = std::chrono::high_resolution_clock::now();
-    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
-
-    return result;
-}
-
-// need to have leading ' '
-//const std::string k_prompt = R"( Transcript of a dialog, where {1} interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer {1}'s requests immediately and with precision.
-//
-//{0}: Hello, Bob.
-//{1}: Hello {0}. How may I help you today?
-//{0}:)";
-
-const std::string k_prompt = R"( Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
-{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{1} answers responds with short and concise answers.
-
-{0}{4} Hello, {1}!
-{1}{4} Hello {0}! How may I help you today?
-{0}{4} What time is it?
-{1}{4} It is {2} o'clock.
-{0}{4} What year is it?
-{1}{4} We are in {3}.
-{0}{4} What is a cat?
-{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
-{0}{4} Name a color.
-{1}{4} Blue
-{0}{4})";
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
-
-    // llama init
-
-    auto lparams = llama_context_default_params();
-
-    lparams.n_ctx      = 512;
-    lparams.n_parts    = 2; // TODO fix
-    lparams.seed       = 1; // TODO fix
-    lparams.f16_kv     = true;
-
-    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
-
-    // print some info about the processing
-    {
-        fprintf(stderr, "\n");
-        if (!whisper_is_multilingual(ctx_wsp)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
-                __func__,
-                params.n_threads,
-                params.language.c_str(),
-                params.translate ? "translate" : "transcribe",
-                params.no_timestamps ? 0 : 1);
-
-        fprintf(stderr, "\n");
-    }
-
-
-    // init audio
-
-    audio_async audio(30*1000);
-    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        return 1;
-    }
-
-    audio.resume();
-
-    int n_iter = 0;
-
-    bool is_running  = true;
-    bool force_speak = false;
-
-    float prob0 = 0.0f;
-
-    const std::string chat_symb = ":";
-    const std::string bot_name = "LLAMA";
-
-    std::vector<float> pcmf32_cur;
-    std::vector<float> pcmf32_prompt;
-
-    std::string prompt_org = k_prompt;
-    prompt_org = ::replace(prompt_org, "{0}", params.person);
-    prompt_org = ::replace(prompt_org, "{1}", bot_name);
-
-    {
-        // get time string
-        std::string time_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%H:%M", now);
-            time_str = buf;
-        }
-        prompt_org = ::replace(prompt_org, "{2}", time_str);
-    }
-
-    {
-        // get year string
-        std::string year_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%Y", now);
-            year_str = buf;
-        }
-        prompt_org = ::replace(prompt_org, "{3}", year_str);
-    }
-
-    prompt_org = ::replace(prompt_org, "{4}", chat_symb);
-
-    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_org, true);
-
-    const int n_ctx = llama_n_ctx(ctx_llama);
-
-    printf("\n");
-    printf("%s : initializing - please wait ...\n", __func__);
-
-    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
-        return 1;
-    }
-
-    //fprintf(stdout, "\n");
-    //fprintf(stdout, "%s", prompt_org.c_str());
-    //fflush(stdout);
-
-    printf("%s : done! start speaking in the microphone\n", __func__);
-    printf("\n");
-    printf("%s%s", params.person.c_str(), chat_symb.c_str());
-    fflush(stdout);
-
-    audio.clear();
-
-    const int n_keep = embd_inp.size();
-    const int voice_id = 2;
-
-    int n_past = n_keep;
-    int n_prev = 64; // TODO arg
-
-    std::vector<llama_token> embd;
-
-    std::vector<std::string> antiprompts = {
-        params.person + chat_symb,
-    };
-
-    // main loop
-    while (is_running) {
-        // handle Ctrl + C
-        is_running = sdl_poll_events();
-
-        if (!is_running) {
-            break;
-        }
-
-        // delay
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-        int64_t t_ms = 0;
-
-        {
-            audio.get(2000, pcmf32_cur);
-
-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
-                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
-
-                audio.get(params.voice_ms, pcmf32_cur);
-
-                std::string text_heard;
-
-                if (!force_speak) {
-                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\[.*?\\]");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\(.*?\\)");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
-                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-
-                // take first line
-                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
-
-                // remove leading and trailing whitespace
-                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
-                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-
-                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
-
-                if (text_heard.empty() || tokens.empty() || force_speak) {
-                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
-                    audio.clear();
-
-                    continue;
-                }
-
-                force_speak = false;
-
-                text_heard.insert(0, 1, ' ');
-                text_heard += "\n" + bot_name + chat_symb;
-                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
-                fflush(stdout);
-
-                embd = ::llama_tokenize(ctx_llama, text_heard, false);
-
-                // text inference
-                bool done = false;
-                std::string text_to_speak;
-                while (true) {
-                    // predict
-                    if (embd.size() > 0) {
-                        if (n_past + (int) embd.size() > n_ctx) {
-                            n_past = n_keep;
-
-                            // insert n_left/2 tokens at the start of embd from last_n_tokens
-                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
-
-                            //printf("\n---\n");
-                            //printf("resetting: '");
-                            //for (int i = 0; i < (int) embd.size(); i++) {
-                            //    printf("%s", llama_token_to_str(ctx_llama, embd[i]));
-                            //}
-                            //printf("'\n");
-                            //printf("\n---\n");
-                        }
-
-                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
-                            fprintf(stderr, "%s : failed to eval\n", __func__);
-                            return 1;
-                        }
-                    }
-
-                    //printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
-
-                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
-                    n_past += embd.size();
-                    embd.clear();
-
-                    if (done) break;
-
-                    {
-                        // out of user input, sample next token
-                        const float top_k          = 5;
-                        const float top_p          = 0.80f;
-                        const float temp           = 0.30f;
-                        const float repeat_penalty = 1.1764f;
-
-                        const int repeat_last_n    = 256;
-
-                        llama_token id = 0;
-
-                        {
-                            //auto logits = llama_get_logits(ctx_llama);
-                            //logits[llama_token_eos()] = 0;
-
-                            id = llama_sample_top_p_top_k(ctx_llama,
-                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
-                        }
-
-                        if (id != llama_token_eos()) {
-                            // add it to the context
-                            embd.push_back(id);
-
-                            text_to_speak += llama_token_to_str(ctx_llama, id);
-
-                            printf("%s", llama_token_to_str(ctx_llama, id));
-                        } else {
-                            // TODO
-                            printf("EOS TOKEN - SHOULD NOT HAPPEN\n");
-                            exit(0);
-                        }
-                    }
-
-                    {
-                        std::string last_output;
-                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-                            last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
-                        }
-                        last_output += llama_token_to_str(ctx_llama, embd[0]);
-
-                        for (std::string & antiprompt : antiprompts) {
-                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                                done = true;
-                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
-                                fflush(stdout);
-                                break;
-                            }
-                        }
-                    }
-
-                    is_running = sdl_poll_events();
-
-                    if (!is_running) {
-                        break;
-                    }
-                }
-
-                text_to_speak = ::replace(text_to_speak, "\"", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
-
-                audio.clear();
-
-                ++n_iter;
-            }
-        }
-    }
-
-    audio.pause();
-
-    whisper_print_timings(ctx_wsp);
-    whisper_free(ctx_wsp);
-
-    return 0;
-}
--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```

 ## TTS
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.

-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
@ -2,7 +2,6 @@ package com.whispercppdemo.ui.main

 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
-import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable
@ -20,7 +19,6 @@ fun MainScreen(viewModel: MainScreenViewModel) {
        canTranscribe = viewModel.canTranscribe,
        isRecording = viewModel.isRecording,
        messageLog = viewModel.dataLog,
-        onBenchmarkTapped = viewModel::benchmark,
        onTranscribeSampleTapped = viewModel::transcribeSample,
        onRecordTapped = viewModel::toggleRecord
    )
@ -32,7 +30,6 @@ private fun MainScreen(
    canTranscribe: Boolean,
    isRecording: Boolean,
    messageLog: String,
-    onBenchmarkTapped: () -> Unit,
    onTranscribeSampleTapped: () -> Unit,
    onRecordTapped: () -> Unit
 ) {
@ -48,11 +45,8 @@ private fun MainScreen(
                .padding(innerPadding)
                .padding(16.dp)
        ) {
-            Column(verticalArrangement = Arrangement.SpaceBetween) {
-                Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
-                    BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
-                    TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
-                }
+            Row(horizontalArrangement = Arrangement.SpaceBetween) {
+                TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
                RecordButton(
                    enabled = canTranscribe,
                    isRecording = isRecording,
@ -66,16 +60,7 @@ private fun MainScreen(

@Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
-        Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
-    }
-}
-
-@Composable
-private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
-    Button(onClick = onClick, enabled = enabled) {
-        Text("Benchmark")
-    }
+    Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
 }

@Composable
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -41,15 +41,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {

    init {
        viewModelScope.launch {
-            printSystemInfo()
            loadData()
        }
    }

-    private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
-    }
-
    private suspend fun loadData() {
        printMessage("Loading data...\n")
        try {
@ -86,29 +81,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
    }

-    fun benchmark() = viewModelScope.launch {
-        runBenchmark(6)
-    }
-
    fun transcribeSample() = viewModelScope.launch {
        transcribeAudio(getFirstSample())
    }

-    private suspend fun runBenchmark(nthreads: Int) {
-        if (!canTranscribe) {
-            return
-        }
-
-        canTranscribe = false
-
-        printMessage("Running benchmark. This will take minutes...\n")
-        whisperContext?.benchMemory(nthreads)?.let{ printMessage(it) }
-        printMessage("\n")
-        whisperContext?.benchGgmlMulMat(nthreads)?.let{ printMessage(it) }
-
-        canTranscribe = true
-    }
-
    private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
        samplesPath.listFiles()!!.first()
    }
@ -138,14 +114,11 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        canTranscribe = false

        try {
-            printMessage("Reading wave samples... ")
+            printMessage("Reading wave samples...\n")
            val data = readAudioSamples(file)
-            printMessage("${data.size / (16000 / 1000)} ms\n")
            printMessage("Transcribing data...\n")
-            val start = System.currentTimeMillis()
            val text = whisperContext?.transcribeData(data)
-            val elapsed = System.currentTimeMillis() - start
-            printMessage("Done ($elapsed ms): $text\n")
+            printMessage("Done: $text\n")
        } catch (e: Exception) {
            Log.w(LOG_TAG, e)
            printMessage("${e.localizedMessage}\n")
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -27,14 +27,6 @@ class WhisperContext private constructor(private var ptr: Long) {
        }
    }

-    suspend fun benchMemory(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchMemcpy(nthreads)
-    }
-
-    suspend fun benchGgmlMulMat(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchGgmlMulMat(nthreads)
-    }
-
    suspend fun release() = withContext(scope.coroutineContext) {
        if (ptr != 0L) {
            WhisperLib.freeContext(ptr)
@ -74,10 +66,6 @@ class WhisperContext private constructor(private var ptr: Long) {
            }
            return WhisperContext(ptr)
        }
-
-        fun getSystemInfo(): String {
-            return WhisperLib.getSystemInfo()
-        }
    }
 }

@ -86,7 +74,6 @@ private class WhisperLib {
        init {
            Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
            var loadVfpv4 = false
-            var loadV8fp16 = false
            if (isArmEabiV7a()) {
                // armeabi-v7a needs runtime detection support
                val cpuInfo = cpuInfo()
@ -97,24 +84,11 @@ private class WhisperLib {
                        loadVfpv4 = true
                    }
                }
-            } else if (isArmEabiV8a()) {
-                // ARMv8.2a needs runtime detection support
-                val cpuInfo = cpuInfo()
-                cpuInfo?.let {
-                    Log.d(LOG_TAG, "CPU info: $cpuInfo")
-                    if (cpuInfo.contains("fphp")) {
-                        Log.d(LOG_TAG, "CPU supports fp16 arithmetic")
-                        loadV8fp16 = true
-                    }
-                }
            }

            if (loadVfpv4) {
                Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
                System.loadLibrary("whisper_vfpv4")
-            } else if (loadV8fp16) {
-                Log.d(LOG_TAG, "Loading libwhisper_v8fp16_va.so")
-                System.loadLibrary("whisper_v8fp16_va")
            } else {
                Log.d(LOG_TAG, "Loading libwhisper.so")
                System.loadLibrary("whisper")
@ -129,9 +103,6 @@ private class WhisperLib {
        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
        external fun getTextSegmentCount(contextPtr: Long): Int
        external fun getTextSegment(contextPtr: Long, index: Int): String
-        external fun getSystemInfo(): String
-        external fun benchMemcpy(nthread: Int): String
-        external fun benchGgmlMulMat(nthread: Int): String
    }
 }

@ -139,10 +110,6 @@ private fun isArmEabiV7a(): Boolean {
    return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
 }

-private fun isArmEabiV8a(): Boolean {
-    return Build.SUPPORTED_ABIS[0].equals("arm64-v8a")
-}
-
 private fun cpuInfo(): String? {
    return try {
        File("/proc/cpuinfo").inputStream().bufferedReader().use {
--- a/examples/whisper.android/app/src/main/jni/whisper/Android.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Android.mk
@ -12,15 +12,4 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
 	LOCAL_CFLAGS += -mfpu=neon-vfpv4
 	include $(BUILD_SHARED_LIBRARY)
-endif
-
-ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-	include $(CLEAR_VARS)
-	LOCAL_MODULE    := libwhisper_v8fp16_va
-	include $(LOCAL_PATH)/Whisper.mk
-	# Allow building NEON FMA code.
-	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
-	LOCAL_CFLAGS += -march=armv8.2-a+fp16
-	include $(BUILD_SHARED_LIBRARY)
-endif
-
+endif
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -6,7 +6,6 @@
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
-#include "ggml.h"

 #define UNUSED(x) (void)(x)
 #define TAG "JNI"
@ -214,30 +213,4 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
    const char *text = whisper_full_get_segment_text(context, index);
    jstring string = (*env)->NewStringUTF(env, text);
    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
-        JNIEnv *env, jobject thiz
-) {
-    UNUSED(thiz);
-    const char *sysinfo = whisper_print_system_info();
-    jstring string = (*env)->NewStringUTF(env, sysinfo);
-    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
-                                                                      jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
-                                                                          jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
-}
+}
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
-
-In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -296,10 +296,6 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -7,9 +7,8 @@ To use:
 2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
 4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
-5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
+5. Select the "release" build configuration under "Run", then deploy and run to your device.

 [^1]: I recommend the tiny, base or small models for running on an iOS device.
-[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.

 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -430,10 +430,6 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;
--- a/extra/bench-wts.sh
+++ b/extra/bench-wts.sh
@ -1,70 +0,0 @@
-# Benchmark word-level timestamps for different models
-#
-# This script takes two arguments
-# - an audio file
-# - [optional] path to a font file
-
-# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
-
-if [ -z "$1" ]; then
-    echo "Usage: $0 <audio file> [font file]"
-    exit 1
-fi
-
-#TODO: Make this a command line parameter
-#models="base small large"
-#models="tiny.en tiny base.en base small.en small medium.en medium large-v1 large"
-models="tiny.en base.en small.en medium.en large"
-
-DURATION=$(ffprobe -i $1 -show_entries format=duration -v quiet -of csv="p=0")
-DURATION=$(printf "%.2f" $DURATION)
-echo "Input file duration: ${DURATION}s"
-
-for model in $models; do
-    echo "Running $model"
-    COMMAND="./main -m models/ggml-$model.bin -owts -f $1 -of $1.$model"
-
-    if [ ! -z "$2" ]; then
-        COMMAND="$COMMAND -fp $2"
-    fi
-    #TODO: Surface errors better
-    # TIMEFMT is for zsh, TIMEFORMAT is for bash
-    EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time $COMMAND >/dev/null 2>&1; } 2>&1)
-
-    # Slightly different formats between zsh and bash
-    if [ "${EXECTIME: -1}" == "s" ]; then
-        EXECTIME=${EXECTIME::-1}
-    fi
-
-    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
-    RATIO=$(printf "%.2f" $RATIO)
-
-    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"
-
-    # If the file already exists, delete it
-    if [ -f $1.mp4 ]; then
-        rm $1.mp4
-    fi
-
-    bash $1.$model.wts >/dev/null 2>&1
-    mv $1.mp4 $1.$model.mp4
-
-    ffmpeg -y -f lavfi -i color=c=black:s=1200x50:d=$DURATION -vf "drawtext=fontfile=$2:fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey" $1.$model.info.mp4 >/dev/null 2>&1
-done
-
-COMMAND="ffmpeg -y"
-for model in $models; do
-    COMMAND="$COMMAND -i $1.$model.info.mp4 -i $1.$model.mp4"
-done
-COMMAND="$COMMAND -filter_complex \""
-COUNT=0
-for model in $models; do
-    COMMAND="$COMMAND[${COUNT}:v][$(($COUNT+1)):v]"
-    COUNT=$((COUNT+2))
-done
-COMMAND="$COMMAND vstack=inputs=${COUNT}[v]\" -map \"[v]\" -map 1:a $1.all.mp4 >/dev/null 2>&1"
-
-echo $COMMAND
-
-# Run the command
-eval $COMMAND
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -198,8 +198,6 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@ -228,9 +226,7 @@ enum ggml_op {
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
-    GGML_OP_SILU,
    GGML_OP_NORM, // normalize
-    GGML_OP_RMS_NORM,

    GGML_OP_MUL_MAT,

@ -330,10 +326,7 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);

-int    ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
-
+size_t ggml_type_size   (enum ggml_type type);
 size_t ggml_element_size(const struct ggml_tensor * tensor);

 struct ggml_context * ggml_init(struct ggml_init_params params);
@ -343,9 +336,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
@ -476,20 +466,12 @@ struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
@ -744,13 +726,6 @@ enum ggml_opt_result ggml_opt(
        struct ggml_opt_params params,
        struct ggml_tensor * f);

-//
-// quantization
-//
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-
 //
 // system info
 //
--- a/models/README.md
+++ b/models/README.md
@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
 the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
 script to download the already converted models. Currently, they are hosted on the following locations:

- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:

 A third option to obtain the model files is to download them from Hugging Face:

-https://huggingface.co/ggerganov/whisper.cpp/tree/main
+https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main

 ## Available models

--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@ -79,11 +79,11 @@ dir_model   = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out     = sys.argv[3]

-with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
+with open(dir_model + "/vocab.json", "r") as f:
    encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
+with open(dir_model + "/added_tokens.json", "r") as f:
    encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r", encoding="utf8") as f:
+with open(dir_model + "/config.json", "r") as f:
    hparams = json.load(f)

 model = WhisperForConditionalGeneration.from_pretrained(dir_model)
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@ -1,82 +0,0 @@
-#!/bin/bash
-
-# This script downloads Whisper model files that have already been converted to Core ML format.
-# This way you don't have to convert them yourself.
-
-src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
-pfx="resolve/main/ggml"
-
-# get the path of this script
-function get_script_path() {
-    if [ -x "$(command -v realpath)" ]; then
-        echo "$(dirname $(realpath $0))"
-    else
-        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
-        echo "$ret"
-    fi
-}
-
-models_path="$(get_script_path)"
-
-# Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download Core ML model
-
-printf "Downloading Core ML model $model from '$src' ...\n"
-
-cd $models_path
-
-if [ -f "ggml-$model.mlmodel" ]; then
-    printf "Model $model already exists. Skipping download.\n"
-    exit 0
-fi
-
-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
-elif [ -x "$(command -v curl)" ]; then
-    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
-
-
-if [ $? -ne 0 ]; then
-    printf "Failed to download Core ML model $model \n"
-    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
-    exit 1
-fi
-
-printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
-printf "Run the following command to compile it:\n\n"
-printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
-printf "You can now use it like this:\n\n"
-printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
-printf "\n"
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
  goto :eof
 )

-PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
+PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"

 if %ERRORLEVEL% neq 0 (
  echo Failed to download ggml model %model%
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -6,7 +6,7 @@
 #src="https://ggml.ggerganov.com"
 #pfx="ggml-model-whisper"

-src="https://huggingface.co/ggerganov/whisper.cpp"
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"

 # get the path of this script
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -66,7 +66,6 @@ extern "C" {
    //

    struct whisper_context;
-    struct whisper_state;

    typedef int whisper_token;

@ -102,20 +101,11 @@ extern "C" {
    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);

-    // These are the same as the above, but the internal state of the context is not allocated automatically
-    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
-
-    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
-
-    // Frees all allocated memory
-    WHISPER_API void whisper_free      (struct whisper_context * ctx);
-    WHISPER_API void whisper_free_state(struct whisper_state * state);
+    // Frees all memory allocated by the model.
+    WHISPER_API void whisper_free(struct whisper_context * ctx);

    // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
+    // The resulting spectrogram is stored inside the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel(
            struct whisper_context * ctx,
@ -123,30 +113,17 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);

-    WHISPER_API int whisper_pcm_to_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * samples,
-                               int   n_samples,
-                               int   n_threads);
-
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
+    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. 
+    // The resulting spectrogram is stored inside the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
+        struct whisper_context* ctx,
+        const float* samples,
+        int   n_samples,
+        int   n_threads);

-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);

-    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
+    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
    // Returns 0 on success
@ -156,14 +133,7 @@ extern "C" {
                               int   n_len,
                               int   n_mel);

-    WHISPER_API int whisper_set_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * data,
-                               int   n_len,
-                               int   n_mel);
-
-    // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
+    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
    // offset can be used to specify the offset of the first frame in the spectrogram.
    // Returns 0 on success
@ -172,12 +142,6 @@ extern "C" {
                               int   offset,
                               int   n_threads);

-    WHISPER_API int whisper_encode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset,
-                               int   n_threads);
-
    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
    // Make sure to call whisper_encode() first.
    // tokens + n_tokens is the provided context for the decoder.
@ -191,14 +155,6 @@ extern "C" {
                               int   n_past,
                               int   n_threads);

-    WHISPER_API int whisper_decode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-               const whisper_token * tokens,
-                               int   n_tokens,
-                               int   n_past,
-                               int   n_threads);
-
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -234,44 +190,20 @@ extern "C" {
                               int   n_threads,
                             float * lang_probs);

-    WHISPER_API int whisper_lang_auto_detect_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset_ms,
-                               int   n_threads,
-                             float * lang_probs);
-
-    WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
-    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
-    WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
-    WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
-
-    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_len          (struct whisper_context * ctx); // mel length
+    WHISPER_API int whisper_n_vocab        (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_text_ctx     (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_audio_ctx    (struct whisper_context * ctx);
+    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);

    // Token logits obtained from the last call to whisper_decode()
    // The logits for the last token are stored in the last row
    // Rows: n_tokens
    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
-    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
+    WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
-    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
-

    // Special tokens
    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
@ -286,7 +218,7 @@ extern "C" {
    WHISPER_API whisper_token whisper_token_translate (void);
    WHISPER_API whisper_token whisper_token_transcribe(void);

-    // Performance information from the default state.
+    // Performance information
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

@ -304,19 +236,18 @@ extern "C" {
    // Text segment callback
    // Called on every newly generated text segment
    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);

    // Encoder begin callback
    // If not NULL, called before the encoder starts
    // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

    // Logits filter callback
    // Can be used to modify the logits before sampling
    // If not NULL, called after applying temperature to logits
    typedef void (*whisper_logits_filter_callback)(
            struct whisper_context * ctx,
-              struct whisper_state * state,
          const whisper_token_data * tokens,
                               int   n_tokens,
                             float * logits,
@ -403,7 +334,6 @@ extern "C" {
    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);

    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
-    // Not thread safe for same context
    // Uses the specified decoding strategy to obtain the text.
    WHISPER_API int whisper_full(
                struct whisper_context * ctx,
@ -411,16 +341,7 @@ extern "C" {
                           const float * samples,
                                   int   n_samples);

-    WHISPER_API int whisper_full_with_state(
-                struct whisper_context * ctx,
-                  struct whisper_state * state,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples);
-
-    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
-    // Result is stored in the default state of the context
-    // Not thread safe if executed in parallel on the same context.
+    // Split the input audio in chunks and process each chunk separately using whisper_full()
    // It seems this approach can offer some speedup in some cases.
    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
    WHISPER_API int whisper_full_parallel(
@ -430,56 +351,40 @@ extern "C" {
                                   int   n_samples,
                                   int   n_processors);

-    // Number of generated text segments
+    // Number of generated text segments.
    // A segment can be a few words, a sentence, or even a paragraph.
-    WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
-    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
+    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);

-    // Language id associated with the context's default state
+    // Language id associated with the current context
    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);

-    // Language id associated with the provided state
-    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
+    // Get the start and end time of the specified segment.
+    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);

-    // Get the start and end time of the specified segment
-    WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
+    // Get the text of the specified segment.
+    WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);

-    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
+    // Get number of tokens in the specified segment.
+    WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);

-    // Get the text of the specified segment
-    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
+    // Get the token text of the specified token in the specified segment.
+    WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);

-    // Get number of tokens in the specified segment
-    WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
-
-    // Get the token text of the specified token in the specified segment
-    WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
-
-    WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    // Get token data for the specified token in the specified segment
+    // Get token data for the specified token in the specified segment.
    // This contains probabilities, timestamps, etc.
-    WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
+    WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);

-    // Get the probability of the specified token in the specified segment
-    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
+    // Get the probability of the specified token in the specified segment.
+    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

    ////////////////////////////////////////////////////////////////////////////

    // Temporary helpers needed for exposing ggml interface

    WHISPER_API int whisper_bench_memcpy(int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);

 #ifdef __cplusplus
 }