ggml : try to improve threading

2025-06-25 01:19:10 +00:00 · 2022-12-29 13:05:20 +02:00
115 changed files with 4477 additions and 13163 deletions
--- a/.github/workflows/bindings-go.yml
+++ b/.github/workflows/bindings-go.yml
@ -1,22 +0,0 @@
-name: Bindings Tests (Go)
-on:
-  push:
-    paths:
-      - bindings/go/**
-      - whisper.h
-  pull_request:
-    paths:
-      - bindings/go/**
-      - whisper.h
-
-jobs:
-  ubuntu-latest:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/setup-go@v3
-        with:
-          go-version: '^1.19'
-      - uses: actions/checkout@v1
-      - run: |
-          cd bindings/go
-          make test
--- a/.github/workflows/bindings-ruby.yml
+++ b/.github/workflows/bindings-ruby.yml
@ -1,22 +0,0 @@
-name: Bindings Tests (Ruby)
-on:
-  push:
-    paths:
-      - bindings/ruby/**
-      - whisper.h
-  pull_request:
-    paths:
-      - bindings/ruby/**
-      - whisper.h
-
-jobs:
-  ubuntu-latest:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: ruby/setup-ruby@v1
-        with:
-          ruby-version: '3.0'
-      - uses: actions/checkout@v1
-      - run: |
-          cd bindings/ruby/ext
-          ruby extconf.rb && make
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -1,267 +1,237 @@
 name: CI
-on: [push, pull_request]
+on: [push]

 jobs:
-  ubuntu-latest:
-    runs-on: ubuntu-latest
+    ubuntu-latest:
+        runs-on: ubuntu-latest

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-          sudo apt-get install libsdl2-dev
+            - name: Dependencies
+              run: |
+                  sudo apt-get update
+                  sudo apt-get install build-essential
+                  sudo apt-get install libsdl2-dev

-      - name: Build
-        run: |
-          make
-          make stream
+            - name: Build
+              run: |
+                make
+                make stream

-  macOS-latest:
-    runs-on: macOS-latest
+    macOS-latest:
+        runs-on: macOS-latest

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Dependencies
-        run: |
-          brew update
-          brew install sdl2
+            - name: Dependencies
+              run: |
+                  brew update
+                  brew install sdl2

-      - name: Build
-        run: |
-          make
-          make stream
+            - name: Build
+              run: |
+                make
+                make stream

-  ubuntu-latest-gcc:
-    runs-on: ubuntu-latest
+    ubuntu-latest-gcc:
+        runs-on: ubuntu-latest

-    strategy:
-      matrix:
-        build: [Debug, Release]
+        strategy:
+            matrix:
+                build: [Debug, Release]

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-          sudo apt-get install cmake
-          sudo apt-get install libsdl2-dev
+            - name: Dependencies
+              run: |
+                  sudo apt-get update
+                  sudo apt-get install build-essential
+                  sudo apt-get install cmake
+                  sudo apt-get install libsdl2-dev

-      - name: Configure
-        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+            - name: Configure
+              run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}

-      - name: Build
-        run: |
-          make
-          ctest -L gh --output-on-failure
+            - name: Build
+              run: |
+                make
+                ctest -L gh --output-on-failure

-  ubuntu-latest-clang:
-    runs-on: ubuntu-latest
+    ubuntu-latest-clang:
+        runs-on: ubuntu-latest

-    strategy:
-      matrix:
-        build: [Debug, Release]
+        strategy:
+            matrix:
+                build: [Debug, Release]

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-          sudo apt-get install cmake
-          sudo apt-get install libsdl2-dev
+            - name: Dependencies
+              run: |
+                  sudo apt-get update
+                  sudo apt-get install build-essential
+                  sudo apt-get install cmake
+                  sudo apt-get install libsdl2-dev

-      - name: Configure
-        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+            - name: Configure
+              run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang

-      - name: Build
-        run: |
-          make
-          ctest -L gh --output-on-failure
+            - name: Build
+              run: |
+                make
+                ctest -L gh --output-on-failure

-  ubuntu-latest-gcc-sanitized:
-    runs-on: ubuntu-latest
+    ubuntu-latest-gcc-sanitized:
+        runs-on: ubuntu-latest

-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        strategy:
+            matrix:
+                sanitizer: [ADDRESS, THREAD, UNDEFINED]

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-          sudo apt-get install cmake
+            - name: Dependencies
+              run: |
+                  sudo apt-get update
+                  sudo apt-get install build-essential
+                  sudo apt-get install cmake

-      - name: Configure
-        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
+            - name: Configure
+              run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON

-      - name: Build
-        run: |
-          make
-          ctest -L gh --output-on-failure
+            - name: Build
+              run: |
+                make
+                ctest -L gh --output-on-failure

-  windows:
-    runs-on: windows-latest
+    windows:
+        runs-on: windows-latest

-    strategy:
-      matrix:
-        build: [Release]
-        arch: [Win32, x64]
-        sdl2: [ON]
-        include:
-          - arch: Win32
-            s2arc: x86
-          - arch: x64
-            s2arc: x64
-          - sdl2: ON
-            s2ver: 2.26.0
+        strategy:
+            matrix:
+                build: [Release]
+                arch: [Win32, x64]
+                sdl2: [ON]
+                include:
+                  - arch: Win32
+                    s2arc: x86
+                  - arch: x64
+                    s2arc: x64
+                  - sdl2: ON
+                    s2ver: 2.26.0

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Add msbuild to PATH
-        uses: microsoft/setup-msbuild@v1
+            - name: Add msbuild to PATH
+              uses: microsoft/setup-msbuild@v1

-      - name: Fetch SDL2 and set SDL2_DIR
-        if: matrix.sdl2 == 'ON'
-        run: |
-          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
-          7z x sdl2.zip
-          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
+            - name: Fetch SDL2 and set SDL2_DIR
+              if: matrix.sdl2 == 'ON'
+              run: |
+                C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
+                7z x sdl2.zip
+                echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

-      - name: Configure
-        run: >
-          cmake -S . -B ./build -A ${{ matrix.arch }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
+            - name: Configure
+              run: >
+                cmake -S . -B ./build -A ${{ matrix.arch }}
+                -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+                -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}

-      - name: Build
-        run: |
-          cd ./build
-          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+            - name: Build
+              run: |
+                cd ./build
+                msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

-      - name: Copy SDL2.dll
-        if: matrix.sdl2 == 'ON'
-        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
+            - name: Copy SDL2.dll
+              if: matrix.sdl2 == 'ON'
+              run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

-      - name: Upload binaries
-        if: matrix.sdl2 == 'ON'
-        uses: actions/upload-artifact@v1
-        with:
-          name: whisper-bin-${{ matrix.arch }}
-          path: build/bin/${{ matrix.build }}
+            - name: Upload binaries
+              if: matrix.sdl2 == 'ON'
+              uses: actions/upload-artifact@v1
+              with:
+                name: whisper-bin-${{ matrix.arch }}
+                path: build/bin/${{ matrix.build }}

-  windows-blas:
-    runs-on: windows-latest
+    windows-blas:
+        runs-on: windows-latest

-    strategy:
-      matrix:
-        build: [Release]
-        arch: [Win32, x64]
-        blas: [ON]
-        sdl2: [ON]
-        include:
-          - arch: Win32
-            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
-            s2arc: x86
-          - arch: x64
-            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
-            s2arc: x64
-          - sdl2: ON
-            s2ver: 2.26.0
+        strategy:
+            matrix:
+                build: [Release]
+                arch: [Win32, x64]
+                blas: [ON]
+                sdl2: [ON]
+                include:
+                  - arch: Win32
+                    obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
+                    s2arc: x86
+                  - arch: x64
+                    obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
+                    s2arc: x64
+                  - sdl2: ON
+                    s2ver: 2.26.0

-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
+        steps:
+            - name: Clone
+              uses: actions/checkout@v1

-      - name: Add msbuild to PATH
-        uses: microsoft/setup-msbuild@v1
+            - name: Add msbuild to PATH
+              uses: microsoft/setup-msbuild@v1

-      - name: Fetch OpenBLAS
-        if: matrix.blas == 'ON'
-        run: |
-          C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
-          7z x blas.zip -oblas -y
-          copy blas/include/cblas.h .
-          copy blas/include/openblas_config.h .
-          echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
+            - name: Fetch OpenBLAS
+              if: matrix.blas == 'ON'
+              run: |
+                C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
+                7z x blas.zip -oblas -y
+                copy blas/include/cblas.h .
+                copy blas/include/openblas_config.h .
+                echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV

-      - name: Fetch SDL2 and set SDL2_DIR
-        if: matrix.sdl2 == 'ON'
-        run: |
-          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
-          7z x sdl2.zip
-          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
+            - name: Fetch SDL2 and set SDL2_DIR
+              if: matrix.sdl2 == 'ON'
+              run: |
+                C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
+                7z x sdl2.zip
+                echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

-      - name: Configure
-        run: >
-          cmake -S . -B ./build -A ${{ matrix.arch }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
-          -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
+            - name: Configure
+              run: >
+                cmake -S . -B ./build -A ${{ matrix.arch }}
+                -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+                -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
+                -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
+                -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}

-      - name: Build
-        run: |
-          cd ./build
-          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+            - name: Build
+              run: |
+                cd ./build
+                msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

-      - name: Copy libopenblas.dll
-        if: matrix.blas == 'ON'
-        run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
+            - name: Copy libopenblas.dll
+              if: matrix.blas == 'ON'
+              run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}

-      - name: Copy SDL2.dll
-        if: matrix.sdl2 == 'ON'
-        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
+            - name: Copy SDL2.dll
+              if: matrix.sdl2 == 'ON'
+              run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

-      - name: Upload binaries
-        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
-        uses: actions/upload-artifact@v1
-        with:
-          name: whisper-blas-bin-${{ matrix.arch }}
-          path: build/bin/${{ matrix.build }}
-
-  emscripten:
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        build: [Release]
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        run: |
-          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
-          tar -xvf master.tar.gz
-          emsdk-master/emsdk update
-          emsdk-master/emsdk install latest
-          emsdk-master/emsdk activate latest
-
-      - name: Configure
-        run: echo "tmp"
-
-      - name: Build
-        run: |
-          pushd emsdk-master
-          source ./emsdk_env.sh
-          popd
-          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          make
+            - name: Upload binaries
+              if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
+              uses: actions/upload-artifact@v1
+              with:
+                name: whisper-blas-bin-${{ matrix.arch }}
+                path: build/bin/${{ matrix.build }}
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@ -1,48 +0,0 @@
-name: Examples Tests
-on:
-  push:
-    paths:
-      - examples/addon.node/**
-      - whisper.h
-  pull_request:
-    paths:
-      - examples/addon.node/**
-      - whisper.h
-
-jobs:
-  addon_node-ubuntu-latest:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        node-version: [ 16.x, 18.x ]
-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-          sudo apt-get install cmake
-          sudo apt-get install libsdl2-dev
-
-      - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v1
-        with:
-          node-version: ${{ matrix.node-version }}
-          cache: 'npm'
-
-      - name: Install package.json dependencies
-        working-directory: ./examples/addon.node
-        run: npm install
-
-      - name: Compile addon.node
-        run: npx cmake-js compile -T whisper-addon -B Release
-
-      - name: Download test model
-        run: |
-          bash ./models/download-ggml-model.sh base.en
-      - name: Test
-        run: |
-          cd examples/addon.node
-          npm run test
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,4 @@
 *.o
-*.a
 .cache/
 .vs/
 .vscode/
@ -9,8 +8,6 @@ build/
 build-em/
 build-debug/
 build-release/
-build-static/
-build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

@ -18,12 +15,9 @@ build-sanitize-thread/
 /stream
 /command
 /talk
-/talk-llama
 /bench

-arm_neon.h
 sync.sh
-libwhisper.a
 libwhisper.so
 compile_commands.json

@ -33,5 +27,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

 extra/bench-gg.txt
-
-*.mlmodel*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,16 +1,15 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.2.1)
-
-# Add path to modules
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+project(whisper.cpp VERSION 1.0.4)

+set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")

 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(WHISPER_STANDALONE ON)
-    include(GitVars)
-    include(BuildTypes)
+    include(cmake/GitVars.cmake)
+    include(cmake/BuildTypes.cmake)

    # configure project version
    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
@ -53,7 +52,6 @@ if (APPLE)
    option(WHISPER_NO_ACCELERATE       "whisper: disable Accelerate framework" OFF)
    option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
-    option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
 else()
    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@ -84,6 +82,9 @@ endif()

 # dependencies

+set(CMAKE_C_STANDARD   11)
+set(CMAKE_CXX_STANDARD 11)
+
 find_package(Threads REQUIRED)

 # on APPLE - include Accelerate framework
@ -130,7 +131,6 @@ if (WHISPER_ALL_WARNINGS)
            -Wcast-qual                     \
            -Wstrict-prototypes             \
            -Wpointer-arith                 \
-            -Wno-unused-function            \
        ")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
            -Wall                           \
@ -157,7 +157,6 @@ else()
    if (MSVC)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
    else()
        if (EMSCRIPTEN)
            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
@ -169,12 +168,7 @@ else()
            if(NOT WHISPER_NO_AVX2)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
            endif()
-            if(NOT WHISPER_NO_FMA)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-            endif()
-            if(NOT WHISPER_NO_F16C)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-            endif()
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c")
        endif()
    endif()
 endif()
@ -196,8 +190,6 @@ add_library(${TARGET}
    whisper.cpp
    )

-include(DefaultTargetOptions)
-
 target_include_directories(${TARGET} PUBLIC
    .
    )
@ -228,13 +220,9 @@ target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )

-set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
-
 install(TARGETS ${TARGET}
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib/static
-    RUNTIME DESTINATION bin
-    PUBLIC_HEADER DESTINATION include
    )

 #
@ -247,7 +235,7 @@ add_subdirectory(bindings)
 # programs, examples and tests
 #

-if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (WHISPER_BUILD_TESTS)
    enable_testing()
    add_subdirectory(tests)
 endif ()
--- a/Makefile
+++ b/Makefile
@ -10,9 +10,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

-CCV := $(shell $(CC) --version | head -n 1)
-CXXV := $(shell $(CXX) --version | head -n 1)
-
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@ -30,16 +27,10 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

-# ref: https://github.com/ggerganov/whisper.cpp/issues/37
-ifneq ($(wildcard /usr/include/musl/*),)
-	CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
-	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
-endif
-
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -62,13 +53,10 @@ endif
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
-ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+ifeq ($(UNAME_M),x86_64)
 	ifeq ($(UNAME_S),Darwin)
-		CFLAGS += -mf16c
+		CFLAGS += -mfma -mf16c
 		AVX1_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring FMA,$(AVX1_M)))
-			CFLAGS += -mfma
-		endif
 		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
 			CFLAGS += -mavx
 		endif
@ -93,10 +81,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 		ifneq (,$(findstring f16c,$(F16C_M)))
 			CFLAGS += -mf16c
 		endif
-		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
-		ifneq (,$(findstring sse3,$(SSE3_M)))
-			CFLAGS += -msse3
-		endif
 	else ifeq ($(UNAME_S),Haiku)
 		AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
 		ifneq (,$(findstring avx,$(AVX1_M)))
@ -121,15 +105,11 @@ endif
 ifeq ($(UNAME_M),amd64)
 	CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
-ifneq ($(filter ppc64%,$(UNAME_M)),)
+ifeq ($(UNAME_M),ppc64le)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
 		CFLAGS += -mpower9-vector
 	endif
-	# Require c++23's std::byteswap for big-endian support.
-	ifeq ($(UNAME_M),ppc64)
-		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
-	endif
 endif
 ifndef WHISPER_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework
@ -143,45 +123,25 @@ ifdef WHISPER_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef WHISPER_GPROF
-	CFLAGS   += -pg
-	CXXFLAGS += -pg
+	CFLAGS  += -pg
+	CXXFLAGS  += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS += -mcpu=native
-	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
-	# 32-bit Raspberry Pi 1, 2, 3
-	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
+	# Raspberry Pi 1, 2, 3
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
 ifneq ($(filter armv7%,$(UNAME_M)),)
-	# 32-bit ARM, for example on Armbian or possibly raspbian
-	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-	
-	# 64-bit ARM, use these (TODO: auto-detect 64-bit)
-	# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	# Raspberry Pi 4
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 4
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

-#
-# Print build information
-#
-
-$(info I whisper.cpp build info: )
-$(info I UNAME_S:  $(UNAME_S))
-$(info I UNAME_P:  $(UNAME_P))
-$(info I UNAME_M:  $(UNAME_M))
-$(info I CFLAGS:   $(CFLAGS))
-$(info I CXXFLAGS: $(CXXFLAGS))
-$(info I LDFLAGS:  $(LDFLAGS))
-$(info I CC:       $(CCV))
-$(info I CXX:      $(CXXV))
-$(info )
-
-default: main bench
+default: main

 #
 # Build library
@ -200,7 +160,7 @@ libwhisper.so: ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so

 #
 # Examples
@ -208,28 +168,22 @@ clean:

 CC_SDL=`sdl2-config --cflags --libs`

-SRC_COMMON = examples/common.cpp
-SRC_COMMON_SDL = examples/common-sdl.cpp
-
-main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h

+stream: examples/stream/stream.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+
+command: examples/command/command.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+
+talk: examples/talk/talk.cpp  examples/talk/gpt-2.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
+
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
-
-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
-
-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
-
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS)
-
 #
 # Audio samples
 #
--- a/README.md
+++ b/README.md
@ -4,16 +4,15 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+[Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
 - AVX intrinsics support for x86 architectures
-- VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
-- Low memory usage (Flash Attention)
+- Low memory usage (Flash Attention + Flash Forward)
 - Zero memory allocations at runtime
 - Runs on the CPU
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
@ -71,7 +70,7 @@ Now build the [main](examples/main) example and transcribe an audio file like th
 make

 # transcribe an audio file
-./main -f samples/jfk.wav
+./main -f input.wav
 ```

 ---
@ -89,38 +88,27 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
 usage: ./main [options] file0.wav file1.wav ...

 options:
-  -h,        --help              [default] show this help message and exit
-  -t N,      --threads N         [4      ] number of threads to use during computation
-  -p N,      --processors N      [1      ] number of processors to use during computation
-  -ot N,     --offset-t N        [0      ] time offset in milliseconds
-  -on N,     --offset-n N        [0      ] segment index offset
-  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
-  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
-  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -bs N,     --beam-size N       [-1     ] beam size for beam search
-  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
-  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
-  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
-  -tr,       --translate         [false  ] translate from source language to english
-  -di,       --diarize           [false  ] stereo audio diarization
-  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
-  -otxt,     --output-txt        [false  ] output result in a text file
-  -ovtt,     --output-vtt        [false  ] output result in a vtt file
-  -osrt,     --output-srt        [false  ] output result in a srt file
-  -owts,     --output-words      [false  ] output script for generating karaoke video
-  -ocsv,     --output-csv        [false  ] output result in a CSV file
-  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
-  -ps,       --print-special     [false  ] print special tokens
-  -pc,       --print-colors      [false  ] print colors
-  -pp,       --print-progress    [false  ] print progress
-  -nt,       --no-timestamps     [true   ] do not print timestamps
-  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
-             --prompt PROMPT     [       ] initial prompt
-  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
-  -f FNAME,  --file FNAME        [       ] input WAV file path
-
+  -h,       --help          [default] show this help message and exit
+  -t N,     --threads N     [4      ] number of threads to use during computation
+  -p N,     --processors N  [1      ] number of processors to use during computation
+  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -on N,    --offset-n N    [0      ] segment index offset
+  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
+  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -tr,      --translate     [false  ] translate from source language to english
+  -otxt,    --output-txt    [false  ] output result in a text file
+  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -osrt,    --output-srt    [false  ] output result in a srt file
+  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -ps,      --print-special [false  ] print special tokens
+  -pc,      --print-colors  [false  ] print colors
+  -nt,      --no-timestamps [true   ] do not print timestamps
+  -l LANG,  --language LANG [en     ] spoken language
+  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -f FNAME, --file FNAME    [       ] input WAV file path

 bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
@ -139,8 +127,7 @@ Running base.en on all samples in ./samples ...
 [+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
 ----------------------------------------------

-whisper_init_from_file: loading model from 'models/ggml-base.en.bin'
-whisper_model_load: loading model
+whisper_model_load: loading model from 'models/ggml-base.en.bin'
 whisper_model_load: n_vocab       = 51864
 whisper_model_load: n_audio_ctx   = 1500
 whisper_model_load: n_audio_state = 512
@ -153,14 +140,13 @@ whisper_model_load: n_text_layer  = 6
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 2
-whisper_model_load: mem required  =  215.00 MB (+    6.00 MB per decoder)
-whisper_model_load: kv self size  =    5.25 MB
-whisper_model_load: kv cross size =   17.58 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: model ctx     =  140.60 MB
+whisper_model_load: mem_required  =  506.00 MB
+whisper_model_load: ggml ctx size =  140.60 MB
+whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB

-system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |

 main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

@ -168,13 +154,12 @@ main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 proc
 [00:00:00.000 --> 00:00:11.000]   And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.


-whisper_print_timings:     fallbacks =   0 p /   0 h
-whisper_print_timings:     load time =   113.81 ms
-whisper_print_timings:      mel time =    15.40 ms
-whisper_print_timings:   sample time =    11.58 ms /    27 runs (    0.43 ms per run)
-whisper_print_timings:   encode time =   266.60 ms /     1 runs (  266.60 ms per run)
-whisper_print_timings:   decode time =    66.11 ms /    27 runs (    2.45 ms per run)
-whisper_print_timings:    total time =   476.31 ms
+whisper_print_timings:     load time =   105.91 ms
+whisper_print_timings:      mel time =    24.62 ms
+whisper_print_timings:   sample time =     3.63 ms
+whisper_print_timings:   encode time =   324.71 ms / 54.12 ms per layer
+whisper_print_timings:   decode time =    83.58 ms / 13.93 ms per layer
+whisper_print_timings:    total time =   542.81 ms
 ```

 The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
@ -217,16 +202,26 @@ make large

 | Model  | Disk   | Mem     | SHA                                        |
 | ---    | ---    | ---     | ---                                        |
-| tiny   |  75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| tiny   |  75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
+| base   | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
+| small  | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
+| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
+| large  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |

 ## Limitations

 - Inference only
-- No GPU support (yet)
+- No GPU support
+- Very basic greedy sampling scheme - always pick up the token with highest probability.
+  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
+  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
+  to run the python code with the following parameters:
+
+  ```
+  whisper --best_of None --beam_size None ...
+  ```
+
+  In the future, `whisper.cpp` will support more sampling strategies.

 ## Another example

@ -239,8 +234,7 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
 ```java
 $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8

-whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
-whisper_model_load: loading model
+whisper_model_load: loading model from 'models/ggml-medium.en.bin'
 whisper_model_load: n_vocab       = 51864
 whisper_model_load: n_audio_ctx   = 1500
 whisper_model_load: n_audio_state = 1024
@ -253,71 +247,65 @@ whisper_model_load: n_text_layer  = 24
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 4
-whisper_model_load: mem required  = 1720.00 MB (+   43.00 MB per decoder)
-whisper_model_load: kv self size  =   42.00 MB
-whisper_model_load: kv cross size =  140.62 MB
+whisper_model_load: mem_required  = 2610.00 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: model ctx     = 1462.35 MB
-whisper_model_load: model size    = 1462.12 MB
+whisper_model_load: ggml ctx size = 1644.97 MB
+whisper_model_load: memory size =   182.62 MB
+whisper_model_load: model size  =  1462.12 MB

-system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
+main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, lang = en, task = transcribe, timestamps = 1 ...

-main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
+[00:00.000 --> 00:08.000]   My fellow Americans, this day has brought terrible news and great sadness to our country.
+[00:08.000 --> 00:17.000]   At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
+[00:17.000 --> 00:23.000]   A short time later, debris was seen falling from the skies above Texas.
+[00:23.000 --> 00:29.000]   The Columbia's lost. There are no survivors.
+[00:29.000 --> 00:32.000]   On board was a crew of seven.
+[00:32.000 --> 00:39.000]   Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
+[00:39.000 --> 00:48.000]   Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
+[00:48.000 --> 00:52.000]   a colonel in the Israeli Air Force.
+[00:52.000 --> 00:58.000]   These men and women assumed great risk in the service to all humanity.
+[00:58.000 --> 01:03.000]   In an age when space flight has come to seem almost routine,
+[01:03.000 --> 01:07.000]   it is easy to overlook the dangers of travel by rocket
+[01:07.000 --> 01:12.000]   and the difficulties of navigating the fierce outer atmosphere of the Earth.
+[01:12.000 --> 01:18.000]   These astronauts knew the dangers, and they faced them willingly,
+[01:18.000 --> 01:23.000]   knowing they had a high and noble purpose in life.
+[01:23.000 --> 01:31.000]   Because of their courage and daring and idealism, we will miss them all the more.
+[01:31.000 --> 01:36.000]   All Americans today are thinking as well of the families of these men and women
+[01:36.000 --> 01:40.000]   who have been given this sudden shock and grief.
+[01:40.000 --> 01:45.000]   You're not alone. Our entire nation grieves with you,
+[01:45.000 --> 01:52.000]   and those you love will always have the respect and gratitude of this country.
+[01:52.000 --> 01:56.000]   The cause in which they died will continue.
+[01:56.000 --> 02:04.000]   Mankind is led into the darkness beyond our world by the inspiration of discovery
+[02:04.000 --> 02:11.000]   and the longing to understand. Our journey into space will go on.
+[02:11.000 --> 02:16.000]   In the skies today, we saw destruction and tragedy.
+[02:16.000 --> 02:22.000]   Yet farther than we can see, there is comfort and hope.
+[02:22.000 --> 02:29.000]   In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
+[02:29.000 --> 02:35.000]   who created all these. He who brings out the starry hosts one by one
+[02:35.000 --> 02:39.000]   and calls them each by name."
+[02:39.000 --> 02:46.000]   Because of His great power and mighty strength, not one of them is missing.
+[02:46.000 --> 02:55.000]   The same Creator who names the stars also knows the names of the seven souls we mourn today.
+[02:55.000 --> 03:01.000]   The crew of the shuttle Columbia did not return safely to earth,
+[03:01.000 --> 03:05.000]   yet we can pray that all are safely home.
+[03:05.000 --> 03:13.000]   May God bless the grieving families, and may God continue to bless America.
+[03:13.000 --> 03:41.000]   Audio


-[00:00:00.000 --> 00:00:08.000]   My fellow Americans, this day has brought terrible news and great sadness to our country.
-[00:00:08.000 --> 00:00:17.000]   At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
-[00:00:17.000 --> 00:00:23.000]   A short time later, debris was seen falling from the skies above Texas.
-[00:00:23.000 --> 00:00:29.000]   The Columbia's lost. There are no survivors.
-[00:00:29.000 --> 00:00:32.000]   On board was a crew of seven.
-[00:00:32.000 --> 00:00:39.000]   Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
-[00:00:39.000 --> 00:00:48.000]   Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
-[00:00:48.000 --> 00:00:52.000]   a colonel in the Israeli Air Force.
-[00:00:52.000 --> 00:00:58.000]   These men and women assumed great risk in the service to all humanity.
-[00:00:58.000 --> 00:01:03.000]   In an age when space flight has come to seem almost routine,
-[00:01:03.000 --> 00:01:07.000]   it is easy to overlook the dangers of travel by rocket
-[00:01:07.000 --> 00:01:12.000]   and the difficulties of navigating the fierce outer atmosphere of the Earth.
-[00:01:12.000 --> 00:01:18.000]   These astronauts knew the dangers, and they faced them willingly,
-[00:01:18.000 --> 00:01:23.000]   knowing they had a high and noble purpose in life.
-[00:01:23.000 --> 00:01:31.000]   Because of their courage and daring and idealism, we will miss them all the more.
-[00:01:31.000 --> 00:01:36.000]   All Americans today are thinking as well of the families of these men and women
-[00:01:36.000 --> 00:01:40.000]   who have been given this sudden shock and grief.
-[00:01:40.000 --> 00:01:45.000]   You're not alone. Our entire nation grieves with you,
-[00:01:45.000 --> 00:01:52.000]   and those you love will always have the respect and gratitude of this country.
-[00:01:52.000 --> 00:01:56.000]   The cause in which they died will continue.
-[00:01:56.000 --> 00:02:04.000]   Mankind is led into the darkness beyond our world by the inspiration of discovery
-[00:02:04.000 --> 00:02:11.000]   and the longing to understand. Our journey into space will go on.
-[00:02:11.000 --> 00:02:16.000]   In the skies today, we saw destruction and tragedy.
-[00:02:16.000 --> 00:02:22.000]   Yet farther than we can see, there is comfort and hope.
-[00:02:22.000 --> 00:02:29.000]   In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
-[00:02:29.000 --> 00:02:35.000]   who created all these. He who brings out the starry hosts one by one
-[00:02:35.000 --> 00:02:39.000]   and calls them each by name."
-[00:02:39.000 --> 00:02:46.000]   Because of His great power and mighty strength, not one of them is missing.
-[00:02:46.000 --> 00:02:55.000]   The same Creator who names the stars also knows the names of the seven souls we mourn today.
-[00:02:55.000 --> 00:03:01.000]   The crew of the shuttle Columbia did not return safely to earth,
-[00:03:01.000 --> 00:03:05.000]   yet we can pray that all are safely home.
-[00:03:05.000 --> 00:03:13.000]   May God bless the grieving families, and may God continue to bless America.
-[00:03:13.000 --> 00:03:19.000]   [Silence]
-
-
-whisper_print_timings:     fallbacks =   1 p /   0 h
-whisper_print_timings:     load time =   569.03 ms
-whisper_print_timings:      mel time =   146.85 ms
-whisper_print_timings:   sample time =   238.66 ms /   553 runs (    0.43 ms per run)
-whisper_print_timings:   encode time = 18665.10 ms /     9 runs ( 2073.90 ms per run)
-whisper_print_timings:   decode time = 13090.93 ms /   549 runs (   23.85 ms per run)
-whisper_print_timings:    total time = 32733.52 ms
+whisper_print_timings:     load time =   575.92 ms
+whisper_print_timings:      mel time =   230.60 ms
+whisper_print_timings:   sample time =    73.19 ms
+whisper_print_timings:   encode time = 19552.61 ms / 814.69 ms per layer
+whisper_print_timings:   decode time = 13249.96 ms / 552.08 ms per layer
+whisper_print_timings:    total time = 33686.27 ms
 ```
 </details>

 ## Real-time audio input example

 This is a naive example of performing real-time inference on audio from your microphone.
-The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
+The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

 ```java
-make stream
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@ -332,14 +320,14 @@ to highlight words with high or low confidence:

 ## Controlling the length of the generated text segments (experimental)

-For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
+For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`: 

 ```java
 ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
+system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | 

 main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

@ -363,7 +351,7 @@ The `--max-len` argument can be used to obtain word-level timestamps. Simply use

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
+system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | 

 main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

@ -433,19 +421,6 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 ---

-## Video comparison of different models
-
-Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
-
-```java
-./extra/bench-wts.sh samples/jfk.wav
-ffplay ./samples/jfk.wav.all.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
-
----
-
 ## Benchmarks

 In order to have an objective comparison of the performance of the inference across different system configurations,
@ -466,7 +441,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:

-- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -476,17 +451,9 @@ in [models](models).

 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
-  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
-- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
-- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
-  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
-  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
-- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
-  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
-  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
-- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
+- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

 ## Examples

@ -500,7 +467,6 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
 | [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
 | [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
 | [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
-| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
 | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
 | [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
 | [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
--- a/bindings/go/.gitignore
+++ b/bindings/go/.gitignore
@ -1,2 +1,3 @@
 build
 models
+go.sum
--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -1,27 +1,28 @@
-BUILD_DIR := build
-MODELS_DIR := models
+CMAKE := $(shell which cmake)
+BUILD_DIR := "build"
+MODELS_DIR := "models"
 EXAMPLES_DIR := $(wildcard examples/*)
-INCLUDE_PATH := $(abspath ../..)
-LIBRARY_PATH := $(abspath ../..)
+C_INCLUDE_PATH := "../.."

 all: clean whisper examples

 whisper: mkdir
 	@echo Build whisper
-	@${MAKE} -C ../.. libwhisper.a
+	@${CMAKE} -S ../.. -B ${BUILD_DIR} -D BUILD_SHARED_LIBS=off -D WHISPER_NO_AVX2=on
+	@${CMAKE} --build ${BUILD_DIR} --target whisper

 test: model-small whisper modtidy
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...
+	@go test -v .
+	@go test -v ./pkg/whisper/...

 examples: $(EXAMPLES_DIR)

 model-small: mkdir examples/go-model-download
-	@${BUILD_DIR}/go-model-download -out models ggml-small.en.bin
+	@${BUILD_DIR}/go-model-download -out models small.en

 $(EXAMPLES_DIR): mkdir whisper modtidy
 	@echo Build example $(notdir $@)
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@
+	@go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@

 mkdir:
 	@echo Mkdir ${BUILD_DIR}
--- a/bindings/go/README.md
+++ b/bindings/go/README.md
@ -74,27 +74,4 @@ And you can then test a model against samples with the following command:
 ./build/go-whisper -model models/ggml-tiny.en.bin samples/jfk.wav 
 ```

-## Using the bindings
-
-To use the bindings in your own software,
-
-  1. Import `github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper` (or `github.com/ggerganov/whisper.cpp/bindings/go`) into your package;
-  2. Compile `libwhisper.a` (you can use `make whisper` in the `bindings/go` directory);
-  3. Link your go binary against whisper by setting the environment variables `C_INCLUDE_PATH` and `LIBRARY_PATH`
-     to point to the `whisper.h` file directory and `libwhisper.a` file directory respectively.
-
-Look at the `Makefile` in the `bindings/go` directory for an example.
-
-The API Documentation:
-
-  * https://pkg.go.dev/github.com/ggerganov/whisper.cpp/bindings/go
-  * https://pkg.go.dev/github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper
-
-Getting help:
-
-  * Follow the discussion for the go bindings [here](https://github.com/ggerganov/whisper.cpp/discussions/312)
-
-## License
-
-The license for the Go bindings is the same as the license for the rest of the whisper.cpp project, which is the MIT License. See the `LICENSE` file for more details.

--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -17,14 +17,15 @@ import (
 // CONSTANTS

 const (
-	srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
-	srcExt  = ".bin"                                                      // Filename extension
-	bufSize = 1024 * 64                                                   // Size of the buffer used for downloading the model
+	srcUrl        = "https://huggingface.co/"                           // The location of the models
+	srcPathPrefix = "/datasets/ggerganov/whisper.cpp/resolve/main/ggml" // Filename prefix
+	srcExt        = ".bin"                                              // Filename extension
+	bufSize       = 1024 * 64                                           // Size of the buffer used for downloading the model
 )

 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
+	modelNames = []string{"tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large-v1", "large"}
 )

 var (
@ -122,14 +123,11 @@ func GetModels() []string {

 // URLForModel returns the URL for the given model on huggingface.co
 func URLForModel(model string) (string, error) {
-	if filepath.Ext(model) != srcExt {
-		model += srcExt
-	}
 	url, err := url.Parse(srcUrl)
 	if err != nil {
 		return "", err
 	} else {
-		url.Path = filepath.Join(url.Path, model)
+		url.Path = srcPathPrefix + "-" + model + srcExt
 	}
 	return url.String(), nil
 }
--- a/bindings/go/examples/go-whisper/color.go
+++ b/bindings/go/examples/go-whisper/color.go
@ -1,22 +0,0 @@
-package main
-
-import "fmt"
-
-///////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-const (
-	Reset     = "\033[0m"
-	RGBPrefix = "\033[38;5;" // followed by RGB values in decimal format separated by colons
-	RGBSuffix = "m"
-)
-
-///////////////////////////////////////////////////////////////////////////////
-// PUBLIC METHODS
-
-// Colorize text with RGB values, from 0 to 23
-func Colorize(text string, v int) string {
-	// https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit
-	// Grayscale colors are in the range 232-255
-	return RGBPrefix + fmt.Sprint(v%24+232) + RGBSuffix + text + Reset
-}
--- a/bindings/go/examples/go-whisper/flags.go
+++ b/bindings/go/examples/go-whisper/flags.go
@ -2,12 +2,6 @@ package main

 import (
 	"flag"
-	"fmt"
-	"strings"
-	"time"
-
-	// Packages
-	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 )

 ///////////////////////////////////////////////////////////////////////////////
@ -48,26 +42,6 @@ func (flags *Flags) GetLanguage() string {
 	return flags.Lookup("language").Value.String()
 }

-func (flags *Flags) IsTranslate() bool {
-	return flags.Lookup("translate").Value.(flag.Getter).Get().(bool)
-}
-
-func (flags *Flags) GetOffset() time.Duration {
-	return flags.Lookup("offset").Value.(flag.Getter).Get().(time.Duration)
-}
-
-func (flags *Flags) GetDuration() time.Duration {
-	return flags.Lookup("duration").Value.(flag.Getter).Get().(time.Duration)
-}
-
-func (flags *Flags) GetThreads() uint {
-	return flags.Lookup("threads").Value.(flag.Getter).Get().(uint)
-}
-
-func (flags *Flags) GetOut() string {
-	return strings.ToLower(flags.Lookup("out").Value.String())
-}
-
 func (flags *Flags) IsSpeedup() bool {
 	return flags.Lookup("speedup").Value.String() == "true"
 }
@ -76,81 +50,12 @@ func (flags *Flags) IsTokens() bool {
 	return flags.Lookup("tokens").Value.String() == "true"
 }

-func (flags *Flags) IsColorize() bool {
-	return flags.Lookup("colorize").Value.String() == "true"
-}
-
-func (flags *Flags) GetMaxLen() uint {
-	return flags.Lookup("max-len").Value.(flag.Getter).Get().(uint)
-}
-
-func (flags *Flags) GetMaxTokens() uint {
-	return flags.Lookup("max-tokens").Value.(flag.Getter).Get().(uint)
-}
-
-func (flags *Flags) GetWordThreshold() float32 {
-	return float32(flags.Lookup("word-thold").Value.(flag.Getter).Get().(float64))
-}
-
-func (flags *Flags) SetParams(context whisper.Context) error {
-	if lang := flags.GetLanguage(); lang != "" && lang != "auto" {
-		fmt.Fprintf(flags.Output(), "Setting language to %q\n", lang)
-		if err := context.SetLanguage(lang); err != nil {
-			return err
-		}
-	}
-	if flags.IsTranslate() && context.IsMultilingual() {
-		fmt.Fprintf(flags.Output(), "Setting translate to true\n")
-		context.SetTranslate(true)
-	}
-	if offset := flags.GetOffset(); offset != 0 {
-		fmt.Fprintf(flags.Output(), "Setting offset to %v\n", offset)
-		context.SetOffset(offset)
-	}
-	if duration := flags.GetDuration(); duration != 0 {
-		fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
-		context.SetDuration(duration)
-	}
-	if flags.IsSpeedup() {
-		fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
-		context.SetSpeedup(true)
-	}
-	if threads := flags.GetThreads(); threads != 0 {
-		fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
-		context.SetThreads(threads)
-	}
-	if max_len := flags.GetMaxLen(); max_len != 0 {
-		fmt.Fprintf(flags.Output(), "Setting max_segment_length to %d\n", max_len)
-		context.SetMaxSegmentLength(max_len)
-	}
-	if max_tokens := flags.GetMaxTokens(); max_tokens != 0 {
-		fmt.Fprintf(flags.Output(), "Setting max_tokens to %d\n", max_tokens)
-		context.SetMaxTokensPerSegment(max_tokens)
-	}
-	if word_threshold := flags.GetWordThreshold(); word_threshold != 0 {
-		fmt.Fprintf(flags.Output(), "Setting word_threshold to %f\n", word_threshold)
-		context.SetTokenThreshold(word_threshold)
-	}
-
-	// Return success
-	return nil
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS

 func registerFlags(flag *Flags) {
 	flag.String("model", "", "Path to the model file")
-	flag.String("language", "", "Spoken language")
-	flag.Bool("translate", false, "Translate from source language to english")
-	flag.Duration("offset", 0, "Time offset")
-	flag.Duration("duration", 0, "Duration of audio to process")
-	flag.Uint("threads", 0, "Number of threads to use")
+	flag.String("language", "", "Language")
 	flag.Bool("speedup", false, "Enable speedup")
-	flag.Uint("max-len", 0, "Maximum segment length in characters")
-	flag.Uint("max-tokens", 0, "Maximum tokens per segment")
-	flag.Float64("word-thold", 0, "Maximum segment score")
 	flag.Bool("tokens", false, "Display tokens")
-	flag.Bool("colorize", false, "Colorize tokens")
-	flag.String("out", "", "Output format (srt, none or leave as empty string)")
 }
--- a/bindings/go/examples/go-whisper/main.go
+++ b/bindings/go/examples/go-whisper/main.go
@ -35,7 +35,8 @@ func main() {

 	// Process files
 	for _, filename := range flags.Args() {
-		if err := Process(model, filename, flags); err != nil {
+		fmt.Println("Processing", filename)
+		if err := Process(model, filename, flags.GetLanguage(), flags.IsSpeedup(), flags.IsTokens()); err != nil {
 			fmt.Fprintln(os.Stderr, err)
 			continue
 		}
--- a/bindings/go/examples/go-whisper/process.go
+++ b/bindings/go/examples/go-whisper/process.go
@ -11,7 +11,7 @@ import (
 	wav "github.com/go-audio/wav"
 )

-func Process(model whisper.Model, path string, flags *Flags) error {
+func Process(model whisper.Model, path string, lang string, speedup, tokens bool) error {
 	var data []float32

 	// Create processing context
@ -20,22 +20,14 @@ func Process(model whisper.Model, path string, flags *Flags) error {
 		return err
 	}

-	// Set the parameters
-	if err := flags.SetParams(context); err != nil {
-		return err
-	}
-
-	fmt.Printf("\n%s\n", context.SystemInfo())
-
 	// Open the file
-	fmt.Fprintf(flags.Output(), "Loading %q\n", path)
 	fh, err := os.Open(path)
 	if err != nil {
 		return err
 	}
 	defer fh.Close()

-	// Decode the WAV file - load the full buffer
+	// Decode the WAV file
 	dec := wav.NewDecoder(fh)
 	if buf, err := dec.FullPCMBuffer(); err != nil {
 		return err
@ -47,86 +39,42 @@ func Process(model whisper.Model, path string, flags *Flags) error {
 		data = buf.AsFloat32Buffer().Data
 	}

-	// Segment callback when -tokens is specified
+	// Set the parameters
 	var cb whisper.SegmentCallback
-	if flags.IsTokens() {
+	if lang != "" {
+		if err := context.SetLanguage(lang); err != nil {
+			return err
+		}
+	}
+	if speedup {
+		context.SetSpeedup(true)
+	}
+	if tokens {
 		cb = func(segment whisper.Segment) {
-			fmt.Fprintf(flags.Output(), "%02d [%6s->%6s] ", segment.Num, segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
+			fmt.Printf("%02d [%6s->%6s] ", segment.Num, segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
 			for _, token := range segment.Tokens {
-				if flags.IsColorize() && context.IsText(token) {
-					fmt.Fprint(flags.Output(), Colorize(token.Text, int(token.P*24.0)), " ")
-				} else {
-					fmt.Fprint(flags.Output(), token.Text, " ")
-				}
+				fmt.Printf("%q ", token.Text)
 			}
-			fmt.Fprintln(flags.Output(), "")
-			fmt.Fprintln(flags.Output(), "")
+			fmt.Println("")
 		}
 	}

 	// Process the data
-	fmt.Fprintf(flags.Output(), "  ...processing %q\n", path)
-	context.ResetTimings()
 	if err := context.Process(data, cb); err != nil {
 		return err
 	}

-	context.PrintTimings()
-
 	// Print out the results
-	switch {
-	case flags.GetOut() == "srt":
-		return OutputSRT(os.Stdout, context)
-	case flags.GetOut() == "none":
-		return nil
-	default:
-		return Output(os.Stdout, context, flags.IsColorize())
-	}
-}
-
-// Output text as SRT file
-func OutputSRT(w io.Writer, context whisper.Context) error {
-	n := 1
 	for {
 		segment, err := context.NextSegment()
 		if err == io.EOF {
-			return nil
+			break
 		} else if err != nil {
 			return err
 		}
-		fmt.Fprintln(w, n)
-		fmt.Fprintln(w, srtTimestamp(segment.Start), " --> ", srtTimestamp(segment.End))
-		fmt.Fprintln(w, segment.Text)
-		fmt.Fprintln(w, "")
-		n++
+		fmt.Printf("[%6s->%6s] %s\n", segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond), segment.Text)
 	}
-}

-// Output text to terminal
-func Output(w io.Writer, context whisper.Context, colorize bool) error {
-	for {
-		segment, err := context.NextSegment()
-		if err == io.EOF {
-			return nil
-		} else if err != nil {
-			return err
-		}
-		fmt.Fprintf(w, "[%6s->%6s]", segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
-		if colorize {
-			for _, token := range segment.Tokens {
-				if !context.IsText(token) {
-					continue
-				}
-				fmt.Fprint(w, " ", Colorize(token.Text, int(token.P*24.0)))
-			}
-			fmt.Fprint(w, "\n")
-		} else {
-			fmt.Fprintln(w, " ", segment.Text)
-		}
-	}
-}
-
-// Return srtTimestamp
-func srtTimestamp(t time.Duration) string {
-	return fmt.Sprintf("%02d:%02d:%02d,%03d", t/time.Hour, (t%time.Hour)/time.Minute, (t%time.Minute)/time.Second, (t%time.Second)/time.Millisecond)
+	// Return success
+	return nil
 }
--- a/bindings/go/go.sum
+++ b/bindings/go/go.sum
@ -1,23 +0,0 @@
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
-github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
-github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
-github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
-github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
-github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
-github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
-github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
-github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
-github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -1,5 +1,8 @@
 package whisper

+// This file defines the whisper_token, whisper_token_data and whisper_full_params
+// structures, which are used by the whisper_full() function.
+
 import (
 	"fmt"
 )
@ -47,12 +50,7 @@ func (p *Params) SetSpeedup(v bool) {
 	p.speed_up = toBool(v)
 }

-// Set language id
 func (p *Params) SetLanguage(lang int) error {
-	if lang == -1 {
-		p.language = nil
-		return nil
-	}
 	str := C.whisper_lang_str(C.int(lang))
 	if str == nil {
 		return ErrInvalidLanguage
@ -62,7 +60,6 @@ func (p *Params) SetLanguage(lang int) error {
 	return nil
 }

-// Get language id
 func (p *Params) Language() int {
 	if p.language == nil {
 		return -1
@ -70,46 +67,18 @@ func (p *Params) Language() int {
 	return int(C.whisper_lang_id(p.language))
 }

-// Threads available
-func (p *Params) Threads() int {
-	return int(p.n_threads)
-}
-
-// Set number of threads to use
 func (p *Params) SetThreads(threads int) {
 	p.n_threads = C.int(threads)
 }

-// Set start offset in ms
 func (p *Params) SetOffset(offset_ms int) {
 	p.offset_ms = C.int(offset_ms)
 }

-// Set audio duration to process in ms
 func (p *Params) SetDuration(duration_ms int) {
 	p.duration_ms = C.int(duration_ms)
 }

-// Set timestamp token probability threshold (~0.01)
-func (p *Params) SetTokenThreshold(t float32) {
-	p.thold_pt = C.float(t)
-}
-
-// Set timestamp token sum probability threshold (~0.01)
-func (p *Params) SetTokenSumThreshold(t float32) {
-	p.thold_ptsum = C.float(t)
-}
-
-// Set max segment length in characters
-func (p *Params) SetMaxSegmentLength(n int) {
-	p.max_len = C.int(n)
-}
-
-// Set max tokens per segment (0 = no limit)
-func (p *Params) SetMaxTokensPerSegment(n int) {
-	p.max_tokens = C.int(n)
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS

--- a/bindings/go/pkg/whisper/consts.go
+++ b/bindings/go/pkg/whisper/consts.go
@ -11,11 +11,10 @@ import (
 // ERRORS

 var (
-	ErrUnableToLoadModel    = errors.New("unable to load model")
-	ErrInternalAppError     = errors.New("internal application error")
-	ErrProcessingFailed     = errors.New("processing failed")
-	ErrUnsupportedLanguage  = errors.New("unsupported language")
-	ErrModelNotMultilingual = errors.New("model is not multilingual")
+	ErrUnableToLoadModel   = errors.New("unable to load model")
+	ErrInternalAppError    = errors.New("internal application error")
+	ErrProcessingFailed    = errors.New("processing failed")
+	ErrUnsupportedLanguage = errors.New("unsupported language")
 )

 ///////////////////////////////////////////////////////////////////////////////
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -1,9 +1,7 @@
 package whisper

 import (
-	"fmt"
 	"io"
-	"runtime"
 	"strings"
 	"time"

@ -26,7 +24,7 @@ var _ Context = (*context)(nil)
 ///////////////////////////////////////////////////////////////////////////////
 // LIFECYCLE

-func newContext(model *model, params whisper.Params) (Context, error) {
+func NewContext(model *model, params whisper.Params) (Context, error) {
 	context := new(context)
 	context.model = model
 	context.params = params
@ -43,13 +41,7 @@ func (context *context) SetLanguage(lang string) error {
 	if context.model.ctx == nil {
 		return ErrInternalAppError
 	}
-	if !context.model.IsMultilingual() {
-		return ErrModelNotMultilingual
-	}
-
-	if lang == "auto" {
-		context.params.SetLanguage(-1)
-	} else if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
+	if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
 		return ErrUnsupportedLanguage
 	} else if err := context.params.SetLanguage(id); err != nil {
 		return err
@ -58,94 +50,16 @@ func (context *context) SetLanguage(lang string) error {
 	return nil
 }

-func (context *context) IsMultilingual() bool {
-	return context.model.IsMultilingual()
-}
-
 // Get language
 func (context *context) Language() string {
-	id := context.params.Language()
-	if id == -1 {
-		return "auto"
-	}
 	return whisper.Whisper_lang_str(context.params.Language())
 }

-// Set translate flag
-func (context *context) SetTranslate(v bool) {
-	context.params.SetTranslate(v)
-}
-
 // Set speedup flag
 func (context *context) SetSpeedup(v bool) {
 	context.params.SetSpeedup(v)
 }

-// Set number of threads to use
-func (context *context) SetThreads(v uint) {
-	context.params.SetThreads(int(v))
-}
-
-// Set time offset
-func (context *context) SetOffset(v time.Duration) {
-	context.params.SetOffset(int(v.Milliseconds()))
-}
-
-// Set duration of audio to process
-func (context *context) SetDuration(v time.Duration) {
-	context.params.SetOffset(int(v.Milliseconds()))
-}
-
-// Set timestamp token probability threshold (~0.01)
-func (context *context) SetTokenThreshold(t float32) {
-	context.params.SetTokenThreshold(t)
-}
-
-// Set timestamp token sum probability threshold (~0.01)
-func (context *context) SetTokenSumThreshold(t float32) {
-	context.params.SetTokenSumThreshold(t)
-}
-
-// Set max segment length in characters
-func (context *context) SetMaxSegmentLength(n uint) {
-	context.params.SetMaxSegmentLength(int(n))
-}
-
-// Set max tokens per segment (0 = no limit)
-func (context *context) SetMaxTokensPerSegment(n uint) {
-	context.params.SetMaxTokensPerSegment(int(n))
-}
-
-// ResetTimings resets the mode timings. Should be called before processing
-func (context *context) ResetTimings() {
-	context.model.ctx.Whisper_reset_timings()
-}
-
-// PrintTimings prints the model timings to stdout.
-func (context *context) PrintTimings() {
-	context.model.ctx.Whisper_print_timings()
-}
-
-// SystemInfo returns the system information
-func (context *context) SystemInfo() string {
-	return fmt.Sprintf("system_info: n_threads = %d / %d | %s\n",
-		context.params.Threads(),
-		runtime.NumCPU(),
-		whisper.Whisper_print_system_info(),
-	)
-}
-
-// Use mel data at offset_ms to try and auto-detect the spoken language
-// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
-// Returns the probabilities of all languages.
-func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]float32, error) {
-	langProbs, err := context.model.ctx.Whisper_lang_auto_detect(offset_ms, n_threads)
-	if err != nil {
-		return nil, err
-	}
-	return langProbs, nil
-}
-
 // Process new sample data and return any errors
 func (context *context) Process(data []float32, cb SegmentCallback) error {
 	if context.model.ctx == nil {
@ -205,65 +119,6 @@ func (context *context) NextSegment() (Segment, error) {
 	return result, nil
 }

-// Test for text tokens
-func (context *context) IsText(t Token) bool {
-	switch {
-	case context.IsBEG(t):
-		return false
-	case context.IsSOT(t):
-		return false
-	case whisper.Token(t.Id) >= context.model.ctx.Whisper_token_eot():
-		return false
-	case context.IsPREV(t):
-		return false
-	case context.IsSOLM(t):
-		return false
-	case context.IsNOT(t):
-		return false
-	default:
-		return true
-	}
-}
-
-// Test for "begin" token
-func (context *context) IsBEG(t Token) bool {
-	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_beg()
-}
-
-// Test for "start of transcription" token
-func (context *context) IsSOT(t Token) bool {
-	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_sot()
-}
-
-// Test for "end of transcription" token
-func (context *context) IsEOT(t Token) bool {
-	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_eot()
-}
-
-// Test for "start of prev" token
-func (context *context) IsPREV(t Token) bool {
-	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_prev()
-}
-
-// Test for "start of lm" token
-func (context *context) IsSOLM(t Token) bool {
-	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_solm()
-}
-
-// Test for "No timestamps" token
-func (context *context) IsNOT(t Token) bool {
-	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_not()
-}
-
-// Test for token associated with a specific language
-func (context *context) IsLANG(t Token, lang string) bool {
-	if id := context.model.ctx.Whisper_lang_id(lang); id >= 0 {
-		return whisper.Token(t.Id) == context.model.ctx.Whisper_token_lang(id)
-	} else {
-		return false
-	}
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS

--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -20,28 +20,15 @@ type Model interface {
 	// Return a new speech-to-text context.
 	NewContext() (Context, error)

-	// Return true if the model is multilingual.
-	IsMultilingual() bool
-
 	// Return all languages supported.
 	Languages() []string
 }

 // Context is the speach recognition context.
 type Context interface {
-	SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
-	SetTranslate(bool)        // Set translate flag
-	IsMultilingual() bool     // Return true if the model is multilingual.
+	SetLanguage(string) error // Set the language to use for speech recognition.
 	Language() string         // Get language
-
-	SetOffset(time.Duration)      // Set offset
-	SetDuration(time.Duration)    // Set duration
-	SetThreads(uint)              // Set number of threads to use
-	SetSpeedup(bool)              // Set speedup flag
-	SetTokenThreshold(float32)    // Set timestamp token probability threshold
-	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
-	SetMaxSegmentLength(uint)     // Set max segment length in characters
-	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)
+	SetSpeedup(bool)          // Set speedup flag

 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
@ -51,21 +38,6 @@ type Context interface {
 	// After process is called, return segments until the end of the stream
 	// is reached, when io.EOF is returned.
 	NextSegment() (Segment, error)
-
-	IsBEG(Token) bool          // Test for "begin" token
-	IsSOT(Token) bool          // Test for "start of transcription" token
-	IsEOT(Token) bool          // Test for "end of transcription" token
-	IsPREV(Token) bool         // Test for "start of prev" token
-	IsSOLM(Token) bool         // Test for "start of lm" token
-	IsNOT(Token) bool          // Test for "No timestamps" token
-	IsLANG(Token, string) bool // Test for token associated with a specific language
-	IsText(Token) bool         // Test for text token
-
-	// Timings
-	PrintTimings()
-	ResetTimings()
-
-	SystemInfo() string
 }

 // Segment is the text result of a speech recognition.
--- a/bindings/go/pkg/whisper/model.go
+++ b/bindings/go/pkg/whisper/model.go
@ -23,7 +23,7 @@ var _ Model = (*model)(nil)
 ///////////////////////////////////////////////////////////////////////////////
 // LIFECYCLE

-func New(path string) (Model, error) {
+func New(path string) (*model, error) {
 	model := new(model)
 	if _, err := os.Stat(path); err != nil {
 		return nil, err
@ -64,11 +64,6 @@ func (model *model) String() string {
 ///////////////////////////////////////////////////////////////////////////////
 // PUBLIC METHODS

-// Return true if model is multilingual (language and translation options are supported)
-func (model *model) IsMultilingual() bool {
-	return model.ctx.Whisper_is_multilingual() != 0
-}
-
 // Return all recognized languages. Initially it is set to auto-detect
 func (model *model) Languages() []string {
 	result := make([]string, 0, whisper.Whisper_lang_max_id())
@ -94,8 +89,7 @@ func (model *model) NewContext() (Context, error) {
 	params.SetPrintRealtime(false)
 	params.SetPrintTimestamps(false)
 	params.SetThreads(runtime.NumCPU())
-	params.SetNoContext(true)

 	// Return new context
-	return newContext(model, params)
+	return NewContext(model, params)
 }
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -9,7 +9,8 @@ import (
 // CGO

 /*
-#cgo LDFLAGS: -lwhisper -lm -lstdc++
+#cgo CFLAGS: -I${SRCDIR}/../..
+#cgo LDFLAGS: -L${SRCDIR}/build -lwhisper -lm -lstdc++
 #cgo darwin LDFLAGS: -framework Accelerate
 #include <whisper.h>
 #include <stdlib.h>
@ -20,7 +21,7 @@ extern bool callEncoderBegin(void* user_data);
 // Text segment callback
 // Called on every newly generated text segment
 // Use the whisper_full_...() functions to obtain the text segments
-static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
+static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        callNewSegment(user_data, n_new);
    }
@ -29,7 +30,7 @@ static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_s
 // Encoder begin callback
 // If not NULL, called before the encoder starts
 // If it returns false, the computation is aborted
-static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
+static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        return callEncoderBegin(user_data);
    }
@ -91,7 +92,7 @@ var (
 func Whisper_init(path string) *Context {
 	cPath := C.CString(path)
 	defer C.free(unsafe.Pointer(cPath))
-	if ctx := C.whisper_init_from_file(cPath); ctx != nil {
+	if ctx := C.whisper_init(cPath); ctx != nil {
 		return (*Context)(ctx)
 	} else {
 		return nil
@ -147,6 +148,16 @@ func (ctx *Context) Whisper_decode(tokens []Token, past, threads int) error {
 	}
 }

+// whisper_sample_best() returns the token with the highest probability
+func (ctx *Context) Whisper_sample_best() TokenData {
+	return TokenData(C.whisper_sample_best((*C.struct_whisper_context)(ctx)))
+}
+
+// whisper_sample_timestamp() returns the most probable timestamp token
+func (ctx *Context) Whisper_sample_timestamp(is_initial bool) TokenData {
+	return TokenData(C.whisper_sample_timestamp((*C.struct_whisper_context)(ctx), C.bool(is_initial)))
+}
+
 // Convert the provided text into tokens. The tokens pointer must be large enough to hold the resulting tokens.
 // Returns the number of tokens on success
 func (ctx *Context) Whisper_tokenize(text string, tokens []Token) (int, error) {
@ -160,10 +171,6 @@ func (ctx *Context) Whisper_tokenize(text string, tokens []Token) (int, error) {
 }

 // Return the id of the specified language, returns -1 if not found
-// Examples:
-//
-//	"de" -> 2
-//	"german" -> 2
 func (ctx *Context) Whisper_lang_id(lang string) int {
 	return int(C.whisper_lang_id(C.CString(lang)))
 }
@ -204,10 +211,6 @@ func (ctx *Context) Whisper_n_text_ctx() int {
 	return int(C.whisper_n_text_ctx((*C.struct_whisper_context)(ctx)))
 }

-func (ctx *Context) Whisper_n_audio_ctx() int {
-	return int(C.whisper_n_audio_ctx((*C.struct_whisper_context)(ctx)))
-}
-
 func (ctx *Context) Whisper_is_multilingual() int {
 	return int(C.whisper_is_multilingual((*C.struct_whisper_context)(ctx)))
 }
--- a/bindings/go/whisper_test.go
+++ b/bindings/go/whisper_test.go
@ -50,10 +50,7 @@ func Test_Whisper_001(t *testing.T) {
 	ctx := whisper.Whisper_init(ModelPath)
 	assert.NotNil(ctx)
 	defer ctx.Whisper_free()
-	params := ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)
-	data := buf.AsFloat32Buffer().Data
-	err = ctx.Whisper_full(params, data, nil, nil)
-	assert.NoError(err)
+	assert.NoError(ctx.Whisper_full(ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY), buf.AsFloat32Buffer().Data, nil, nil))

 	// Print out tokens
 	num_segments := ctx.Whisper_full_n_segments()
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/emscripten.cpp
+++ b/bindings/javascript/emscripten.cpp
@ -20,7 +20,7 @@ struct whisper_context * g_context;
 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_context == nullptr) {
-            g_context = whisper_init_from_file(path_model.c_str());
+            g_context = whisper_init(path_model.c_str());
            if (g_context != nullptr) {
                return true;
            } else {
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.2.1",
+  "version": "1.0.4",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/bindings/ruby/ext/.gitignore
+++ b/bindings/ruby/ext/.gitignore
@ -1,7 +0,0 @@
-Makefile
-ggml.c
-ggml.h
-whisper.bundle
-whisper.cpp
-whisper.h
-dr_wav.h
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -1,21 +0,0 @@
-require 'mkmf'
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
-
-
-# need to use c++ compiler flags
-$CXXFLAGS << ' -std=c++11'
-# Set to true when building binary gems
-if enable_config('static-stdlib', false)
-  $LDFLAGS << ' -static-libgcc -static-libstdc++'
-end
-
-if enable_config('march-tune-native', false)
-  $CFLAGS << ' -march=native -mtune=native'
-  $CXXFLAGS << ' -march=native -mtune=native'
-end
-
-create_makefile('whisper')
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@ -1,426 +0,0 @@
-#include <ruby.h>
-#include "ruby_whisper.h"
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-#include <cmath>
-#include <fstream>
-#include <cstdio>
-#include <string>
-#include <thread>
-#include <vector>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define BOOL_PARAMS_SETTER(self, prop, value) \
-  ruby_whisper_params *rwp; \
-  Data_Get_Struct(self, ruby_whisper_params, rwp); \
-  if (value == Qfalse || value == Qnil) { \
-    rwp->params.prop = false; \
-  } else { \
-    rwp->params.prop = true; \
-  } \
-  return value; \
-
-#define BOOL_PARAMS_GETTER(self,  prop) \
-  ruby_whisper_params *rwp; \
-  Data_Get_Struct(self, ruby_whisper_params, rwp); \
-  if (rwp->params.prop) { \
-    return Qtrue; \
-  } else { \
-    return Qfalse; \
-  }
-
-VALUE mWhisper;
-VALUE cContext;
-VALUE cParams;
-
-static void ruby_whisper_free(ruby_whisper *rw) {
-  if (rw->context) {
-    whisper_free(rw->context);
-    rw->context = NULL;
-  }
-}
-static void ruby_whisper_params_free(ruby_whisper_params *rwp) {
-}
-
-void rb_whisper_mark(ruby_whisper *rw) {
-  // call rb_gc_mark on any ruby references in rw
-}
-
-void rb_whisper_free(ruby_whisper *rw) {
-  ruby_whisper_free(rw);
-  free(rw);
-}
-
-void rb_whisper_params_mark(ruby_whisper_params *rwp) {
-}
-
-void rb_whisper_params_free(ruby_whisper_params *rwp) {
-  ruby_whisper_params_free(rwp);
-  free(rwp);
-}
-
-static VALUE ruby_whisper_allocate(VALUE klass) {
-  ruby_whisper *rw;
-  rw = ALLOC(ruby_whisper);
-  rw->context = NULL;
-  return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
-}
-
-static VALUE ruby_whisper_params_allocate(VALUE klass) {
-  ruby_whisper_params *rwp;
-  rwp = ALLOC(ruby_whisper_params);
-  rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-  return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
-}
-
-static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
-  ruby_whisper *rw;
-  VALUE whisper_model_file_path;
-
-  // TODO: we can support init from buffer here too maybe another ruby object to expose
-  rb_scan_args(argc, argv, "01", &whisper_model_file_path);
-  Data_Get_Struct(self, ruby_whisper, rw);
-
-  if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
-    rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
-  }
-  rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
-  if (rw->context == nullptr) {
-    rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
-  }
-  return self;
-}
-
-/*
- * transcribe a single file
- * can emit to a block results
- *
- **/
-static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
-  ruby_whisper *rw;
-  ruby_whisper_params *rwp;
-  VALUE wave_file_path, blk, params;
-
-  rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
-  Data_Get_Struct(self, ruby_whisper, rw);
-  Data_Get_Struct(params, ruby_whisper_params, rwp);
-
-  if (!rb_respond_to(wave_file_path, rb_intern("to_s"))) {
-    rb_raise(rb_eRuntimeError, "Expected file path to wave file");
-  }
-
-  std::string fname_inp = StringValueCStr(wave_file_path);
-
-  std::vector<float> pcmf32; // mono-channel F32 PCM
-  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-  // WAV input - this is directly from main.cpp example
-  {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-    if (fname_inp == "-") {
-      {
-        uint8_t buf[1024];
-        while (true) {
-          const size_t n = fread(buf, 1, sizeof(buf), stdin);
-          if (n == 0) {
-            break;
-          }
-          wav_data.insert(wav_data.end(), buf, buf + n);
-        }
-      }
-
-      if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open WAV file from stdin\n");
-        return self;
-      }
-
-      fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    } else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
-      fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
-      return self;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-      fprintf(stderr, "WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
-      return self;
-    }
-
-    if (rwp->diarize && wav.channels != 2 && rwp->params.print_timestamps == false) {
-      fprintf(stderr, "WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
-      return self;
-    }
-
-    if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
-      fprintf(stderr, "WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
-      return self;
-    }
-
-    if (wav.bitsPerSample != 16) {
-      fprintf(stderr, "WAV file '%s' must be 16-bit\n", fname_inp.c_str());
-      return self;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-      for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[i])/32768.0f;
-      }
-    } else {
-      for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-      }
-    }
-
-    if (rwp->diarize) {
-      // convert to stereo, float
-      pcmf32s.resize(2);
-
-      pcmf32s[0].resize(n);
-      pcmf32s[1].resize(n);
-      for (uint64_t i = 0; i < n; i++) {
-        pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-        pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-      }
-    }
-  }
-  {
-    static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-      bool is_aborted = *(bool*)user_data;
-      return !is_aborted;
-    };
-    rwp->params.encoder_begin_callback_user_data = &is_aborted;
-  }
-
-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
-    fprintf(stderr, "failed to process audio\n");
-    return self;
-  }
-  const int n_segments = whisper_full_n_segments(rw->context);
-  VALUE output = rb_str_new2("");
-  for (int i = 0; i < n_segments; ++i) {
-    const char * text = whisper_full_get_segment_text(rw->context, i);
-    output = rb_str_concat(output, rb_str_new2(text));
-  }
-  VALUE idCall = rb_intern("call");
-  if (blk != Qnil) {
-    rb_funcall(blk, idCall, 1, output);
-  }
-  return self;
-}
-
-/*
- * params.language = "auto" | "en", etc...
- */
-static VALUE ruby_whisper_params_set_language(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (value == Qfalse || value == Qnil) {
-    rwp->params.language = "auto";
-  } else {
-    rwp->params.language = StringValueCStr(value);
-  }
-  return value;
-}
-static VALUE ruby_whisper_params_get_language(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (rwp->params.language) {
-    return rb_str_new2(rwp->params.language);
-  } else {
-    return rb_str_new2("auto");
-  }
-}
-static VALUE ruby_whisper_params_set_translate(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, translate, value)
-}
-static VALUE ruby_whisper_params_get_translate(VALUE self) {
-  BOOL_PARAMS_GETTER(self, translate)
-}
-static VALUE ruby_whisper_params_set_no_context(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, no_context, value)
-}
-static VALUE ruby_whisper_params_get_no_context(VALUE self) {
-  BOOL_PARAMS_GETTER(self, no_context)
-}
-static VALUE ruby_whisper_params_set_single_segment(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, single_segment, value)
-}
-static VALUE ruby_whisper_params_get_single_segment(VALUE self) {
-  BOOL_PARAMS_GETTER(self, single_segment)
-}
-static VALUE ruby_whisper_params_set_print_special(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_special, value)
-}
-static VALUE ruby_whisper_params_get_print_special(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_special)
-}
-static VALUE ruby_whisper_params_set_print_progress(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_progress, value)
-}
-static VALUE ruby_whisper_params_get_print_progress(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_progress)
-}
-static VALUE ruby_whisper_params_set_print_realtime(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_realtime, value)
-}
-static VALUE ruby_whisper_params_get_print_realtime(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_realtime)
-}
-static VALUE ruby_whisper_params_set_print_timestamps(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, print_timestamps, value)
-}
-static VALUE ruby_whisper_params_get_print_timestamps(VALUE self) {
-  BOOL_PARAMS_GETTER(self, print_timestamps)
-}
-static VALUE ruby_whisper_params_set_suppress_blank(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, suppress_blank, value)
-}
-static VALUE ruby_whisper_params_get_suppress_blank(VALUE self) {
-  BOOL_PARAMS_GETTER(self, suppress_blank)
-}
-static VALUE ruby_whisper_params_set_suppress_non_speech_tokens(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, suppress_non_speech_tokens, value)
-}
-static VALUE ruby_whisper_params_get_suppress_non_speech_tokens(VALUE self) {
-  BOOL_PARAMS_GETTER(self, suppress_non_speech_tokens)
-}
-static VALUE ruby_whisper_params_get_token_timestamps(VALUE self) {
-  BOOL_PARAMS_GETTER(self, token_timestamps)
-}
-static VALUE ruby_whisper_params_set_token_timestamps(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, token_timestamps, value)
-}
-static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
-  BOOL_PARAMS_GETTER(self, split_on_word)
-}
-static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, split_on_word, value)
-}
-static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
-  BOOL_PARAMS_GETTER(self, speed_up)
-}
-static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, speed_up, value)
-}
-static VALUE ruby_whisper_params_get_diarize(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (rwp->diarize) {
-    return Qtrue;
-  } else {
-    return Qfalse;
-  }
-}
-static VALUE ruby_whisper_params_set_diarize(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  if (value == Qfalse || value == Qnil) {
-    rwp->diarize = false;
-  } else {
-    rwp->diarize = true;
-  } \
-  return value;
-}
-
-static VALUE ruby_whisper_params_get_offset(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  return INT2NUM(rwp->params.offset_ms);
-}
-static VALUE ruby_whisper_params_set_offset(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  rwp->params.offset_ms = NUM2INT(value);
-  return value;
-}
-static VALUE ruby_whisper_params_get_duration(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  return INT2NUM(rwp->params.duration_ms);
-}
-static VALUE ruby_whisper_params_set_duration(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  rwp->params.duration_ms = NUM2INT(value);
-  return value;
-}
-
-static VALUE ruby_whisper_params_get_max_text_tokens(VALUE self) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  return INT2NUM(rwp->params.n_max_text_ctx);
-}
-static VALUE ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value) {
-  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
-  rwp->params.n_max_text_ctx = NUM2INT(value);
-  return value;
-}
-
-void Init_whisper() {
-  mWhisper = rb_define_module("Whisper");
-  cContext = rb_define_class_under(mWhisper, "Context", rb_cObject);
-  cParams  = rb_define_class_under(mWhisper, "Params", rb_cObject);
-
-  rb_define_alloc_func(cContext, ruby_whisper_allocate);
-  rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
-
-  rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
-
-  rb_define_alloc_func(cParams, ruby_whisper_params_allocate);
-
-  rb_define_method(cParams, "language=", ruby_whisper_params_set_language, 1);
-  rb_define_method(cParams, "language", ruby_whisper_params_get_language, 0);
-  rb_define_method(cParams, "translate=", ruby_whisper_params_set_translate, 1);
-  rb_define_method(cParams, "translate", ruby_whisper_params_get_translate, 0);
-  rb_define_method(cParams, "no_context=", ruby_whisper_params_set_no_context, 1);
-  rb_define_method(cParams, "no_context", ruby_whisper_params_get_no_context, 0);
-  rb_define_method(cParams, "single_segment=", ruby_whisper_params_set_single_segment, 1);
-  rb_define_method(cParams, "single_segment", ruby_whisper_params_get_single_segment, 0);
-  rb_define_method(cParams, "print_special", ruby_whisper_params_get_print_special, 0);
-  rb_define_method(cParams, "print_special=", ruby_whisper_params_set_print_special, 1);
-  rb_define_method(cParams, "print_progress", ruby_whisper_params_get_print_progress, 0);
-  rb_define_method(cParams, "print_progress=", ruby_whisper_params_set_print_progress, 1);
-  rb_define_method(cParams, "print_realtime", ruby_whisper_params_get_print_realtime, 0);
-  rb_define_method(cParams, "print_realtime=", ruby_whisper_params_set_print_realtime, 1);
-  rb_define_method(cParams, "print_timestamps", ruby_whisper_params_get_print_timestamps, 0);
-  rb_define_method(cParams, "print_timestamps=", ruby_whisper_params_set_print_timestamps, 1);
-  rb_define_method(cParams, "suppress_blank", ruby_whisper_params_get_suppress_blank, 0);
-  rb_define_method(cParams, "suppress_blank=", ruby_whisper_params_set_suppress_blank, 1);
-  rb_define_method(cParams, "suppress_non_speech_tokens", ruby_whisper_params_get_suppress_non_speech_tokens, 0);
-  rb_define_method(cParams, "suppress_non_speech_tokens=", ruby_whisper_params_set_suppress_non_speech_tokens, 1);
-  rb_define_method(cParams, "token_timestamps", ruby_whisper_params_get_token_timestamps, 0);
-  rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
-  rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
-  rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
-  rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
-  rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
-  rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
-  rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
-
-  rb_define_method(cParams, "offset", ruby_whisper_params_get_offset, 0);
-  rb_define_method(cParams, "offset=", ruby_whisper_params_set_offset, 1);
-  rb_define_method(cParams, "duration", ruby_whisper_params_get_duration, 0);
-  rb_define_method(cParams, "duration=", ruby_whisper_params_set_duration, 1);
-
-  rb_define_method(cParams, "max_text_tokens", ruby_whisper_params_get_max_text_tokens, 0);
-  rb_define_method(cParams, "max_text_tokens=", ruby_whisper_params_set_max_text_tokens, 1);
-}
-#ifdef __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ruby_whisper.h
+++ b/bindings/ruby/ext/ruby_whisper.h
@ -1,15 +0,0 @@
-#ifndef __RUBY_WHISPER_H
-#define __RUBY_WHISPER_H
-
-#include "whisper.h"
-
-typedef struct {
-  struct whisper_context *context;
-} ruby_whisper;
-
-typedef struct {
-  struct whisper_full_params params;
-  bool diarize;
-} ruby_whisper_params;
-
-#endif
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@ -1,138 +0,0 @@
-TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
-EXTDIR = File.join(TOPDIR, 'ext')
-#$LIBDIR = File.join(TOPDIR, 'lib')
-#$:.unshift(LIBDIR)
-$:.unshift(EXTDIR)
-
-require 'whisper'
-require 'test/unit'
-
-class TestWhisper < Test::Unit::TestCase
-  def setup
-    @params  = Whisper::Params.new
-  end
-
-  def test_language
-    @params.language = "en"
-    assert_equal @params.language, "en"
-    @params.language = "auto"
-    assert_equal @params.language, "auto"
-  end
-
-  def test_offset
-    @params.offset = 10_000
-    assert_equal @params.offset, 10_000
-    @params.offset = 0
-    assert_equal @params.offset, 0
-  end
-
-  def test_duration
-    @params.duration = 60_000
-    assert_equal @params.duration, 60_000
-    @params.duration = 0
-    assert_equal @params.duration, 0
-  end
-
-  def test_max_text_tokens
-    @params.max_text_tokens = 300
-    assert_equal @params.max_text_tokens, 300
-    @params.max_text_tokens = 0
-    assert_equal @params.max_text_tokens, 0
-  end
-
-  def test_translate
-    @params.translate = true
-    assert @params.translate
-    @params.translate = false
-    assert !@params.translate
-  end
-
-  def test_no_context
-    @params.no_context = true
-    assert @params.no_context
-    @params.no_context = false
-    assert !@params.no_context
-  end
-
-  def test_single_segment
-    @params.single_segment = true
-    assert @params.single_segment
-    @params.single_segment = false
-    assert !@params.single_segment
-  end
-
-  def test_print_special
-    @params.print_special = true
-    assert @params.print_special
-    @params.print_special = false
-    assert !@params.print_special
-  end
-
-  def test_print_progress
-    @params.print_progress = true
-    assert @params.print_progress
-    @params.print_progress = false
-    assert !@params.print_progress
-  end
-
-  def test_print_realtime
-    @params.print_realtime = true
-    assert @params.print_realtime
-    @params.print_realtime = false
-    assert !@params.print_realtime
-  end
-
-  def test_print_timestamps
-    @params.print_timestamps = true
-    assert @params.print_timestamps
-    @params.print_timestamps = false
-    assert !@params.print_timestamps
-  end
-
-  def test_suppress_blank
-    @params.suppress_blank = true
-    assert @params.suppress_blank
-    @params.suppress_blank = false
-    assert !@params.suppress_blank
-  end
-
-  def test_suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = true
-    assert @params.suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = false
-    assert !@params.suppress_non_speech_tokens
-  end
-
-  def test_token_timestamps
-    @params.token_timestamps = true
-    assert @params.token_timestamps
-    @params.token_timestamps = false
-    assert !@params.token_timestamps
-  end
-
-  def test_split_on_word
-    @params.split_on_word = true
-    assert @params.split_on_word
-    @params.split_on_word = false
-    assert !@params.split_on_word
-  end
-
-  def test_speed_up
-    @params.speed_up = true
-    assert @params.speed_up
-    @params.speed_up = false
-    assert !@params.speed_up
-  end
-
-  def test_whisper
-    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
-    params  = Whisper::Params.new
-    params.print_timestamps = false
-
-    jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
-    @whisper.transcribe(jfk, params) {|text|
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, text
-    }
-  end
-
-end
--- a/cmake/DefaultTargetOptions.cmake
+++ b/cmake/DefaultTargetOptions.cmake
@ -1,17 +0,0 @@
-# Set the default compile features and properties for a target.
-
-if (NOT TARGET)
-    message(FATAL_ERROR "TARGET not set before including DefaultTargetOptions")
-endif()
-
-target_compile_features(${TARGET}
-    PRIVATE
-        cxx_std_11
-    )
-
-set_target_properties(${TARGET}
-    PROPERTIES
-        EXPORT_COMPILE_COMMANDS ON
-        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-        INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
-)
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,37 +14,6 @@ if (WHISPER_SUPPORT_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()

-# common
-
-set(TARGET common)
-
-add_library(${TARGET} STATIC
-    common.h
-    common.cpp
-    )
-
-include(DefaultTargetOptions)
-
-set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-if (WHISPER_SUPPORT_SDL2)
-    # common-sdl
-
-    set(TARGET common-sdl)
-
-    add_library(${TARGET} STATIC
-        common-sdl.h
-        common-sdl.cpp
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
-
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@ -55,13 +24,10 @@ if (EMSCRIPTEN)
    add_subdirectory(command.wasm)
    add_subdirectory(talk.wasm)
    add_subdirectory(bench.wasm)
-elseif(CMAKE_JS_VERSION)
-    add_subdirectory(addon.node)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(talk)
-    add_subdirectory(talk-llama)
 endif()
--- a/examples/addon.node/.gitignore
+++ b/examples/addon.node/.gitignore
@ -1,3 +0,0 @@
-.idea
-node_modules
-build
--- a/examples/addon.node/CMakeLists.txt
+++ b/examples/addon.node/CMakeLists.txt
@ -1,31 +0,0 @@
-set(TARGET whisper-addon)
-
-# Base settings
-#==================================================================
-# env var supported by cmake-js
-add_definitions(-DNAPI_VERSION=4)
-include_directories(${CMAKE_JS_INC})
-#==================================================================
-
-add_library(${TARGET} SHARED ${CMAKE_JS_SRC} addon.cpp)
-set_target_properties(${TARGET} PROPERTIES PREFIX "" SUFFIX ".node")
-
-include(DefaultTargetOptions)
-
-# Include N-API wrappers
-#==================================================================
-execute_process(COMMAND node -p "require('node-addon-api').include"
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE NODE_ADDON_API_DIR
-        )
-string(REPLACE "\n" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
-string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
-target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
-#==================================================================
-
-target_link_libraries(${TARGET} ${CMAKE_JS_LIB} common whisper ${CMAKE_THREAD_LIBS_INIT})
-
-if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
-    # Generate node.lib
-    execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
-endif()
--- a/examples/addon.node/README.md
+++ b/examples/addon.node/README.md
@ -1,37 +0,0 @@
-# addon
-
-This is an addon demo that can **perform whisper model reasoning in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
-It can be used as a reference for using the whisper.cpp project in other node projects.
-
-## Install
-
-```shell
-npm install
-```
-
-## Compile
-
-Make sure it is in the project root directory and compiled with make-js.
-
-```shell
-npx cmake-js compile -T whisper-addon -B Release
-```
-
-For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js) and make very few configuration changes.
-
-> Such as appointing special cmake path:
-> ```shell
-> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release
-> ```
-
-## Run
-
-```shell
-cd examples/addon.node
-
-node index.js --language='language' --model='model-path' --fname_inp='file-path'
-```
-
-Because this is a simple Demo, only the above parameters are set in the node environment.
-
-Other parameters can also be specified in the node environment.
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -1,22 +0,0 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../../build/Release/whisper-addon"
-));
-const { promisify } = require("util");
-
-const whisperAsync = promisify(whisper);
-
-const whisperParamsMock = {
-  language: "en",
-  model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
-  fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
-};
-
-describe("Run whisper.node", () => {
-  test("it should receive a non-empty value", async () => {
-    let result = await whisperAsync(whisperParamsMock);
-
-    expect(result.length).toBeGreaterThan(0);
-  });
-});
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -1,338 +0,0 @@
-#include "napi.h"
-#include "common.h"
-
-#include "whisper.h"
-
-#include <string>
-#include <thread>
-#include <vector>
-#include <cmath>
-#include <cstdint>
-
-struct whisper_params {
-    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors = 1;
-    int32_t offset_t_ms  = 0;
-    int32_t offset_n     = 0;
-    int32_t duration_ms  = 0;
-    int32_t max_context  = -1;
-    int32_t max_len      = 0;
-    int32_t best_of      = 5;
-    int32_t beam_size    = -1;
-
-    float word_thold    = 0.01f;
-    float entropy_thold = 2.4f;
-    float logprob_thold = -1.0f;
-
-    bool speed_up       = false;
-    bool translate      = false;
-    bool diarize        = false;
-    bool output_txt     = false;
-    bool output_vtt     = false;
-    bool output_srt     = false;
-    bool output_wts     = false;
-    bool output_csv     = false;
-    bool print_special  = false;
-    bool print_colors   = false;
-    bool print_progress = false;
-    bool no_timestamps  = false;
-
-    std::string language = "en";
-    std::string prompt;
-    std::string model    = "../../ggml-large.bin";
-
-    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
-};
-
-struct whisper_print_user_data {
-    const whisper_params * params;
-
-    const std::vector<std::vector<float>> * pcmf32s;
-};
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-int timestamp_to_sample(int64_t t, int n_samples) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
-}
-
-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
-    const auto & params  = *((whisper_print_user_data *) user_data)->params;
-    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-
-    std::string speaker = "";
-
-    int64_t t0;
-    int64_t t1;
-
-    // print the last n_new segments
-    const int s0 = n_segments - n_new;
-
-    if (s0 == 0) {
-        printf("\n");
-    }
-
-    for (int i = s0; i < n_segments; i++) {
-        if (!params.no_timestamps || params.diarize) {
-            t0 = whisper_full_get_segment_t0(ctx, i);
-            t1 = whisper_full_get_segment_t1(ctx, i);
-        }
-
-        if (!params.no_timestamps) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-        }
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            const int64_t n_samples = pcmf32s[0].size();
-
-            const int64_t is0 = timestamp_to_sample(t0, n_samples);
-            const int64_t is1 = timestamp_to_sample(t1, n_samples);
-
-            double energy0 = 0.0f;
-            double energy1 = 0.0f;
-
-            for (int64_t j = is0; j < is1; j++) {
-                energy0 += fabs(pcmf32s[0][j]);
-                energy1 += fabs(pcmf32s[1][j]);
-            }
-
-            if (energy0 > 1.1*energy1) {
-                speaker = "(speaker 0)";
-            } else if (energy1 > 1.1*energy0) {
-                speaker = "(speaker 1)";
-            } else {
-                speaker = "(speaker ?)";
-            }
-
-            //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
-        }
-
-        // colorful print bug
-        //
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        printf("%s%s", speaker.c_str(), text);
-
-
-        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
-            printf("\n");
-        }
-
-        fflush(stdout);
-    }
-}
-
-int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
-    if (params.fname_inp.empty()) {
-        fprintf(stderr, "error: no input files specified\n");
-        return 2;
-    }
-
-    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
-
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 3;
-    }
-
-    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
-        const auto fname_inp = params.fname_inp[f];
-        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
-
-        std::vector<float> pcmf32; // mono-channel F32 PCM
-        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
-        }
-
-        // print system information
-        {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-        }
-
-        // print some info about the processing
-        {
-            fprintf(stderr, "\n");
-            if (!whisper_is_multilingual(ctx)) {
-                if (params.language != "en" || params.translate) {
-                    params.language = "en";
-                    params.translate = false;
-                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-                }
-            }
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
-                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors,
-                    params.language.c_str(),
-                    params.translate ? "translate" : "transcribe",
-                    params.no_timestamps ? 0 : 1);
-
-            fprintf(stderr, "\n");
-        }
-
-        // run the inference
-        {
-            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-            wparams.print_realtime   = false;
-            wparams.print_progress   = params.print_progress;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special    = params.print_special;
-            wparams.translate        = params.translate;
-            wparams.language         = params.language.c_str();
-            wparams.n_threads        = params.n_threads;
-            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-            wparams.offset_ms        = params.offset_t_ms;
-            wparams.duration_ms      = params.duration_ms;
-
-            wparams.token_timestamps = params.output_wts || params.max_len > 0;
-            wparams.thold_pt         = params.word_thold;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
-            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-
-            wparams.speed_up         = params.speed_up;
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.initial_prompt   = params.prompt.c_str();
-
-            whisper_print_user_data user_data = { &params, &pcmf32s };
-
-            // this callback is called on each new segment
-            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &user_data;
-            }
-
-            // example for abort mechanism
-            // in this example, we do not abort the processing, but we could if the flag is set to true
-            // the callback is called before every encoder run - if it returns false, the processing is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return !is_aborted;
-                };
-                wparams.encoder_begin_callback_user_data = &is_aborted;
-            }
-
-            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-                fprintf(stderr, "failed to process audio\n");
-                return 10;
-            }
-        }
-    }
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    result.resize(n_segments);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-        result[i].emplace_back(to_timestamp(t0, true));
-        result[i].emplace_back(to_timestamp(t1, true));
-        result[i].emplace_back(text);
-    }
-
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
-
-    return 0;
-}
-
-class Worker : public Napi::AsyncWorker {
- public:
-  Worker(Napi::Function& callback, whisper_params params)
-      : Napi::AsyncWorker(callback), params(params) {}
-
-  void Execute() override {
-    run(params, result);
-  }
-
-  void OnOK() override {
-    Napi::HandleScope scope(Env());
-    Napi::Object res = Napi::Array::New(Env(), result.size());
-    for (uint64_t i = 0; i < result.size(); ++i) {
-      Napi::Object tmp = Napi::Array::New(Env(), 3);
-      for (uint64_t j = 0; j < 3; ++j) {
-        tmp[j] = Napi::String::New(Env(), result[i][j]);
-      }
-      res[i] = tmp;
-    }
-    Callback().Call({Env().Null(), res});
-  }
-
- private:
-  whisper_params params;
-  std::vector<std::vector<std::string>> result;
-};
-
-
-
-Napi::Value whisper(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  if (info.Length() <= 0 || !info[0].IsObject()) {
-    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
-  }
-  whisper_params params;
-
-  Napi::Object whisper_params = info[0].As<Napi::Object>();
-  std::string language = whisper_params.Get("language").As<Napi::String>();
-  std::string model = whisper_params.Get("model").As<Napi::String>();
-  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
-
-  params.language = language;
-  params.model = model;
-  params.fname_inp.emplace_back(input);
-
-  Napi::Function callback = info[1].As<Napi::Function>();
-  Worker* worker = new Worker(callback, params);
-  worker->Queue();
-  return env.Undefined();
-}
-
-
-Napi::Object Init(Napi::Env env, Napi::Object exports) {
-  exports.Set(
-      Napi::String::New(env, "whisper"),
-      Napi::Function::New(env, whisper)
-  );
-  return exports;
-}
-
-NODE_API_MODULE(whisper, Init);
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -1,36 +0,0 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../build/Release/whisper-addon"
-));
-const { promisify } = require("util");
-
-const whisperAsync = promisify(whisper);
-
-const whisperParams = {
-  language: "en",
-  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
-  fname_inp: "../../samples/jfk.wav",
-};
-
-const arguments = process.argv.slice(2);
-const params = Object.fromEntries(
-  arguments.reduce((pre, item) => {
-    if (item.startsWith("--")) {
-      return [...pre, item.slice(2).split("=")];
-    }
-    return pre;
-  }, [])
-);
-
-for (const key in params) {
-  if (whisperParams.hasOwnProperty(key)) {
-    whisperParams[key] = params[key];
-  }
-}
-
-console.log("whisperParams =", whisperParams);
-
-whisperAsync(whisperParams).then((result) => {
-  console.log(`Result from whisper: ${result}`);
-});
--- a/examples/addon.node/package.json
+++ b/examples/addon.node/package.json
@ -1,16 +0,0 @@
-{
-  "name": "whisper-addon",
-  "version": "0.0.0",
-  "description": "",
-  "main": "index.js",
-  "author": "Qanhe Chen",
-  "license": "MIT",
-  "scripts": {
-    "test": "jest"
-  },
-  "devDependencies": {
-    "cmake-js": "^7.1.1",
-    "jest": "^29.4.0",
-    "node-addon-api": "^5.0.0"
-  }
-}
--- a/examples/bench.wasm/CMakeLists.txt
+++ b/examples/bench.wasm/CMakeLists.txt
@ -8,8 +8,6 @@ add_executable(${TARGET}
    emscripten.cpp
    )

-include(DefaultTargetOptions)
-
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/examples/bench.wasm/emscripten.cpp
+++ b/examples/bench.wasm/emscripten.cpp
@ -28,11 +28,6 @@ void bench_main(size_t index) {
        return;
    }

-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
-    }
-
    if (int ret = whisper_encode(ctx, 0, n_threads) != 0) {
        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return;
@ -57,7 +52,7 @@ EMSCRIPTEN_BINDINGS(bench) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init_from_file(path_model.c_str());
+                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    if (g_worker.joinable()) {
                        g_worker.join();
--- a/examples/bench/CMakeLists.txt
+++ b/examples/bench/CMakeLists.txt
@ -1,6 +1,3 @@
 set(TARGET bench)
 add_executable(${TARGET} bench.cpp)
-
-include(DefaultTargetOptions)
-
 target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -7,7 +7,6 @@
 // command-line parameters
 struct whisper_params {
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat

    std::string model = "models/ggml-base.en.bin";
 };
@ -24,7 +23,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        }
        else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
        else if (arg == "-m" || arg == "--model")   { params.model     = argv[++i]; }
-        else if (arg == "-w" || arg == "--what")    { params.what     = atoi(argv[++i]); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
@ -43,17 +41,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
    fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n",                                  params.model.c_str());
-    fprintf(stderr, "  -w N,     --what N      [%-7d] what to benchmark:\n",                          params.what);
-    fprintf(stderr, "                           %-7s  0 - whisper encoder\n",                         "");
-    fprintf(stderr, "                           %-7s  1 - memcpy\n",                                  "");
-    fprintf(stderr, "                           %-7s  2 - ggml_mul_mat\n",                            "");
    fprintf(stderr, "\n");
 }

-int whisper_bench_encoder(const whisper_params & params) {
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
    // whisper init

-    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+    struct whisper_context * ctx = whisper_init(params.model.c_str());

    {
        fprintf(stderr, "\n");
@ -92,22 +92,3 @@ int whisper_bench_encoder(const whisper_params & params) {

    return 0;
 }
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    int ret = -1;
-
-    switch (params.what) {
-        case 0: ret = whisper_bench_encoder(params);                break;
-        case 1: ret = whisper_bench_memcpy(params.n_threads);       break;
-        case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
-        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
-    }
-
-    return ret;
-}
--- a/examples/command.wasm/CMakeLists.txt
+++ b/examples/command.wasm/CMakeLists.txt
@ -8,10 +8,7 @@ add_executable(${TARGET}
    emscripten.cpp
    )

-include(DefaultTargetOptions)
-
 target_link_libraries(${TARGET} PRIVATE
-    common
    whisper
    )

--- a/examples/command.wasm/emscripten.cpp
+++ b/examples/command.wasm/emscripten.cpp
@ -1,5 +1,4 @@
 #include "ggml.h"
-#include "common.h"
 #include "whisper.h"

 #include <emscripten.h>
@ -28,6 +27,24 @@ std::string g_transcribed   = "";

 std::vector<float> g_pcmf32;

+static std::string trim(const std::string & s) {
+    std::regex e("^\\s+|\\s+$");
+    return std::regex_replace(s, e, "");
+}
+
+static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
 // compute similarity between two strings using Levenshtein distance
 static float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
@ -58,6 +75,44 @@ void command_set_status(const std::string & status) {
    g_status = status;
 }

+bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_samples      = pcmf32.size();
+    const int n_samples_last = (sample_rate * last_ms) / 1000;
+
+    if (n_samples_last >= n_samples) {
+        // not enough samples - assume no speech
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all  = 0.0f;
+    float energy_last = 0.0f;
+
+    for (size_t i = 0; i < n_samples; i++) {
+        energy_all += fabsf(pcmf32[i]);
+
+        if (i >= n_samples - n_samples_last) {
+            energy_last += fabsf(pcmf32[i]);
+        }
+    }
+
+    energy_all  /= n_samples;
+    energy_last /= n_samples_last;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    if (energy_last > vad_thold*energy_all) {
+        return false;
+    }
+
+    return true;
+}
+
 std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

@ -100,7 +155,7 @@ void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
    const int64_t n_samples = (ms * sample_rate) / 1000;

    int64_t n_take = 0;
-    if (n_samples > (int) g_pcmf32.size()) {
+    if (g_pcmf32.size() < n_samples) {
        n_take = g_pcmf32.size();
    } else {
        n_take = n_samples;
@ -132,6 +187,7 @@ void command_main(size_t index) {

    printf("command: using %d threads\n", wparams.n_threads);

+    bool is_running   = true;
    bool have_prompt  = false;
    bool ask_prompt   = true;
    bool print_energy = false;
@ -177,7 +233,7 @@ void command_main(size_t index) {
        {
            command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
+            if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                command_set_status("Speech detected! Processing ...");

@ -268,7 +324,7 @@ EMSCRIPTEN_BINDINGS(command) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init_from_file(path_model.c_str());
+                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -2,8 +2,6 @@ if (WHISPER_SUPPORT_SDL2)
    # command
    set(TARGET command)
    add_executable(${TARGET} command.cpp)
-
-    include(DefaultTargetOptions)
-
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/command/README.md
+++ b/examples/command/README.md
@ -9,19 +9,7 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/

 # On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
 ./command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0
-```

-https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
-
-Web version: [examples/command.wasm](/examples/command.wasm)
-
-## Guided mode
-
-"Guided mode" allows you to specify a list of commands (i.e. strings) and the transcription will be guided to classify your command into one from the list. This can be useful in situations where a device is listening only for a small subset of commands.
-
-Initial tests show that this approach might be extremely efficient in terms of performance, since it integrates very well with the "partial Encoder" idea from #137.
-
-```bash
 # Run in guided mode, the list of allowed commands is in commands.txt
 ./command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt

@ -29,8 +17,9 @@ Initial tests show that this approach might be extremely efficient in terms of p
 ./command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
 ```

-https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9b8b-aeeb76bee969.mp4
+https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

+Web version: [examples/command.wasm](/examples/command.wasm)

 ## Building

--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -1,226 +0,0 @@
-#include "common-sdl.h"
-
-audio_async::audio_async(int len_ms) {
-    m_len_ms = len_ms;
-
-    m_running = false;
-}
-
-audio_async::~audio_async() {
-    if (m_dev_id_in) {
-        SDL_CloseAudioDevice(m_dev_id_in);
-    }
-}
-
-bool audio_async::init(int capture_id, int sample_rate) {
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-
-    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        return false;
-    }
-
-    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-
-    {
-        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-        for (int i = 0; i < nDevices; i++) {
-            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
-        }
-    }
-
-    SDL_AudioSpec capture_spec_requested;
-    SDL_AudioSpec capture_spec_obtained;
-
-    SDL_zero(capture_spec_requested);
-    SDL_zero(capture_spec_obtained);
-
-    capture_spec_requested.freq     = sample_rate;
-    capture_spec_requested.format   = AUDIO_F32;
-    capture_spec_requested.channels = 1;
-    capture_spec_requested.samples  = 1024;
-    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
-        audio_async * audio = (audio_async *) userdata;
-        audio->callback(stream, len);
-    };
-    capture_spec_requested.userdata = this;
-
-    if (capture_id >= 0) {
-        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
-        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    } else {
-        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    }
-
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-        m_dev_id_in = 0;
-
-        return false;
-    } else {
-        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
-        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
-        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
-                capture_spec_requested.format);
-        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
-                capture_spec_requested.channels);
-        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
-    }
-
-    m_sample_rate = capture_spec_obtained.freq;
-
-    m_audio.resize((m_sample_rate*m_len_ms)/1000);
-
-    return true;
-}
-
-bool audio_async::resume() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
-        return false;
-    }
-
-    if (m_running) {
-        fprintf(stderr, "%s: already running!\n", __func__);
-        return false;
-    }
-
-    SDL_PauseAudioDevice(m_dev_id_in, 0);
-
-    m_running = true;
-
-    return true;
-}
-
-bool audio_async::pause() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
-        return false;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: already paused!\n", __func__);
-        return false;
-    }
-
-    SDL_PauseAudioDevice(m_dev_id_in, 1);
-
-    m_running = false;
-
-    return true;
-}
-
-bool audio_async::clear() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
-        return false;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return false;
-    }
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        m_audio_pos = 0;
-        m_audio_len = 0;
-    }
-
-    return true;
-}
-
-// callback to be called by SDL
-void audio_async::callback(uint8_t * stream, int len) {
-    if (!m_running) {
-        return;
-    }
-
-    const size_t n_samples = len / sizeof(float);
-
-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
-
-    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        if (m_audio_pos + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - m_audio_pos;
-
-            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
-
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = m_audio.size();
-        } else {
-            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
-
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
-        }
-    }
-}
-
-void audio_async::get(int ms, std::vector<float> & result) {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
-        return;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return;
-    }
-
-    result.clear();
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-
-        result.resize(n_samples);
-
-        int s0 = m_audio_pos - n_samples;
-        if (s0 < 0) {
-            s0 += m_audio.size();
-        }
-
-        if (s0 + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - s0;
-
-            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-        } else {
-            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
-        }
-    }
-}
-
-bool sdl_poll_events() {
-    SDL_Event event;
-    while (SDL_PollEvent(&event)) {
-        switch (event.type) {
-            case SDL_QUIT:
-                {
-                    return false;
-                } break;
-            default:
-                break;
-        }
-    }
-
-    return true;
-}
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -1,50 +0,0 @@
-#pragma once
-
-#include <SDL.h>
-#include <SDL_audio.h>
-
-#include <atomic>
-#include <cstdint>
-#include <vector>
-#include <mutex>
-
-//
-// SDL Audio capture
-//
-
-class audio_async {
-public:
-    audio_async(int len_ms);
-    ~audio_async();
-
-    bool init(int capture_id, int sample_rate);
-
-    // start capturing audio via the provided SDL callback
-    // keep last len_ms seconds of audio in a circular buffer
-    bool resume();
-    bool pause();
-    bool clear();
-
-    // callback to be called by SDL
-    void callback(uint8_t * stream, int len);
-
-    // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
-
-private:
-    SDL_AudioDeviceID m_dev_id_in = 0;
-
-    int m_len_ms = 0;
-    int m_sample_rate = 0;
-
-    std::atomic_bool m_running;
-    std::mutex       m_mutex;
-
-    std::vector<float> m_audio;
-    std::vector<float> m_audio_new;
-    size_t             m_audio_pos = 0;
-    size_t             m_audio_len = 0;
-};
-
-// Return false if need to quit
-bool sdl_poll_events();
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -1,162 +0,0 @@
-#include "common.h"
-
-// third-party utilities
-// use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-
-#include <cmath>
-#include <regex>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-std::string trim(const std::string & s) {
-    std::regex e("^\\s+|\\s+$");
-    return std::regex_replace(s, e, "");
-}
-
-std::string replace(const std::string & s, const std::string & from, const std::string & to) {
-    std::string result = s;
-    size_t pos = 0;
-    while ((pos = result.find(from, pos)) != std::string::npos) {
-        result.replace(pos, from.length(), to);
-        pos += to.length();
-    }
-    return result;
-}
-
-bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-    if (fname == "-") {
-        {
-            uint8_t buf[1024];
-            while (true)
-            {
-                const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                if (n == 0) {
-                    break;
-                }
-                wav_data.insert(wav_data.end(), buf, buf + n);
-            }
-        }
-
-        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from stdin\n");
-            return false;
-        }
-
-        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    }
-    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
-        return false;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (stereo && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
-        return false;
-    }
-
-    if (wav.bitsPerSample != 16) {
-        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
-        return false;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (stereo) {
-        // convert to stereo, float
-        pcmf32s.resize(2);
-
-        pcmf32s[0].resize(n);
-        pcmf32s[1].resize(n);
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-        }
-    }
-
-    return true;
-}
-
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-
-    float y = data[0];
-
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-
-    for (int i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-
-    return true;
-}
--- a/examples/common.h
+++ b/examples/common.h
@ -1,40 +0,0 @@
-#pragma once
-
-// needs to match WHISPER_SAMPLE_RATE
-#define COMMON_SAMPLE_RATE 16000
-
-#include <vector>
-#include <string>
-
-std::string trim(const std::string & s);
-
-std::string replace(
-        const std::string & s,
-        const std::string & from,
-        const std::string & to);
-
-// Read WAV audio file and store the PCM data into pcmf32
-// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
-// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
-bool read_wav(
-        const std::string & fname,
-        std::vector<float> & pcmf32,
-        std::vector<std::vector<float>> & pcmf32s,
-        bool stereo);
-
-// Apply a high-pass frequency filter to PCM audio
-// Suppresses frequencies below cutoff Hz
-void high_pass_filter(
-        std::vector<float> & data,
-        float cutoff,
-        float sample_rate);
-
-// Basic voice activity detection (VAD) using audio energy adaptive threshold
-bool vad_simple(
-        std::vector<float> & pcmf32,
-        int   sample_rate,
-        int   last_ms,
-        float vad_thold,
-        float freq_thold,
-        bool  verbose);
-
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -8,7 +8,7 @@ function convertTypedArray(src, type) {

 var printTextarea = (function() {
    var element = document.getElementById('output');
-    if (element) element.value = ''; // clear browser cache
+    if (element) element.alue = ''; // clear browser cache
    return function(text) {
        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
        console.log(text);
@ -88,15 +88,11 @@ async function fetchRemote(url, cbProgress, cbPrint) {
 // - check if the data is already in the IndexedDB
 // - if not, fetch it from the remote URL and store it in the IndexedDB
 function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
-    if (!navigator.storage || !navigator.storage.estimate) {
-        cbPrint('loadRemote: navigator.storage.estimate() is not supported');
-    } else {
-        // query the storage quota and print it
-        navigator.storage.estimate().then(function (estimate) {
-            cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
-            cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
-        });
-    }
+    // query the storage quota and print it
+    navigator.storage.estimate().then(function (estimate) {
+        cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
+        cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
+    });

    // check if the data is already in the IndexedDB
    var rq = indexedDB.open(dbName, dbVersion);
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -100,7 +100,7 @@ while [ $running -eq 1 ]; do
        err=$(cat /tmp/whisper-live.err | wc -l)
    done

-    ./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
+    ./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1

    while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
        sleep 1
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@ -1,6 +1,3 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
-
-include(DefaultTargetOptions)
-
-target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -9,36 +9,25 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
 usage: ./main [options] file0.wav file1.wav ...

 options:
-  -h,        --help              [default] show this help message and exit
-  -t N,      --threads N         [4      ] number of threads to use during computation
-  -p N,      --processors N      [1      ] number of processors to use during computation
-  -ot N,     --offset-t N        [0      ] time offset in milliseconds
-  -on N,     --offset-n N        [0      ] segment index offset
-  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
-  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
-  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -bs N,     --beam-size N       [-1     ] beam size for beam search
-  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
-  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
-  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
-  -tr,       --translate         [false  ] translate from source language to english
-  -di,       --diarize           [false  ] stereo audio diarization
-  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
-  -otxt,     --output-txt        [false  ] output result in a text file
-  -ovtt,     --output-vtt        [false  ] output result in a vtt file
-  -osrt,     --output-srt        [false  ] output result in a srt file
-  -owts,     --output-words      [false  ] output script for generating karaoke video
-  -ocsv,     --output-csv        [false  ] output result in a CSV file
-  -oj,       --output-json       [false  ] output result in a JSON file
-  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
-  -ps,       --print-special     [false  ] print special tokens
-  -pc,       --print-colors      [false  ] print colors
-  -pp,       --print-progress    [false  ] print progress
-  -nt,       --no-timestamps     [true   ] do not print timestamps
-  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
-             --prompt PROMPT     [       ] initial prompt
-  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
-  -f FNAME,  --file FNAME        [       ] input WAV file path
+  -h,       --help          [default] show this help message and exit
+  -t N,     --threads N     [4      ] number of threads to use during computation
+  -p N,     --processors N  [1      ] number of processors to use during computation
+  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -on N,    --offset-n N    [0      ] segment index offset
+  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
+  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -tr,      --translate     [false  ] translate from source language to english
+  -otxt,    --output-txt    [false  ] output result in a text file
+  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -osrt,    --output-srt    [false  ] output result in a srt file
+  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -ps,      --print-special [false  ] print special tokens
+  -pc,      --print-colors  [false  ] print colors
+  -nt,      --no-timestamps [true   ] do not print timestamps
+  -l LANG,  --language LANG [en     ] spoken language
+  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -f FNAME, --file FNAME    [       ] input WAV file path
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -1,14 +1,16 @@
-#include "common.h"
-
 #include "whisper.h"

+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
 #include <cmath>
 #include <fstream>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
-#include <cstring>

 // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
 // Lowest is red, middle is yellow, highest is green.
@ -51,30 +53,22 @@ void replace_all(std::string & s, const std::string & search, const std::string
 // command-line parameters
 struct whisper_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors =  1;
-    int32_t offset_t_ms  =  0;
-    int32_t offset_n     =  0;
-    int32_t duration_ms  =  0;
+    int32_t n_processors = 1;
+    int32_t offset_t_ms  = 0;
+    int32_t offset_n     = 0;
+    int32_t duration_ms  = 0;
    int32_t max_context  = -1;
-    int32_t max_len      =  0;
-    int32_t best_of      =  5;
-    int32_t beam_size    = -1;
+    int32_t max_len      = 0;

-    float word_thold    =  0.01f;
-    float entropy_thold =  2.40f;
-    float logprob_thold = -1.00f;
+    float word_thold = 0.01f;

    bool speed_up       = false;
    bool translate      = false;
    bool diarize        = false;
-    bool split_on_word  = false;
-    bool no_fallback    = false;
    bool output_txt     = false;
    bool output_vtt     = false;
    bool output_srt     = false;
    bool output_wts     = false;
-    bool output_csv     = false;
-    bool output_jsn     = false;
    bool print_special  = false;
    bool print_colors   = false;
    bool print_progress = false;
@ -82,11 +76,9 @@ struct whisper_params {

    std::string language = "en";
    std::string prompt;
-    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
    std::string model    = "models/ggml-base.en.bin";

    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
 };

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -95,11 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

-        if (arg == "-"){
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
        if (arg[0] != '-') {
            params.fname_inp.push_back(arg);
            continue;
@ -116,24 +103,14 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-d"    || arg == "--duration")       { params.duration_ms    = std::stoi(argv[++i]); }
        else if (arg == "-mc"   || arg == "--max-context")    { params.max_context    = std::stoi(argv[++i]); }
        else if (arg == "-ml"   || arg == "--max-len")        { params.max_len        = std::stoi(argv[++i]); }
-        else if (arg == "-bo"   || arg == "--best-of")        { params.best_of        = std::stoi(argv[++i]); }
-        else if (arg == "-bs"   || arg == "--beam-size")      { params.beam_size      = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")     { params.word_thold     = std::stof(argv[++i]); }
-        else if (arg == "-et"   || arg == "--entropy-thold")  { params.entropy_thold  = std::stof(argv[++i]); }
-        else if (arg == "-lpt"  || arg == "--logprob-thold")  { params.logprob_thold  = std::stof(argv[++i]); }
        else if (arg == "-su"   || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"   || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-di"   || arg == "--diarize")        { params.diarize        = true; }
-        else if (arg == "-sow"  || arg == "--split-on-word")  { params.split_on_word  = true; }
-        else if (arg == "-nf"   || arg == "--no-fallback")    { params.no_fallback    = true; }
        else if (arg == "-otxt" || arg == "--output-txt")     { params.output_txt     = true; }
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
-        else if (arg == "-fp"   || arg == "--font-path")      { params.font_path      = argv[++i]; }
-        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
-        else if (arg == "-oj"   || arg == "--output-json")    { params.output_jsn     = true; }
-        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
        else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
@ -157,40 +134,30 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
-    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
-    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
-    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
-    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
+    fprintf(stderr, "  -h,       --help           [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n",    params.n_threads);
+    fprintf(stderr, "  -p N,     --processors N   [%-7d] number of processors to use during computation\n", params.n_processors);
+    fprintf(stderr, "  -ot N,    --offset-t N     [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
+    fprintf(stderr, "  -on N,    --offset-n N     [%-7d] segment index offset\n",                           params.offset_n);
+    fprintf(stderr, "  -d  N,    --duration N     [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
+    fprintf(stderr, "  -mc N,    --max-context N  [%-7d] maximum number of text context tokens to store\n", params.max_context);
+    fprintf(stderr, "  -ml N,    --max-len N      [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -wt N,    --word-thold N   [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
+    fprintf(stderr, "  -di,      --diarize        [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -otxt,    --output-txt     [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
+    fprintf(stderr, "  -ovtt,    --output-vtt     [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
+    fprintf(stderr, "  -osrt,    --output-srt     [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
+    fprintf(stderr, "  -owts,    --output-words   [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
+    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
+    fprintf(stderr, "  -pc,      --print-colors   [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    fprintf(stderr, "  -pp,      --print-progress [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
+    fprintf(stderr, "  -nt,      --no-timestamps  [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
+    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
+    fprintf(stderr, "            --prompt PROMPT  [%-7s] initial prompt\n",                                 params.prompt.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME    [%-7s] model path\n",                                     params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] input WAV file path\n",                            "");
    fprintf(stderr, "\n");
 }

@ -200,87 +167,96 @@ struct whisper_print_user_data {
    const std::vector<std::vector<float>> * pcmf32s;
 };

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
+// New-segment callback: prints the last n_new segments of ctx to stdout.
+// user_data must point to a whisper_print_user_data (CLI params + stereo PCM).
+// Without timestamps the segment text is printed as-is and stdout is flushed;
+// with timestamps each segment goes on its own "[t0 --> t1]  text" line,
+// optionally prefixed by a speaker guess when diarization is enabled.
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

    const int n_segments = whisper_full_n_segments(ctx);

-    std::string speaker = "";
-
-    int64_t t0;
-    int64_t t1;
-
    // print the last n_new segments
    const int s0 = n_segments - n_new;
-
    if (s0 == 0) {
        printf("\n");
    }

    for (int i = s0; i < n_segments; i++) {
-        if (!params.no_timestamps || params.diarize) {
-            t0 = whisper_full_get_segment_t0(ctx, i);
-            t1 = whisper_full_get_segment_t1(ctx, i);
-        }
-
-        if (!params.no_timestamps) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-        }
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            const int64_t n_samples = pcmf32s[0].size();
-
-            const int64_t is0 = timestamp_to_sample(t0, n_samples);
-            const int64_t is1 = timestamp_to_sample(t1, n_samples);
-
-            double energy0 = 0.0f;
-            double energy1 = 0.0f;
-
-            for (int64_t j = is0; j < is1; j++) {
-                energy0 += fabs(pcmf32s[0][j]);
-                energy1 += fabs(pcmf32s[1][j]);
-            }
-
-            if (energy0 > 1.1*energy1) {
-                speaker = "(speaker 0)";
-            } else if (energy1 > 1.1*energy0) {
-                speaker = "(speaker 1)";
-            } else {
-                speaker = "(speaker ?)";
-            }
-
-            //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
-        }
-
-        if (params.print_colors) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
+        if (params.no_timestamps) {
+            if (params.print_colors) {
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
                    }
+
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                    // clamp to the last valid index: p == 1.0 would otherwise select k_colors.size()
+                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
+
+                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+                }
+            } else {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+                printf("%s", text);
+            }
+            fflush(stdout);
+        } else {
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+            std::string speaker;
+
+            if (params.diarize && pcmf32s.size() == 2) {
+                const int64_t n_samples = pcmf32s[0].size();
+
+                const int64_t is0 = timestamp_to_sample(t0, n_samples);
+                const int64_t is1 = timestamp_to_sample(t1, n_samples);
+
+                double energy0 = 0.0f;
+                double energy1 = 0.0f;
+
+                for (int64_t j = is0; j < is1; j++) {
+                    energy0 += fabs(pcmf32s[0][j]);
+                    energy1 += fabs(pcmf32s[1][j]);
                }

-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
+                if (energy0 > 1.1*energy1) {
+                    speaker = "(speaker 0)";
+                } else if (energy1 > 1.1*energy0) {
+                    speaker = "(speaker 1)";
+                } else {
+                    speaker = "(speaker ?)";
+                }

-                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
+                //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
            }
-        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);

-            printf("%s%s", speaker.c_str(), text);
+            if (params.print_colors) {
+                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
+                    }
+
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                    // clamp to the last valid index: p == 1.0 would otherwise select k_colors.size()
+                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
+
+                    printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
+                }
+                printf("\n");
+            } else {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+
+                printf("[%s --> %s]  %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
+            }
        }
-
-        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
-            printf("\n");
-        }
-
-        fflush(stdout);
    }
 }

@ -349,202 +325,16 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-bool output_csv(struct whisper_context * ctx, const char * fname) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,text\n";
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text    << "\"\n";
-    }
-
-    return true;
-}
-
-char *escape_double_quotes(const char *str) {
-    if (str == NULL) {
-        return NULL;
-    }
-
-    size_t escaped_length = strlen(str) + 1;
-
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"') {
-            escaped_length++;
-        }
-    }
-
-    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
-    if (escaped == NULL) {
-        return NULL;
-    }
-
-    size_t pos = 0;
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"') {
-            escaped[pos++] = '\\';
-            escaped[pos++] = '"';
-        } else {
-            escaped[pos++] = str[i];
-        }
-    }
-
-    // no need to set zero due to calloc() being used prior
-
-    return escaped;
-}
-
-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "},\n");
-    };
-
-    auto start_obj = [&](const char *name = nullptr) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end = false) {
-        start_value(name);
-        char * val_escaped = escape_double_quotes(val);
-        fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
-        free(val_escaped);
-    };
-
-    auto end_value = [&](bool end = false) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end = false) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj();
-        value_s("systeminfo", whisper_print_system_info());
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx));
-            value_b("multilingual", whisper_is_multilingual(ctx));
-            value_i("vocab", whisper_model_n_vocab(ctx));
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx));
-                value_i("state", whisper_model_n_audio_state(ctx));
-                value_i("head", whisper_model_n_audio_head(ctx));
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj();
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx));
-                value_i("state", whisper_model_n_text_state(ctx));
-                value_i("head", whisper_model_n_text_head(ctx));
-                value_i("layer", whisper_model_n_text_layer(ctx), true);
-            end_obj();
-            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
-        end_obj();
-        start_obj("params");
-            value_s("model", params.model.c_str());
-            value_s("language", params.language.c_str());
-            value_b("translate", params.translate, true);
-        end_obj();
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj();
-        start_arr("transcription");
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                start_obj();
-                    start_obj("timestamps");
-                        value_s("from", to_timestamp(t0, true).c_str());
-                        value_s("to", to_timestamp(t1, true).c_str(), true);
-                    end_obj();
-                    start_obj("offsets");
-                        value_i("from", t0 * 10);
-                        value_i("to", t1 * 10, true);
-                    end_obj();
-                    value_s("text", text, true);
-                end_obj(i == (n_segments - 1));
-            }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
-
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
    std::ofstream fout(fname);

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

-    static const char * font = params.font_path.c_str();
-
-    std::ifstream fin(font);
-    if (!fin.is_open()) {
-        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
-        return false;
-    }
+    // TODO: become parameter
+    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";

    fout << "#!/bin/bash" << "\n";
    fout << "\n";
@ -668,23 +458,115 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+    struct whisper_context * ctx = whisper_init(params.model.c_str());

    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 3;
    }

+    // initial prompt
+    std::vector<whisper_token> prompt_tokens;
+
+    if (!params.prompt.empty()) {
+        prompt_tokens.resize(1024);
+        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
+
+        fprintf(stderr, "\n");
+        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
+        fprintf(stderr, "initial tokens: [ ");
+        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
+            fprintf(stderr, "%d ", prompt_tokens[i]);
+        }
+        fprintf(stderr, "]\n");
+    }
+
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];

-        std::vector<float> pcmf32;               // mono-channel F32 PCM
+        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
+        // WAV input
+        {
+            drwav wav;
+            std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+            if (fname_inp == "-") {
+                {
+                    uint8_t buf[1024];
+                    while (true)
+                    {
+                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                        if (n == 0) {
+                            break;
+                        }
+                        wav_data.insert(wav_data.end(), buf, buf + n);
+                    }
+                }
+
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
+                    return 4;
+                }
+
+                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+            }
+            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
+                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
+                return 5;
+            }
+
+            if (wav.channels != 1 && wav.channels != 2) {
+                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
+                return 6;
+            }
+
+            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
+                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
+                return 6;
+            }
+
+            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
+                return 8;
+            }
+
+            if (wav.bitsPerSample != 16) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
+                return 9;
+            }
+
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+            std::vector<int16_t> pcm16;
+            pcm16.resize(n*wav.channels);
+            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+            drwav_uninit(&wav);
+
+            // convert to mono, float
+            pcmf32.resize(n);
+            if (wav.channels == 1) {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[i])/32768.0f;
+                }
+            } else {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+                }
+            }
+
+            if (params.diarize) {
+                // convert to stereo, float
+                pcmf32s.resize(2);
+
+                pcmf32s[0].resize(n);
+                pcmf32s[1].resize(n);
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+                }
+            }
        }

        // print system information
@ -718,8 +600,6 @@ int main(int argc, char ** argv) {
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
            wparams.print_realtime   = false;
            wparams.print_progress   = params.print_progress;
            wparams.print_timestamps = !params.no_timestamps;
@ -734,18 +614,11 @@ int main(int argc, char ** argv) {
            wparams.token_timestamps = params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-            wparams.split_on_word    = params.split_on_word;

            wparams.speed_up         = params.speed_up;

-            wparams.initial_prompt   = params.prompt.c_str();
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
+            wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();

            whisper_print_user_data user_data = { &params, &pcmf32s };

@ -761,7 +634,7 @@ int main(int argc, char ** argv) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -780,39 +653,27 @@ int main(int argc, char ** argv) {

            // output to text file
            if (params.output_txt) {
-                const auto fname_txt = fname_out + ".txt";
+                const auto fname_txt = fname_inp + ".txt";
                output_txt(ctx, fname_txt.c_str());
            }

            // output to VTT file
            if (params.output_vtt) {
-                const auto fname_vtt = fname_out + ".vtt";
+                const auto fname_vtt = fname_inp + ".vtt";
                output_vtt(ctx, fname_vtt.c_str());
            }

            // output to SRT file
            if (params.output_srt) {
-                const auto fname_srt = fname_out + ".srt";
+                const auto fname_srt = fname_inp + ".srt";
                output_srt(ctx, fname_srt.c_str(), params);
            }

            // output to WTS file
            if (params.output_wts) {
-                const auto fname_wts = fname_out + ".wts";
+                const auto fname_wts = fname_inp + ".wts";
                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }
-
-            // output to CSV file
-            if (params.output_csv) {
-                const auto fname_csv = fname_out + ".csv";
-                output_csv(ctx, fname_csv.c_str());
-            }
-
-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params);
-            }
        }
    }

--- a/examples/stream.wasm/CMakeLists.txt
+++ b/examples/stream.wasm/CMakeLists.txt
@ -8,8 +8,6 @@ add_executable(${TARGET}
    emscripten.cpp
    )

-include(DefaultTargetOptions)
-
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
--- a/examples/stream.wasm/emscripten.cpp
+++ b/examples/stream.wasm/emscripten.cpp
@ -49,9 +49,6 @@ void stream_main(size_t index) {
    wparams.max_tokens       = 32;
    wparams.audio_ctx        = 768; // partial encoder context for better performance

-    // disable temperature fallback
-    wparams.temperature_inc  = -1.0f;
-
    wparams.language         = "en";

    printf("stream: using %d threads\n", wparams.n_threads);
@ -132,7 +129,7 @@ EMSCRIPTEN_BINDINGS(stream) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init_from_file(path_model.c_str());
+                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@ -2,8 +2,6 @@ if (WHISPER_SUPPORT_SDL2)
    # stream
    set(TARGET stream)
    add_executable(${TARGET} stream.cpp)
-
-    include(DefaultTargetOptions)
-
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -3,16 +3,18 @@
 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //

-#include "common.h"
-#include "common-sdl.h"
 #include "whisper.h"

+#include <SDL.h>
+#include <SDL_audio.h>
+
 #include <cassert>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
 #include <fstream>
+#include <mutex>

 //  500 -> 00:05.000
 // 6000 -> 01:00.000
@ -113,6 +115,304 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

+//
+// SDL Audio capture
+//
+
+// Asynchronous SDL audio capture into a fixed-size circular buffer of float samples.
+// SDL delivers audio through callback(); the most recent samples are read with get().
+// m_mutex guards the ring-buffer position/length shared between those two paths.
+class audio_async {
+public:
+    // len_ms: capacity of the circular buffer, in milliseconds of audio
+    audio_async(int len_ms);
+    ~audio_async();
+
+    // open the capture device (capture_id < 0 selects the default device)
+    // and size the circular buffer; returns false on failure
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep the last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    bool pause();
+    bool clear();
+
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // (the last ms milliseconds; ms <= 0 requests the whole len_ms window)
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    // SDL capture device id; 0 means no device is open
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    int m_len_ms = 0;
+    int m_sample_rate = 0;
+
+    // NOTE(review): m_running is a plain bool read by callback() and written by
+    // resume()/pause() without holding m_mutex - confirm this is acceptable
+    bool       m_running = false;
+    std::mutex m_mutex;
+
+    // circular buffer storage; m_audio_pos is the next write index and
+    // m_audio_len the number of valid samples currently stored
+    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
+    size_t             m_audio_pos = 0;
+    size_t             m_audio_len = 0;
+};
+
+// Remembers the requested buffer length (milliseconds); the device is opened later by init().
+audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
+}
+
+// Closes the capture device, if init() managed to open one.
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initializes SDL audio, lists the available capture devices on stderr and opens one.
+//   capture_id  - index of the capture device to open; a negative value opens the default device
+//   sample_rate - requested capture frequency in Hz
+// On success the circular buffer is sized to hold m_len_ms milliseconds at the
+// obtained sample rate and true is returned; on failure false is returned.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // enumerate capture devices so the user can pick a capture_id
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    // request mono 32-bit float samples; the callback forwards to this instance via userdata
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    // NOTE(review): the last argument (allowed changes) is 0 - presumably SDL then converts
+    // to the requested spec instead of altering it; confirm against the SDL documentation
+    if (capture_id >= 0) {
+        // NOTE(review): capture_id is not range-checked against the device count here
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        // redundant: this branch is only taken when m_dev_id_in is already 0
+        m_dev_id_in = 0;
+
+        return false;
+    } else {
+        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+                capture_spec_requested.format);
+        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+                capture_spec_requested.channels);
+        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+    }
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // circular buffer capacity: m_len_ms milliseconds of samples at the obtained rate
+    m_audio.resize((m_sample_rate*m_len_ms)/1000);
+
+    return true;
+}
+
+// Un-pauses the capture device so that SDL starts delivering audio to callback().
+// Returns false (after logging to stderr) when no device is open or capture is already running.
+bool audio_async::resume() {
+    const bool has_device = m_dev_id_in != 0;
+
+    if (has_device == false) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    if (m_running == true) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    // pause_on == 0 -> capture is active
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+    m_running = true;
+
+    return true;
+}
+
+// Pauses the capture device; callback() ignores audio while paused.
+// Returns false (after logging to stderr) when no device is open or capture is already paused.
+bool audio_async::pause() {
+    const bool has_device = m_dev_id_in != 0;
+
+    if (has_device == false) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    // pause_on == 1 -> capture is suspended
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+    m_running = false;
+
+    return true;
+}
+
+// Discards all buffered audio by resetting the ring-buffer write position and length.
+// Returns false (after logging to stderr) when no device is open or capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the SDL callback mutates the same fields, so reset them under the mutex
+    const std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+//
+// Appends the captured chunk to the circular buffer m_audio, overwriting the
+// oldest samples once the buffer is full.
+//   stream - raw bytes delivered by SDL, interpreted as float samples
+//   len    - size of stream in bytes
+// Does nothing while capture is paused.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    size_t n_samples = len / sizeof(float);
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        // nothing to store, or no buffer to store into (also avoids the modulo by zero below)
+        if (n_samples == 0 || m_audio.empty()) {
+            return;
+        }
+
+        // a chunk larger than the whole buffer: keep only its newest samples
+        if (n_samples > m_audio.size()) {
+            stream   += (n_samples - m_audio.size()) * sizeof(float);
+            n_samples = m_audio.size();
+        }
+
+        if (m_audio_pos + n_samples > m_audio.size()) {
+            // the chunk wraps around the end of the buffer: copy it in two parts
+            const size_t n0 = m_audio.size() - m_audio_pos;
+
+            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
+            // stream is a byte pointer, so the offset of sample n0 must be scaled by sizeof(float)
+            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+            m_audio_len = m_audio.size();
+        } else {
+            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
+            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
+        }
+    }
+}
+
+// Copies the most recent ms milliseconds of captured audio into result
+// (ms <= 0 selects the full m_len_ms window). At most the number of samples
+// currently buffered is returned. Logs to stderr and leaves result untouched
+// when no device is open or capture is not running.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    // the SDL callback writes the ring buffer concurrently
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    if (ms <= 0) {
+        ms = m_len_ms;
+    }
+
+    // number of samples requested, capped by what is actually buffered
+    const size_t n_wanted  = (m_sample_rate * ms) / 1000;
+    const size_t n_samples = n_wanted > m_audio_len ? m_audio_len : n_wanted;
+
+    result.resize(n_samples);
+
+    // index of the oldest requested sample: n_samples behind the write position, modulo the buffer size
+    const size_t i_start = m_audio_pos >= n_samples
+        ? m_audio_pos - n_samples
+        : m_audio_pos + m_audio.size() - n_samples;
+
+    if (i_start + n_samples <= m_audio.size()) {
+        // contiguous range
+        memcpy(result.data(), &m_audio[i_start], n_samples * sizeof(float));
+        return;
+    }
+
+    // the range wraps: first the tail of the buffer, then the head
+    const size_t n_tail = m_audio.size() - i_start;
+
+    memcpy(result.data(), &m_audio[i_start], n_tail * sizeof(float));
+    memcpy(&result[n_tail], &m_audio[0], (n_samples - n_tail) * sizeof(float));
+}
+
+///////////////////////////
+
+// Filters data in place with a first-order recursive filter:
+//   y[i] = alpha * (y[i-1] + x[i] - x[i-1]),  y[0] = x[0]
+//   data        - samples, overwritten with the filtered signal (data[0] is kept as-is)
+//   cutoff      - cutoff frequency in Hz (callers are expected to pass a value > 0)
+//   sample_rate - sampling frequency in Hz
+// An empty input is left untouched.
+// NOTE(review): alpha is computed as dt/(rc + dt); textbook RC high-pass filters use
+// rc/(rc + dt) - confirm this is intended before changing it (VAD thresholds depend on it).
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    // data[0] below would be out of bounds for an empty vector
+    if (data.empty()) {
+        return;
+    }
+
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+// Energy-based check comparing the last last_ms milliseconds of pcmf32 with the whole buffer.
+//   pcmf32      - samples; filtered IN PLACE by high_pass_filter() when freq_thold > 0
+//   sample_rate - sampling frequency in Hz
+//   last_ms     - length of the trailing window, in milliseconds
+//   vad_thold   - the tail counts as "loud" when its mean |x| exceeds vad_thold * overall mean |x|
+//   freq_thold  - high-pass cutoff in Hz; <= 0 disables filtering
+//   verbose     - print the computed energies to stderr
+// Returns true when the trailing window is NOT louder than vad_thold times the overall
+// average, and false otherwise or when the buffer is not longer than the trailing window.
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_total = pcmf32.size();
+    const int n_tail  = (sample_rate * last_ms) / 1000;
+
+    if (n_tail >= n_total) {
+        // not enough samples - assume no speech
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    // first index that belongs to the trailing window
+    const int i_tail = n_total - n_tail;
+
+    float sum_all  = 0.0f;
+    float sum_tail = 0.0f;
+
+    for (int i = 0; i < n_total; i++) {
+        const float mag = fabsf(pcmf32[i]);
+
+        sum_all += mag;
+        if (i >= i_tail) {
+            sum_tail += mag;
+        }
+    }
+
+    // mean absolute amplitude of the whole buffer and of the trailing window
+    const float energy_all  = sum_all  / n_total;
+    const float energy_last = sum_tail / n_tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    // written as a negated '>' so that a NaN energy still yields true, as before
+    return !(energy_last > vad_thold*energy_all);
+}
+
 int main(int argc, char ** argv) {
    whisper_params params;

@ -120,21 +420,20 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
-    params.length_ms = std::max(params.length_ms, params.step_ms);
+    params.keep_ms = std::min(params.keep_ms, params.step_ms); // cannot be more than step_ms

-    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
-    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
+    const int n_samples_step = (params.step_ms  *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (params.keep_ms  *1e-3)*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (30000           *1e-3)*WHISPER_SAMPLE_RATE;
+
+    const int n_new_line = params.length_ms / params.step_ms - 1; // number of steps to print new line

    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD

-    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
-
-    params.no_timestamps  = !use_vad;
-    params.no_context    |= use_vad;
-    params.max_tokens     = 0;
+    params.no_timestamps = !use_vad;
+    params.no_context    = use_vad;
+    params.max_tokens    = 0;

    // init audio

@ -154,10 +453,10 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+    struct whisper_context * ctx = whisper_init(params.model.c_str());

    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
-    std::vector<float> pcmf32_old;
+    std::vector<float> pcmf32_old(n_samples_30s, 0.0f);
    std::vector<float> pcmf32_new(n_samples_30s, 0.0f);

    std::vector<whisper_token> prompt_tokens;
@ -184,7 +483,7 @@ int main(int argc, char ** argv) {
                params.no_timestamps ? 0 : 1);

        if (!use_vad) {
-            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
+            fprintf(stderr, "%s: n_new_line = %d\n", __func__, n_new_line);
        } else {
            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
        }
@ -214,7 +513,23 @@ int main(int argc, char ** argv) {
    // main audio loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }
+
+            if (!is_running) {
+                break;
+            }
+        }

        if (!is_running) {
            break;
@ -237,7 +552,7 @@ int main(int argc, char ** argv) {
                    break;
                }

-                std::this_thread::sleep_for(std::chrono::milliseconds(1));
+                SDL_Delay(1);
            }

            const int n_samples_new = pcmf32_new.size();
@ -268,7 +583,7 @@ int main(int argc, char ** argv) {

            audio.get(2000, pcmf32_new);

-            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
+            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                audio.get(params.length_ms, pcmf32);
            } else {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -288,6 +603,7 @@ int main(int argc, char ** argv) {
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
+            wparams.no_context       = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
@ -296,9 +612,6 @@ int main(int argc, char ** argv) {
            wparams.audio_ctx        = params.audio_ctx;
            wparams.speed_up         = params.speed_up;

-            // disable temperature fallback
-            wparams.temperature_inc  = -1.0f;
-
            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();

--- a/examples/talk-llama/.gitignore
+++ b/examples/talk-llama/.gitignore
@ -1,2 +0,0 @@
-eleven-labs.py
-audio.mp3
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,16 +0,0 @@
-if (WHISPER_SUPPORT_SDL2)
-    # talk-llama
-    set(TARGET talk-llama)
-    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
-    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
-
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
-    include(DefaultTargetOptions)
-endif ()
--- a/examples/talk-llama/README.md
+++ b/examples/talk-llama/README.md
@ -1,36 +0,0 @@
-# talk-llama
-
-Talk with an LLaMA AI in your terminal
-
-[Demo Talk](https://user-images.githubusercontent.com/1991296/228024237-848f998c-c334-46a6-bef8-3271590da83b.mp4)
-
-## Building
-
-The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
-
-```bash
-# Install SDL2 on Linux
-sudo apt-get install libsdl2-dev
-
-# Install SDL2 on Mac OS
-brew install sdl2
-
-# Build the "talk-llama" executable
-make talk-llama
-
-# Run it
-./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
-```
-
- The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
- The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml` compatible LLaMA model
-
-## TTS
-
-For best experience, this example needs a TTS tool to convert the generated text responses to voice.
-You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
-By default, it is configured to use MacOS's `say`, but you can use whatever you wish.
-
-## Discussion
-
-If you have any feedback, please let "us" know in the following discussion: https://github.com/ggerganov/whisper.cpp/discussions/672?converting=1
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -1,152 +0,0 @@
-#ifndef LLAMA_H
-#define LLAMA_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define LLAMA_API __declspec(dllexport)
-#        else
-#            define LLAMA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAMA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAMA_API
-#endif
-
-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct llama_context;
-
-    typedef int llama_token;
-
-    typedef struct llama_token_data {
-        llama_token id;  // token id
-
-        float p;     // probability of the token
-        float plog;  // log probability of the token
-
-    } llama_token_data;
-
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
-
-    struct llama_context_params {
-        int n_ctx;   // text context
-        int n_parts; // -1 for default
-        int seed;    // RNG seed, 0 for random
-
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-    };
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params);
-
-    // Frees all allocated memory
-    LLAMA_API void llama_free(struct llama_context * ctx);
-
-    // TODO: not great API - very likely to change
-    // Returns 0 on success
-    LLAMA_API int llama_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-                   int   itype);
-
-    // Run the llama inference to obtain the logits and probabilities for the next token.
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    LLAMA_API int llama_eval(
-            struct llama_context * ctx,
-               const llama_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-                      const char * text,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-
-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-       struct llama_context * ctx,
-          const llama_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                      float   top_p,
-                      float   temp,
-                      float   repeat_penalty);
-
-    // Performance information
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/examples/talk-llama/prompts/talk-alpaca.txt
+++ b/examples/talk-llama/prompts/talk-alpaca.txt
@ -1,23 +0,0 @@
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-
-Write a text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
-{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{1} responds with short and concise answers.
-
-### Response:
-
-{0}{4} Hello, {1}!
-{1}{4} Hello {0}! How may I help you today?
-{0}{4} What time is it?
-{1}{4} It is {2} o'clock.
-{0}{4} What year is it?
-{1}{4} We are in {3}.
-{0}{4} What is a cat?
-{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
-{0}{4} Name a color.
-{1}{4} Blue
-{0}{4}
--- a/examples/talk-llama/speak.sh
+++ b/examples/talk-llama/speak.sh
@ -1,28 +0,0 @@
-#!/bin/bash
-
-# Usage:
-#  speak.sh <voice_id> <text-to-speak>
-
-# espeak
-# Mac OS: brew install espeak
-# Linux: apt-get install espeak
-#
-#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
-
-# for Mac
-if [ "$1" = "0" ]; then
-    say "$2"
-elif [ "$1" = "1" ]; then
-    say -v "Samantha (Enhanced)" "$2"
-elif [ "$1" = "2" ]; then
-    say -v "Daniel (Enhanced)" "$2"
-elif [ "$1" = "3" ]; then
-    say -v "Veena (Enhanced)" "$2"
-fi
-
-# Eleven Labs
-#
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2" >/dev/null 2>&1
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -1,703 +0,0 @@
-// Talk with AI
-//
-
-#include "common.h"
-#include "common-sdl.h"
-#include "whisper.h"
-#include "llama.h"
-
-#include <map>
-#include <cassert>
-#include <cstdio>
-#include <fstream>
-#include <regex>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int)add_bos);
-    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
-// command-line parameters
-struct whisper_params {
-    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t voice_id   = 0;
-    int32_t voice_ms   = 10000;
-    int32_t capture_id = -1;
-    int32_t max_tokens = 64;
-    int32_t audio_ctx  = 0;
-
-    int32_t n_parts_llama = -1;
-
-    float vad_thold    = 0.4f;
-    float freq_thold   = 100.0f;
-
-    bool speed_up      = false;
-    bool translate     = false;
-    bool print_special = false;
-    bool print_energy  = false;
-    bool no_timestamps = true;
-    bool verbose_prompt = false;
-
-    std::string name_ni     = "Georgi"; // natural    intelligence
-    std::string name_ai     = "LLaMA";  // artificial intelligence
-    std::string language    = "en";
-    std::string model_wsp   = "models/ggml-base.en.bin";
-    std::string model_llama = "models/ggml-llama-7B.bin";
-    std::string speak       = "./examples/talk/speak.sh";
-    std::string prompt      = "";
-    std::string fname_out;
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"   || arg == "--threads")       { params.n_threads      = std::stoi(argv[++i]); }
-        else if (arg == "-vid" || arg == "--voice-id")      { params.voice_id       = std::stoi(argv[++i]); }
-        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms       = std::stoi(argv[++i]); }
-        else if (arg == "-c"   || arg == "--capture")       { params.capture_id     = std::stoi(argv[++i]); }
-        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens     = std::stoi(argv[++i]); }
-        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx      = std::stoi(argv[++i]); }
-        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold      = std::stof(argv[++i]); }
-        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold     = std::stof(argv[++i]); }
-        else if (arg == "--n-parts-llama")                  { params.n_parts_llama  = std::stoi(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up       = true; }
-        else if (arg == "-tr"  || arg == "--translate")     { params.translate      = true; }
-        else if (arg == "-ps"  || arg == "--print-special") { params.print_special  = true; }
-        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy   = true; }
-        else if (arg == "--verbose-prompt")                 { params.verbose_prompt = true; }
-        else if (arg == "-nni" || arg == "--name-ni")       { params.name_ni        = argv[++i]; }
-        else if (arg == "-nai" || arg == "--name-ai")       { params.name_ai        = argv[++i]; }
-        else if (arg == "-l"   || arg == "--language")      { params.language       = argv[++i]; }
-        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp      = argv[++i]; }
-        else if (arg == "-ml"  || arg == "--model-llama")   { params.model_llama    = argv[++i]; }
-        else if (arg == "-s"   || arg == "--speak")         { params.speak          = argv[++i]; }
-        else if (arg == "--prompt-file")                    {
-            std::ifstream file(argv[++i]);
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        }
-        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -vid N,   --voice-id N    [%-7d] voice ID\n",                                    params.voice_id);
-    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n",              params.voice_ms);
-    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
-    fprintf(stderr, "  -nni NAME,--name-ni NAME  [%-7s] natural intelligence name\n",                   params.name_ni.c_str());
-    fprintf(stderr, "  -nai NAME,--name-ai NAME  [%-7s] artificial intelligence name\n",                params.name_ai.c_str());
-    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
-    fprintf(stderr, "  -ml FILE, --model-llama   [%-7s] llama model file\n",                            params.model_llama.c_str());
-    fprintf(stderr, "  --n-parts-llama N         [%-7d] num parts in llama model file\n",               params.n_parts_llama);
-    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  --prompt-file FNAME       [%-7s] file with custom prompt to start dialog\n",     "");
-    fprintf(stderr, "  --verbose-prompt          [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
-    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-std::string transcribe(
-        whisper_context * ctx,
-        const whisper_params & params,
-        const std::vector<float> & pcmf32,
-        const std::string prompt_text,
-        float & prob,
-        int64_t & t_ms) {
-    const auto t_start = std::chrono::high_resolution_clock::now();
-
-    prob = 0.0f;
-    t_ms = 0;
-
-    std::vector<whisper_token> prompt_tokens;
-
-    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-    prompt_tokens.resize(1024);
-    prompt_tokens.resize(whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size()));
-
-    wparams.print_progress   = false;
-    wparams.print_special    = params.print_special;
-    wparams.print_realtime   = false;
-    wparams.print_timestamps = !params.no_timestamps;
-    wparams.translate        = params.translate;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.max_tokens       = params.max_tokens;
-    wparams.language         = params.language.c_str();
-    wparams.n_threads        = 2;
-
-    wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-    wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();
-
-    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
-
-    static int iter = params.voice_id;
-    std::this_thread::sleep_for(std::chrono::milliseconds(100*iter));
-    iter = (iter + 1) % 4;
-
-    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-        return "";
-    }
-
-    int prob_n = 0;
-    std::string result;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-
-        result += text;
-
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        for (int j = 0; j < n_tokens; ++j) {
-            const auto token = whisper_full_get_token_data(ctx, i, j);
-
-            prob += token.p;
-            ++prob_n;
-        }
-    }
-
-    if (prob_n > 0) {
-        prob /= prob_n;
-    }
-
-    const auto t_end = std::chrono::high_resolution_clock::now();
-    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
-
-    return result;
-}
-
-const std::vector<std::string> k_participants = {
-    "LLaMA",
-    "GGaMA",
-    "SSaMA",
-    "RRaMA",
-};
-
-// homophones
-const std::map<std::string, std::vector<std::string>> k_homophones = {
-    { "LLaMA", { "llama", "Llama", "LLAMA", }, },
-    { "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA", }, },
-    { "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA", }, },
-    { "RRaMA", { "rama", "Rama", "RAMA", "ramma", "Ramma", "RAMMA", }, },
-};
-
-const std::string k_prompt_whisper = R"(A conversation between {1}, {10}, {11}, {12} and {13}.)";
-
-const std::map<std::string, std::string> k_prompt = {
-    {
-        k_participants.at(0),
-        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
-There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{10}, {11}, {12} and {13} respond with short and concise answers.
-{10} is smart, objective, honest and kind. Never fails to give a meaningful and insightful answer and opinion.
-{1} is leading the conversation and asking the questions.
-
-{1}{4} Hello {10}! What is your opinion on the current state of the world?
-{10}{4} Great question {1}! I think we live in a very interesting time.
-There are many things to be concerned about, but also many things to be optimistic about.
-{1}{4} What advice would you give to a young person who is just starting out in life?
-{10}{4} I would tell them to be patient and to not be afraid to fail.
-It is important to learn from your mistakes and to keep trying.
-{1}{4})"
-    },
-    {
-        k_participants.at(1),
-        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
-There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{10}, {11}, {12} and {13} respond with short and concise answers.
-{11} has critical thinking skills, is very knowledgeable and is a good listener. He is very humble and never arrogant.
-{1} is leading the conversation and asking the questions.
-
-{1}{4} Hello {11}! What is your opinion on the current state of the world?
-{11}{4} The world is about to experience a major change. We are on the verge of a new era.
-{1}{4} What advice would you give to a young person who is just starting out in life?
-{11}{4} My advice would be to be open minded and to be willing to learn from others.
-{1}{4})"
-    },
-    {
-        k_participants.at(2),
-        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
-There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{10}, {11}, {12} and {13} respond with short and concise answers.
-{12} has strong leadership skills, strategic thinking, and innovative ideas. Has the ability to mentor and support young people.
-{1} is leading the conversation and asking the questions.
-
-{1}{4} Hello {12}! What is your opinion on the current state of the world?
-{12}{4} Our future is bright. We are living in a time of great opportunity.
-{1}{4} What advice would you give to a young person who is just starting out in life?
-{12}{4} I would tell them to be brave and to be willing to take risks.
-{1}{4})"
-    },
-    {
-        k_participants.at(3),
-        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
-There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-{10}, {11}, {12} and {13} respond with short and concise answers.
-{13} is rude, arrogant, and has a bad attitude. He is very opinionated and never listens to others.
-{1} is leading the conversation and asking the questions.
-
-{1}{4} Hello {13}! What is your opinion on the current state of the world?
-{13}{4} The world is a terrible place. It is full of evil and corruption.
-{1}{4} What advice would you give to a young person who is just starting out in life?
-{13}{4} I would tell them to be selfish and to never trust anyone.
-{1}{4})"
-    },
-};
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
-
-    // llama init
-
-    auto lparams = llama_context_default_params();
-
-    // tune these to your liking
-    lparams.n_ctx      = 512;
-    lparams.seed       = 1;
-    lparams.f16_kv     = true;
-    lparams.n_parts    = params.n_parts_llama;
-
-    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
-
-    // print some info about the processing
-    {
-        fprintf(stderr, "\n");
-
-        if (!whisper_is_multilingual(ctx_wsp)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
-                __func__,
-                params.n_threads,
-                params.language.c_str(),
-                params.translate ? "translate" : "transcribe",
-                params.no_timestamps ? 0 : 1);
-
-        fprintf(stderr, "\n");
-    }
-
-
-    // init audio
-
-    audio_async audio(30*1000);
-    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        return 1;
-    }
-
-    audio.resume();
-
-    int n_iter = 0;
-
-    bool is_running  = true;
-    bool force_speak = false;
-
-    float prob0 = 0.0f;
-
-    const std::string chat_symb = ":";
-
-    const std::string name_ni  = params.name_ni;
-    const std::string name_ai  = params.name_ai;
-
-    // the participant that was referenced last
-    std::string name_ref = name_ni;
-
-    std::vector<float> pcmf32_cur;
-    std::vector<float> pcmf32_prompt;
-
-    std::string prompt_whisper = k_prompt_whisper;
-
-    prompt_whisper = ::replace(prompt_whisper, "{1}",  name_ni);
-    prompt_whisper = ::replace(prompt_whisper, "{10}", k_participants.at(0));
-    prompt_whisper = ::replace(prompt_whisper, "{11}", k_participants.at(1));
-    prompt_whisper = ::replace(prompt_whisper, "{12}", k_participants.at(2));
-    prompt_whisper = ::replace(prompt_whisper, "{13}", k_participants.at(3));
-
-    // construct the initial prompt for LLaMA inference
-    std::string prompt_llama = params.prompt.empty() ? k_prompt.find(name_ai)->second : params.prompt;
-
-    // need to have leading ' '
-    prompt_llama.insert(0, 1, ' ');
-
-    prompt_llama = ::replace(prompt_llama, "{1}",  name_ni);
-    prompt_llama = ::replace(prompt_llama, "{10}", k_participants.at(0));
-    prompt_llama = ::replace(prompt_llama, "{11}", k_participants.at(1));
-    prompt_llama = ::replace(prompt_llama, "{12}", k_participants.at(2));
-    prompt_llama = ::replace(prompt_llama, "{13}", k_participants.at(3));
-
-    {
-        // get date string
-        std::string date_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%d/%m/%Y", now);
-            date_str = buf;
-        }
-        prompt_llama = ::replace(prompt_llama, "{1}", date_str);
-    }
-
-    {
-        // get time string
-        std::string time_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%H:%M", now);
-            time_str = buf;
-        }
-        prompt_llama = ::replace(prompt_llama, "{2}", time_str);
-    }
-
-    {
-        // get year string
-        std::string year_str;
-        {
-            time_t t = time(0);
-            struct tm * now = localtime(&t);
-            char buf[128];
-            strftime(buf, sizeof(buf), "%Y", now);
-            year_str = buf;
-        }
-        prompt_llama = ::replace(prompt_llama, "{3}", year_str);
-    }
-
-    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
-
-    // evaluate the initial prompt
-
-    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
-
-    printf("\n");
-    printf("%s : initializing - please wait ...\n", __func__);
-
-    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
-        return 1;
-    }
-
-    if (params.verbose_prompt) {
-        fprintf(stdout, "\n");
-        fprintf(stdout, "%s", prompt_whisper.c_str());
-        fprintf(stdout, "\n");
-
-        fprintf(stdout, "\n");
-        fprintf(stdout, "%s", prompt_llama.c_str());
-        fprintf(stdout, "\n");
-        fprintf(stdout, "\n");
-        fflush(stdout);
-    }
-
-    printf("%s : done! start speaking in the microphone\n", __func__);
-    printf("\n");
-    printf("%s%s", name_ni.c_str(), chat_symb.c_str());
-    fflush(stdout);
-
-    // clear audio buffer
-    audio.clear();
-
-    // text inference variables
-    const int voice_id = params.voice_id;
-    const int n_keep   = embd_inp.size();
-    const int n_ctx    = llama_n_ctx(ctx_llama);
-
-    int n_past = n_keep;
-    int n_prev = 64; // TODO arg
-
-    std::vector<llama_token> embd;
-
-    // reverse prompts for detecting when it's time to stop speaking
-    std::vector<std::string> antiprompts = {
-        name_ni + chat_symb,
-    };
-
-    for (const auto & p : k_participants) {
-        antiprompts.push_back(p + chat_symb);
-    }
-
-    std::string text_heard_all;
-
-    // main loop
-    while (is_running) {
-        // handle Ctrl + C
-        is_running = sdl_poll_events();
-
-        if (!is_running) {
-            break;
-        }
-
-        // delay
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-        int64_t t_ms = 0;
-
-        {
-            audio.get(15000, pcmf32_cur);
-
-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
-                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
-
-                audio.get(params.voice_ms, pcmf32_cur);
-
-                std::string text_heard;
-
-                if (!force_speak) {
-                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\[.*?\\]");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\(.*?\\)");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
-                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-
-                // take first line
-                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
-
-                // remove leading and trailing whitespace
-                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
-                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-
-                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
-
-                if (text_heard.empty() || tokens.empty() || force_speak) {
-                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
-                    audio.clear();
-
-                    continue;
-                }
-
-                force_speak = false;
-
-                if (text_heard[0] != ' ') {
-                    text_heard.insert(0, 1, ' ');
-                }
-
-                // replace homophones
-                for (const auto & homophone : k_homophones) {
-                    for (const auto & word : homophone.second) {
-                        text_heard = ::replace(text_heard, word, homophone.first);
-                    }
-                }
-
-                // check which participant was mentioned
-                const auto name_ref_old = name_ref;
-                for (const auto & participant : k_participants) {
-                    if (participant == name_ref) {
-                        continue;
-                    }
-
-                    if (text_heard.find(participant) != std::string::npos) {
-                        name_ref = participant;
-                        break;
-                    }
-                }
-                if (name_ref == name_ref_old && name_ref != name_ai) {
-                    name_ref = name_ni;
-                }
-
-                text_heard += "\n" + name_ref + chat_symb;
-                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
-                fflush(stdout);
-
-                text_heard_all += text_heard;
-                // keep only last 100 characters
-                if (text_heard_all.size() > 100) {
-                    text_heard_all = text_heard_all.substr(text_heard_all.size() - 100);
-                }
-
-                if (name_ref != name_ai) {
-                } else {
-                    // text inference
-                    bool done = false;
-                    std::string text_to_speak;
-
-                    embd = ::llama_tokenize(ctx_llama, text_heard_all, false);
-                    text_heard_all.clear();
-
-                    while (true) {
-                        // predict
-                        if (embd.size() > 0) {
-                            if (n_past + (int) embd.size() > n_ctx) {
-                                n_past = n_keep;
-
-                                // insert n_left/2 tokens at the start of embd from last_n_tokens
-                                embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
-
-                                //printf("\n---\n");
-                                //printf("resetting: '");
-                                //for (int i = 0; i < (int) embd.size(); i++) {
-                                //    printf("%s", llama_token_to_str(ctx_llama, embd[i]));
-                                //}
-                                //printf("'\n");
-                                //printf("\n---\n");
-                            }
-
-                            if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
-                                fprintf(stderr, "%s : failed to eval\n", __func__);
-                                return 1;
-                            }
-                        }
-
-                        //printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
-
-                        embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
-                        n_past += embd.size();
-                        embd.clear();
-
-                        if (done) break;
-
-                        {
-                            // out of user input, sample next token
-                            const float top_k          = 5;
-                            const float top_p          = 0.80f;
-                            const float temp           = 0.20f;
-                            const float repeat_penalty = 1.0764f;
-
-                            const int repeat_last_n    = 256;
-
-                            llama_token id = 0;
-
-                            {
-                                auto logits = llama_get_logits(ctx_llama);
-                                logits[llama_token_eos()] = 0;
-
-                                id = llama_sample_top_p_top_k(ctx_llama,
-                                        embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                                        repeat_last_n, top_k, top_p, temp, repeat_penalty);
-                            }
-
-                            if (id != llama_token_eos()) {
-                                // add it to the context
-                                embd.push_back(id);
-
-                                text_to_speak += llama_token_to_str(ctx_llama, id);
-
-                                printf("%s", llama_token_to_str(ctx_llama, id));
-                            }
-
-                            // new line
-                            if (id == 13) {
-                            }
-                        }
-
-                        {
-                            std::string last_output;
-                            for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-                                last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
-                            }
-                            last_output += llama_token_to_str(ctx_llama, embd[0]);
-
-                            for (const std::string & antiprompt : antiprompts) {
-                                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                                    done = true;
-                                    text_to_speak = ::replace(text_to_speak, antiprompt, "");
-                                    fflush(stdout);
-                                    break;
-                                }
-                            }
-                        }
-
-                        is_running = sdl_poll_events();
-
-                        if (!is_running) {
-                            break;
-                        }
-                    }
-
-                    text_to_speak = ::replace(text_to_speak, "\"", "");
-                    system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
-                }
-
-                audio.clear();
-
-                ++n_iter;
-            }
-        }
-    }
-
-    audio.pause();
-
-    whisper_print_timings(ctx_wsp);
-    whisper_free(ctx_wsp);
-
-    llama_print_timings(ctx_llama);
-    llama_free(ctx_llama);
-
-    return 0;
-}
--- a/examples/talk.wasm/CMakeLists.txt
+++ b/examples/talk.wasm/CMakeLists.txt
@ -9,8 +9,6 @@ add_executable(${TARGET}
    gpt-2.cpp
    )

-include(DefaultTargetOptions)
-
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
@ -33,8 +31,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1800MB \
-    -s TOTAL_MEMORY=1800MB \
+    -s INITIAL_MEMORY=1600MB \
+    -s TOTAL_MEMORY=1600MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/talk.wasm/README.md
+++ b/examples/talk.wasm/README.md
@ -36,7 +36,7 @@ In order to run this demo efficiently, you need to have the following:
 - Latest Chrome or Firefox browser (Safari is not supported)
 - Run this on a desktop or laptop with modern CPU (a mobile phone will likely not be good enough)
 - Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
- The web-page uses about 1.8GB of RAM
+- The web-page uses about 1.6GB of RAM

 Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
 Also, the prompting strategy can likely be improved to achieve better results.
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@ -271,7 +271,7 @@ EMSCRIPTEN_BINDINGS(talk) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init_from_file(path_model.c_str());
+                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -325,12 +325,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ false,
-        };
-
+        struct ggml_init_params params;
+        params.mem_size   = ctx_size;
+        params.mem_buffer = NULL;

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -531,11 +528,9 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
+    struct ggml_init_params params;
+    params.mem_size   = buf_size;
+    params.mem_buffer = buf;

    struct ggml_context * ctx0 = ggml_init(params);

--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@ -7,10 +7,7 @@ if (WHISPER_SUPPORT_SDL2)

    # TODO: this is temporary
    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
-
-    include(DefaultTargetOptions)
-
+    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```

 ## TTS
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -325,11 +325,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ false,
-        };
+        struct ggml_init_params params;
+        params.mem_size   = ctx_size;
+        params.mem_buffer = nullptr;

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -530,11 +528,9 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
+    struct ggml_init_params params;
+    params.mem_size   = buf_size;
+    params.mem_buffer = buf;

    struct ggml_context * ctx0 = ggml_init(params);

--- a/examples/talk/speak.sh
+++ b/examples/talk/speak.sh
@ -7,10 +7,7 @@
 # Mac OS: brew install espeak
 # Linux: apt-get install espeak
 #
-#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
-
-# Mac OS "say" command
-say "$2"
+espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"

 # Eleven Labs
 #
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -1,14 +1,16 @@
 // Talk with AI
 //

-#include "common.h"
-#include "common-sdl.h"
 #include "whisper.h"
 #include "gpt-2.h"

+#include <SDL.h>
+#include <SDL_audio.h>
+
 #include <cassert>
 #include <cstdio>
 #include <fstream>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@ -103,6 +105,320 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

+//
+// SDL Audio capture
+//
+
+// Captures mono float audio from an SDL input device into a fixed-size
+// circular buffer and hands out the most recent part of it on request.
+class audio_async {
+public:
+    // len_ms - length of the circular buffer, in milliseconds
+    audio_async(int len_ms);
+    ~audio_async();
+
+    // initialize SDL audio, open the capture device and allocate the buffer
+    // returns false if SDL or the device could not be initialized
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep the last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    bool pause();
+    bool clear();
+
+    // callback to be called by SDL
+    // stream - captured data, len - its size in bytes
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // ms <= 0 requests the whole buffer
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    // SDL id of the opened capture device, 0 when no device is open
+    SDL_AudioDeviceID m_dev_id_in = 0;
+
+    int m_len_ms = 0;
+    int m_sample_rate = 0;
+
+    // set by resume(), cleared by pause(), read by callback()
+    // NOTE(review): plain bool with no synchronization - if SDL invokes the
+    //               callback on a separate thread, consider std::atomic<bool>
+    bool       m_running = false;
+    // guards m_audio, m_audio_pos and m_audio_len
+    std::mutex m_mutex;
+
+    // m_audio     - the circular buffer
+    // m_audio_new - scratch copy of the most recent callback chunk
+    // m_audio_pos - index at which the next sample will be written
+    // m_audio_len - number of valid samples currently stored
+    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
+    size_t             m_audio_pos = 0;
+    size_t             m_audio_len = 0;
+};
+
+// Remember the requested buffer length (in milliseconds); the buffer itself
+// is only allocated once the sample rate is known.
+audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
+}
+
+// Close the capture device, if one was ever opened.
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initialize SDL audio, open a capture device and allocate the circular buffer.
+//
+// capture_id  - index of the capture device to open; a negative value selects
+//               the default capture device
+// sample_rate - requested sample rate in Hz
+//
+// Returns true on success. Returns false (after printing a diagnostic) if SDL
+// cannot be initialized or the device cannot be opened.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // list the available capture devices to help the user pick a capture_id
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    // request mono 32-bit float samples, delivered in chunks of 1024 samples
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    // trampoline: SDL passes back `userdata` (this object), forward to the member callback
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    // NOTE(review): SDL_GetAudioDeviceName() may return NULL for an out-of-range
+    //               capture_id; the result is passed to "%s" and to
+    //               SDL_OpenAudioDevice() unchecked - confirm this is acceptable
+    if (capture_id >= 0) {
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        m_dev_id_in = 0;
+
+        return false;
+    } else {
+        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+                capture_spec_requested.format);
+        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+                capture_spec_requested.channels);
+        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+        fprintf(stderr, "\n");
+    }
+
+    // use the rate SDL reports back, not the one that was requested
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // room for m_len_ms milliseconds of audio at the obtained sample rate
+    m_audio.resize((m_sample_rate*m_len_ms)/1000);
+
+    return true;
+}
+
+// Un-pause the capture device so that the callback starts receiving audio.
+// Returns false (with a diagnostic) if no device is open or capture is
+// already running.
+bool audio_async::resume() {
+    const bool have_device = m_dev_id_in != 0;
+
+    if (have_device && !m_running) {
+        SDL_PauseAudioDevice(m_dev_id_in, 0);
+        m_running = true;
+
+        return true;
+    }
+
+    if (!have_device) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+    } else {
+        fprintf(stderr, "%s: already running!\n", __func__);
+    }
+
+    return false;
+}
+
+// Pause the capture device; the callback stops storing audio afterwards.
+// Returns false (with a diagnostic) if no device is open or capture is
+// already paused.
+bool audio_async::pause() {
+    bool paused = false;
+
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+    } else if (m_running == false) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+    } else {
+        SDL_PauseAudioDevice(m_dev_id_in, 1);
+        m_running = false;
+
+        paused = true;
+    }
+
+    return paused;
+}
+
+// Discard everything captured so far by resetting the write position and the
+// stored length. Returns false (with a diagnostic) if no device is open or
+// capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (m_running == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the callback updates the same fields, so reset them under the mutex
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+//
+// stream - freshly captured audio, interpreted as 32-bit float samples
+// len    - size of `stream` in bytes
+//
+// The samples are appended to the circular buffer m_audio, overwriting the
+// oldest data once the buffer is full. Nothing is stored while capture is
+// not running.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    size_t n_samples = len / sizeof(float);
+    if (n_samples == 0) {
+        return;
+    }
+
+    // float-typed copy of the incoming bytes, so that every offset below is
+    // expressed in samples and not in bytes
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        const size_t n_cap = m_audio.size();
+        if (n_cap == 0) {
+            // buffer not allocated - nothing can be stored
+            return;
+        }
+
+        const float * src = m_audio_new.data();
+
+        // a chunk larger than the whole buffer: keep only its newest n_cap samples
+        if (n_samples > n_cap) {
+            src       += n_samples - n_cap;
+            n_samples  = n_cap;
+        }
+
+        if (m_audio_pos + n_samples > n_cap) {
+            // the chunk wraps around the end of the buffer - copy it in two parts
+            const size_t n0 = n_cap - m_audio_pos;
+
+            memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
+            // src is a float pointer, so "+ n0" skips n0 samples
+            // (indexing the uint8_t stream with n0 would only skip n0 bytes)
+            memcpy(&m_audio[0], src + n0, (n_samples - n0) * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % n_cap;
+            m_audio_len = n_cap;
+        } else {
+            memcpy(&m_audio[m_audio_pos], src, n_samples * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % n_cap;
+            m_audio_len = std::min(m_audio_len + n_samples, n_cap);
+        }
+    }
+}
+
+// Copy the most recent `ms` milliseconds of captured audio into `result`.
+//
+// ms     - amount of audio requested; values <= 0 request the whole buffer
+// result - cleared, then filled with the samples, oldest first; it receives
+//          fewer samples than requested when less audio has been stored
+//
+// If no device is open or capture is not running, a diagnostic is printed and
+// `result` is left untouched.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (ms <= 0) {
+            ms = m_len_ms;
+        }
+
+        // 64-bit intermediate so that sample_rate*ms cannot overflow int
+        size_t n_samples = (size_t) (((int64_t) m_sample_rate * ms) / 1000);
+        if (n_samples > m_audio_len) {
+            n_samples = m_audio_len;
+        }
+
+        if (n_samples == 0) {
+            // nothing stored (or nothing requested) - result stays empty
+            return;
+        }
+
+        result.resize(n_samples);
+
+        // index of the oldest requested sample; n_samples <= n_cap here, so the
+        // unsigned expression cannot underflow
+        const size_t n_cap = m_audio.size();
+        const size_t s0    = (m_audio_pos + n_cap - n_samples) % n_cap;
+
+        if (s0 + n_samples > n_cap) {
+            // the requested range wraps around the end of the buffer
+            const size_t n0 = n_cap - s0;
+
+            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
+            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+        } else {
+            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
+        }
+    }
+}
+
+///////////////////////////
+
+// Return a copy of `s` with leading and trailing whitespace removed.
+std::string trim(const std::string & s) {
+    return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
+}
+
+// Return a copy of `s` in which every non-overlapping occurrence of `from`
+// is replaced by `to`. Text inserted by a replacement is not scanned again.
+// An empty `from` matches nothing and `s` is returned unchanged.
+std::string replace(const std::string & s, const std::string & from, const std::string & to) {
+    std::string result = s;
+
+    // find("") succeeds at every position, which would make the loop below
+    // run forever
+    if (from.empty()) {
+        return result;
+    }
+
+    size_t pos = 0;
+    while ((pos = result.find(from, pos)) != std::string::npos) {
+        result.replace(pos, from.length(), to);
+        // continue after the inserted text so that it is never matched again
+        pos += to.length();
+    }
+    return result;
+}
+
+// Filter `data` in place with a first-order recurrence derived from `cutoff`
+// (Hz) and `sample_rate` (Hz). The first sample is left unchanged; an empty
+// vector is a no-op.
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    if (data.empty()) {
+        // data[0] below would read past the end of an empty vector
+        return;
+    }
+
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    // NOTE(review): for i >= 2, data[i - 1] already holds the previous output,
+    //               not the raw input sample - confirm this is intended before
+    //               relying on the exact frequency response
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+// Energy-based voice activity check.
+//
+// Compares the mean absolute amplitude of the last `last_ms` milliseconds of
+// `pcmf32` with the mean over the whole buffer. Returns false when the buffer
+// holds no more than `last_ms` of audio, or when the tail energy exceeds
+// vad_thold times the overall energy; returns true otherwise.
+//
+// When freq_thold > 0 the samples are first filtered in place with
+// high_pass_filter(), so `pcmf32` may be modified. `verbose` prints the
+// measured energies to stderr.
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_total = pcmf32.size();
+    const int n_tail  = (sample_rate * last_ms) / 1000;
+
+    // not enough samples - assume no speech
+    if (n_tail >= n_total) {
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    // energy over the whole buffer
+    float avg_all = 0.0f;
+    for (int i = 0; i < n_total; ++i) {
+        avg_all += fabsf(pcmf32[i]);
+    }
+    avg_all /= n_total;
+
+    // energy over the trailing window only
+    float avg_tail = 0.0f;
+    for (int i = n_total - n_tail; i < n_total; ++i) {
+        avg_tail += fabsf(pcmf32[i]);
+    }
+    avg_tail /= n_tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, avg_all, avg_tail, vad_thold, freq_thold);
+    }
+
+    // written as a negated '>' so that a NaN energy still yields true
+    return !(avg_tail > vad_thold*avg_all);
+}
+
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

@ -182,7 +498,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
+    struct whisper_context * ctx_wsp = whisper_init(params.model_wsp.c_str());

    // gpt init

@ -241,10 +557,22 @@ int main(int argc, char ** argv) {
    // main loop
    while (is_running) {
        // handle Ctrl + C
-        is_running = sdl_poll_events();
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }

-        if (!is_running) {
-            break;
+            if (!is_running) {
+                break;
+            }
        }

        // delay
@ -255,7 +583,7 @@ int main(int argc, char ** argv) {
        {
            audio.get(2000, pcmf32_cur);

-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
+            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                audio.get(params.voice_ms, pcmf32_cur);
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.

-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
@ -2,7 +2,6 @@ package com.whispercppdemo.ui.main

 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
-import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable
@ -20,7 +19,6 @@ fun MainScreen(viewModel: MainScreenViewModel) {
        canTranscribe = viewModel.canTranscribe,
        isRecording = viewModel.isRecording,
        messageLog = viewModel.dataLog,
-        onBenchmarkTapped = viewModel::benchmark,
        onTranscribeSampleTapped = viewModel::transcribeSample,
        onRecordTapped = viewModel::toggleRecord
    )
@ -32,7 +30,6 @@ private fun MainScreen(
    canTranscribe: Boolean,
    isRecording: Boolean,
    messageLog: String,
-    onBenchmarkTapped: () -> Unit,
    onTranscribeSampleTapped: () -> Unit,
    onRecordTapped: () -> Unit
 ) {
@ -48,11 +45,8 @@ private fun MainScreen(
                .padding(innerPadding)
                .padding(16.dp)
        ) {
-            Column(verticalArrangement = Arrangement.SpaceBetween) {
-                Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
-                    BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
-                    TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
-                }
+            Row(horizontalArrangement = Arrangement.SpaceBetween) {
+                TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
                RecordButton(
                    enabled = canTranscribe,
                    isRecording = isRecording,
@ -66,16 +60,7 @@ private fun MainScreen(

@Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
-        Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
-    }
-}
-
-@Composable
-private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
-    Button(onClick = onClick, enabled = enabled) {
-        Text("Benchmark")
-    }
+    Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
 }

@Composable
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -41,15 +41,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {

    init {
        viewModelScope.launch {
-            printSystemInfo()
            loadData()
        }
    }

-    private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
-    }
-
    private suspend fun loadData() {
        printMessage("Loading data...\n")
        try {
@ -69,46 +64,22 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
    private suspend fun copyAssets() = withContext(Dispatchers.IO) {
        modelsPath.mkdirs()
        samplesPath.mkdirs()
-        //application.copyData("models", modelsPath, ::printMessage)
+        application.copyData("models", modelsPath, ::printMessage)
        application.copyData("samples", samplesPath, ::printMessage)
        printMessage("All data copied to working directory.\n")
    }

    private suspend fun loadBaseModel() = withContext(Dispatchers.IO) {
        printMessage("Loading model...\n")
-        val models = application.assets.list("models/")
-        if (models != null) {
-            whisperContext = WhisperContext.createContextFromAsset(application.assets, "models/" + models[0])
-            printMessage("Loaded model ${models[0]}.\n")
-        }
-
-        //val firstModel = modelsPath.listFiles()!!.first()
-        //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
-    }
-
-    fun benchmark() = viewModelScope.launch {
-        runBenchmark(6)
+        val firstModel = modelsPath.listFiles()!!.first()
+        whisperContext = WhisperContext.createContext(firstModel.absolutePath)
+        printMessage("Loaded model ${firstModel.name}.\n")
    }

    fun transcribeSample() = viewModelScope.launch {
        transcribeAudio(getFirstSample())
    }

-    private suspend fun runBenchmark(nthreads: Int) {
-        if (!canTranscribe) {
-            return
-        }
-
-        canTranscribe = false
-
-        printMessage("Running benchmark. This will take minutes...\n")
-        whisperContext?.benchMemory(nthreads)?.let{ printMessage(it) }
-        printMessage("\n")
-        whisperContext?.benchGgmlMulMat(nthreads)?.let{ printMessage(it) }
-
-        canTranscribe = true
-    }
-
    private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
        samplesPath.listFiles()!!.first()
    }
@ -138,14 +109,11 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        canTranscribe = false

        try {
-            printMessage("Reading wave samples... ")
+            printMessage("Reading wave samples...\n")
            val data = readAudioSamples(file)
-            printMessage("${data.size / (16000 / 1000)} ms\n")
            printMessage("Transcribing data...\n")
-            val start = System.currentTimeMillis()
            val text = whisperContext?.transcribeData(data)
-            val elapsed = System.currentTimeMillis() - start
-            printMessage("Done ($elapsed ms): $text\n")
+            printMessage("Done: $text\n")
        } catch (e: Exception) {
            Log.w(LOG_TAG, e)
            printMessage("${e.localizedMessage}\n")
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -1,11 +1,9 @@
 package com.whispercppdemo.whisper

-import android.content.res.AssetManager
 import android.os.Build
 import android.util.Log
 import kotlinx.coroutines.*
 import java.io.File
-import java.io.InputStream
 import java.util.concurrent.Executors

 private const val LOG_TAG = "LibWhisper"
@ -27,14 +25,6 @@ class WhisperContext private constructor(private var ptr: Long) {
        }
    }

-    suspend fun benchMemory(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchMemcpy(nthreads)
-    }
-
-    suspend fun benchGgmlMulMat(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchGgmlMulMat(nthreads)
-    }
-
    suspend fun release() = withContext(scope.coroutineContext) {
        if (ptr != 0L) {
            WhisperLib.freeContext(ptr)
@ -49,35 +39,13 @@ class WhisperContext private constructor(private var ptr: Long) {
    }

    companion object {
-        fun createContextFromFile(filePath: String): WhisperContext {
+        fun createContext(filePath: String): WhisperContext {
            val ptr = WhisperLib.initContext(filePath)
            if (ptr == 0L) {
                throw java.lang.RuntimeException("Couldn't create context with path $filePath")
            }
            return WhisperContext(ptr)
        }
-
-        fun createContextFromInputStream(stream: InputStream): WhisperContext {
-            val ptr = WhisperLib.initContextFromInputStream(stream)
-
-            if (ptr == 0L) {
-                throw java.lang.RuntimeException("Couldn't create context from input stream")
-            }
-            return WhisperContext(ptr)
-        }
-
-        fun createContextFromAsset(assetManager: AssetManager, assetPath: String): WhisperContext {
-            val ptr = WhisperLib.initContextFromAsset(assetManager, assetPath)
-
-            if (ptr == 0L) {
-                throw java.lang.RuntimeException("Couldn't create context from asset $assetPath")
-            }
-            return WhisperContext(ptr)
-        }
-
-        fun getSystemInfo(): String {
-            return WhisperLib.getSystemInfo()
-        }
    }
 }

@ -86,7 +54,6 @@ private class WhisperLib {
        init {
            Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
            var loadVfpv4 = false
-            var loadV8fp16 = false
            if (isArmEabiV7a()) {
                // armeabi-v7a needs runtime detection support
                val cpuInfo = cpuInfo()
@ -97,24 +64,11 @@ private class WhisperLib {
                        loadVfpv4 = true
                    }
                }
-            } else if (isArmEabiV8a()) {
-                // ARMv8.2a needs runtime detection support
-                val cpuInfo = cpuInfo()
-                cpuInfo?.let {
-                    Log.d(LOG_TAG, "CPU info: $cpuInfo")
-                    if (cpuInfo.contains("fphp")) {
-                        Log.d(LOG_TAG, "CPU supports fp16 arithmetic")
-                        loadV8fp16 = true
-                    }
-                }
            }

            if (loadVfpv4) {
                Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
                System.loadLibrary("whisper_vfpv4")
-            } else if (loadV8fp16) {
-                Log.d(LOG_TAG, "Loading libwhisper_v8fp16_va.so")
-                System.loadLibrary("whisper_v8fp16_va")
            } else {
                Log.d(LOG_TAG, "Loading libwhisper.so")
                System.loadLibrary("whisper")
@ -122,16 +76,11 @@ private class WhisperLib {
        }

        // JNI methods
-        external fun initContextFromInputStream(inputStream: InputStream): Long
-        external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
        external fun initContext(modelPath: String): Long
        external fun freeContext(contextPtr: Long)
        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
        external fun getTextSegmentCount(contextPtr: Long): Int
        external fun getTextSegment(contextPtr: Long, index: Int): String
-        external fun getSystemInfo(): String
-        external fun benchMemcpy(nthread: Int): String
-        external fun benchGgmlMulMat(nthread: Int): String
    }
 }

@ -139,10 +88,6 @@ private fun isArmEabiV7a(): Boolean {
    return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
 }

-private fun isArmEabiV8a(): Boolean {
-    return Build.SUPPORTED_ABIS[0].equals("arm64-v8a")
-}
-
 private fun cpuInfo(): String? {
    return try {
        File("/proc/cpuinfo").inputStream().bufferedReader().use {
--- a/examples/whisper.android/app/src/main/jni/whisper/Android.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Android.mk
@ -12,15 +12,4 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
 	LOCAL_CFLAGS += -mfpu=neon-vfpv4
 	include $(BUILD_SHARED_LIBRARY)
-endif
-
-ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-	include $(CLEAR_VARS)
-	LOCAL_MODULE    := libwhisper_v8fp16_va
-	include $(LOCAL_PATH)/Whisper.mk
-	# Allow building NEON FMA code.
-	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
-	LOCAL_CFLAGS += -march=armv8.2-a+fp16
-	include $(BUILD_SHARED_LIBRARY)
-endif
-
+endif
--- a/examples/whisper.android/app/src/main/jni/whisper/Whisper.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Whisper.mk
@ -1,5 +1,5 @@
 WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
-LOCAL_LDLIBS    := -landroid -llog
+LOCAL_LDLIBS    := -llog

 # Make the final output library smaller by only keeping the symbols referenced from the app.
 ifneq ($(APP_OPTIM),debug)
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -1,18 +1,13 @@
 #include <jni.h>
-#include <android/asset_manager.h>
-#include <android/asset_manager_jni.h>
 #include <android/log.h>
 #include <stdlib.h>
 #include <sys/sysinfo.h>
-#include <string.h>
 #include "whisper.h"
-#include "ggml.h"

 #define UNUSED(x) (void)(x)
 #define TAG "JNI"

 #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,     TAG, __VA_ARGS__)
-#define LOGW(...) __android_log_print(ANDROID_LOG_WARN,     TAG, __VA_ARGS__)

 static inline int min(int a, int b) {
    return (a < b) ? a : b;
@ -22,132 +17,13 @@ static inline int max(int a, int b) {
    return (a > b) ? a : b;
 }

-struct input_stream_context {
-    size_t offset;
-    JNIEnv * env;
-    jobject thiz;
-    jobject input_stream;
-
-    jmethodID mid_available;
-    jmethodID mid_read;
-};
-
-size_t inputStreamRead(void * ctx, void * output, size_t read_size) {
-    struct input_stream_context* is = (struct input_stream_context*)ctx;
-
-    jint avail_size = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_available);
-    jint size_to_copy = read_size < avail_size ? (jint)read_size : avail_size;
-
-    jbyteArray byte_array = (*is->env)->NewByteArray(is->env, size_to_copy);
-
-    jint n_read = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_read, byte_array, 0, size_to_copy);
-
-    if (size_to_copy != read_size || size_to_copy != n_read) {
-        LOGI("Insufficient Read: Req=%zu, ToCopy=%d, Available=%d", read_size, size_to_copy, n_read);
-    }
-
-    jbyte* byte_array_elements = (*is->env)->GetByteArrayElements(is->env, byte_array, NULL);
-    memcpy(output, byte_array_elements, size_to_copy);
-    (*is->env)->ReleaseByteArrayElements(is->env, byte_array, byte_array_elements, JNI_ABORT);
-
-    (*is->env)->DeleteLocalRef(is->env, byte_array);
-
-    is->offset += size_to_copy;
-
-    return size_to_copy;
-}
-bool inputStreamEof(void * ctx) {
-    struct input_stream_context* is = (struct input_stream_context*)ctx;
-
-    jint result = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_available);
-    return result <= 0;
-}
-void inputStreamClose(void * ctx) {
-
-}
-
-JNIEXPORT jlong JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromInputStream(
-        JNIEnv *env, jobject thiz, jobject input_stream) {
-    UNUSED(thiz);
-
-    struct whisper_context *context = NULL;
-    struct whisper_model_loader loader = {};
-    struct input_stream_context inp_ctx = {};
-
-    inp_ctx.offset = 0;
-    inp_ctx.env = env;
-    inp_ctx.thiz = thiz;
-    inp_ctx.input_stream = input_stream;
-
-    jclass cls = (*env)->GetObjectClass(env, input_stream);
-    inp_ctx.mid_available = (*env)->GetMethodID(env, cls, "available", "()I");
-    inp_ctx.mid_read = (*env)->GetMethodID(env, cls, "read", "([BII)I");
-
-    loader.context = &inp_ctx;
-    loader.read = inputStreamRead;
-    loader.eof = inputStreamEof;
-    loader.close = inputStreamClose;
-
-    loader.eof(loader.context);
-
-    context = whisper_init(&loader);
-    return (jlong) context;
-}
-
-static size_t asset_read(void *ctx, void *output, size_t read_size) {
-    return AAsset_read((AAsset *) ctx, output, read_size);
-}
-
-static bool asset_is_eof(void *ctx) {
-    return AAsset_getRemainingLength64((AAsset *) ctx) <= 0;
-}
-
-static void asset_close(void *ctx) {
-    AAsset_close((AAsset *) ctx);
-}
-
-static struct whisper_context *whisper_init_from_asset(
-        JNIEnv *env,
-        jobject assetManager,
-        const char *asset_path
-) {
-    LOGI("Loading model from asset '%s'\n", asset_path);
-    AAssetManager *asset_manager = AAssetManager_fromJava(env, assetManager);
-    AAsset *asset = AAssetManager_open(asset_manager, asset_path, AASSET_MODE_STREAMING);
-    if (!asset) {
-        LOGW("Failed to open '%s'\n", asset_path);
-        return NULL;
-    }
-
-    whisper_model_loader loader = {
-            .context = asset,
-            .read = &asset_read,
-            .eof = &asset_is_eof,
-            .close = &asset_close
-    };
-
-    return whisper_init(&loader);
-}
-
-JNIEXPORT jlong JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromAsset(
-        JNIEnv *env, jobject thiz, jobject assetManager, jstring asset_path_str) {
-    UNUSED(thiz);
-    struct whisper_context *context = NULL;
-    const char *asset_path_chars = (*env)->GetStringUTFChars(env, asset_path_str, NULL);
-    context = whisper_init_from_asset(env, assetManager, asset_path_chars);
-    (*env)->ReleaseStringUTFChars(env, asset_path_str, asset_path_chars);
-    return (jlong) context;
-}
-
 JNIEXPORT jlong JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext(
        JNIEnv *env, jobject thiz, jstring model_path_str) {
    UNUSED(thiz);
    struct whisper_context *context = NULL;
    const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL);
-    context = whisper_init_from_file(model_path_chars);
+    context = whisper_init(model_path_chars);
    (*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars);
    return (jlong) context;
 }
@ -214,30 +90,4 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
    const char *text = whisper_full_get_segment_text(context, index);
    jstring string = (*env)->NewStringUTF(env, text);
    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
-        JNIEnv *env, jobject thiz
-) {
-    UNUSED(thiz);
-    const char *sysinfo = whisper_print_system_info();
-    jstring string = (*env)->NewStringUTF(env, sysinfo);
-    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
-                                                                      jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
-                                                                          jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
-}
+}
--- a/examples/whisper.android/local.properties
+++ b/examples/whisper.android/local.properties
@ -0,0 +1,10 @@
+## This file is automatically generated by Android Studio.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file should *NOT* be checked into Version Control Systems,
+# as it contains information specific to your local configuration.
+#
+# Location of the SDK. This is only used by Gradle.
+# For customization when using a Version Control System, please read the
+# header note.
+sdk.dir=/Users/kevin/Library/Android/sdk
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
-
-In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -296,10 +296,6 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -61,7 +61,7 @@ void AudioInputCallback(void * inUserData,
        NSLog(@"Loading model from %@", modelPath);

        // create ggml context
-        stateInp.ctx = whisper_init_from_file([modelPath UTF8String]);
+        stateInp.ctx = whisper_init([modelPath UTF8String]);

        // check if the model was loaded successfully
        if (stateInp.ctx == NULL) {
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -1,18 +1,12 @@
 A sample SwiftUI app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.
 See also: [whisper.objc](https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.objc).

-**Usage**:
+To use:

 1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
-2. Add the model to `whisper.swiftui.demo/Resources/models` **via Xcode**.
+2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
-4. Add the sample audio file to `whisper.swiftui.demo/Resources/samples` **via Xcode**.
-5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
-
-**Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.
+4. Add the sample audio file to "whisper.swiftui.demo/Resources/samples" via Xcode.
+5. Select the "release" build configuration under "Run", then deploy and run to your device.

 [^1]: I recommend the tiny, base or small models for running on an iOS device.
-
-[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
-
-![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)
--- a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift
+++ b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift
@ -55,7 +55,7 @@ actor WhisperContext {
    }
    
    static func createContext(path: String) throws -> WhisperContext {
-        let context = whisper_init_from_file(path)
+        let context = whisper_init(path)
        if let context {
            return WhisperContext(context: context)
        } else {
--- a/examples/whisper.swiftui/whisper.swiftui.demo/Resources/models/.gitignore
+++ b/examples/whisper.swiftui/whisper.swiftui.demo/Resources/models/.gitignore
--- a/examples/whisper.swiftui/whisper.swiftui.demo/Resources/samples/.gitignore
+++ b/examples/whisper.swiftui/whisper.swiftui.demo/Resources/samples/.gitignore
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -35,10 +35,10 @@
 		0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; };
 		0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 		0AAC5DC629539EAF003032C3 /* WhisperCppDemo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "WhisperCppDemo-Bridging-Header.h"; sourceTree = "<group>"; };
-		0AAC5DC729539EB0003032C3 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whisper.cpp; sourceTree = "<group>"; };
-		0AAC5DC829539EB0003032C3 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whisper.h; sourceTree = "<group>"; };
-		0AAC5DC929539EB0003032C3 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ggml.c; sourceTree = "<group>"; };
-		0AAC5DCA29539EB0003032C3 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ggml.h; sourceTree = "<group>"; };
+		0AAC5DC729539EB0003032C3 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../whisper.cpp; sourceTree = "<group>"; };
+		0AAC5DC829539EB0003032C3 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../whisper.h; sourceTree = "<group>"; };
+		0AAC5DC929539EB0003032C3 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
+		0AAC5DCA29539EB0003032C3 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
 		0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
 		0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@ -129,8 +129,7 @@
 				0AAC5DC729539EB0003032C3 /* whisper.cpp */,
 				0AAC5DC829539EB0003032C3 /* whisper.h */,
 			);
-			name = whisper.cpp;
-			path = ../..;
+			path = whisper.cpp;
 			sourceTree = "<group>";
 		};
 		0AAC5DCF2953A36C003032C3 /* whisper.cpp.swift */ = {
@ -430,10 +429,6 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -8,8 +8,6 @@ add_executable(${TARGET}
    emscripten.cpp
    )

-include(DefaultTargetOptions)
-
 target_link_libraries(${TARGET} PRIVATE
    whisper
    )
@ -32,8 +30,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1500MB \
-    -s TOTAL_MEMORY=1500MB \
+    -s INITIAL_MEMORY=1024MB \
+    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/whisper.wasm/emscripten.cpp
+++ b/examples/whisper.wasm/emscripten.cpp
@ -18,7 +18,7 @@ EMSCRIPTEN_BINDINGS(whisper) {

        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init_from_file(path_model.c_str());
+                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    return i + 1;
                } else {
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -46,12 +46,10 @@

            <div id="model">
                Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
-                <button id="fetch-whisper-tiny"     onclick="loadWhisper('tiny')">tiny (75 MB)</button>
-                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <button id="fetch-whisper-base"     onclick="loadWhisper('base')">base (142 MB)</button>
-                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
-                <button id="fetch-whisper-small"    onclick="loadWhisper('small')">small (466 MB)</button>
+                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+                <button id="fetch-whisper-tiny"    onclick="loadWhisper('tiny')">tiny (75 MB)</button>
+                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <button id="fetch-whisper-base"    onclick="loadWhisper('base')">base (142 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
@ -62,8 +60,8 @@
            <!-- radio button to select between file upload or microphone -->
            <div id="input">
                Input:
-                <input type="radio" id="file" name="input" value="file" checked="checked" onchange="changeInput('file')" /> <label for="file">File</label>
-                <input type="radio" id="mic" name="input" value="mic" onchange="changeInput('mic')" /> <label for="mic">Microphone</label>
+                <input type="radio" id="file" name="input" value="file" checked="checked" onchange="changeInput('file')" /> File
+                <input type="radio" id="mic" name="input" value="mic" onchange="changeInput('mic')" /> Microphone
            </div>

            <br>
@ -286,33 +284,27 @@
                }
                reader.readAsArrayBuffer(file);

-                document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-small-en').style.display = 'none';
-                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
-                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
-                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
-                document.getElementById('whisper-file'          ).style.display = 'none';
-                document.getElementById('model-whisper-status'  ).innerHTML = 'loaded model: ' + file.name;
+                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
+                document.getElementById('whisper-file'         ).style.display = 'none';
+                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
            }

            function loadWhisper(model) {
                let urls = {
-                    'tiny.en':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
-                    'tiny':     'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
-                    'base.en':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
-                    'base':     'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
-                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
-                    'small':    'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
+                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+                    'tiny':    'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
+                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'base':    'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
                };

                let sizes = {
-                    'tiny.en':  75,
-                    'tiny':     75,
-                    'base.en':  142,
-                    'base':     142,
-                    'small.en': 466,
-                    'small':    466,
+                    'tiny.en': 75,
+                    'tiny':    75,
+                    'base.en': 142,
+                    'base':    142,
                };

                let url     = urls[model];
@ -321,14 +313,12 @@

                model_whisper = model;

-                document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-small-en').style.display = 'none';
-                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
-                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
-                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
-                document.getElementById('whisper-file'          ).style.display = 'none';
-                document.getElementById('model-whisper-status'  ).innerHTML = 'loading model: ' + model;
+                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
+                document.getElementById('whisper-file'         ).style.display = 'none';
+                document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;

                cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
@ -337,14 +327,12 @@

                cbCancel = function() {
                    var el;
-                    el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-tiny'    ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base'    ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-small'   ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('whisper-file'          ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-whisper-status'  ); if (el) el.innerHTML = '';
+                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-tiny'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('whisper-file'         ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
                };

                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
--- a/Show More
+++ b/Show More