ggml : aligned malloc -> malloc

ggml : allocate contexts on the heap (v2)
whisper : reduce ggml_context usage
2025-06-24 17:15:19 +00:00 · 2024-10-31 21:40:11 +02:00 · 2024-10-31 21:29:48 +02:00 · 2024-10-30 13:39:14 +02:00 · 2024-10-29 19:37:24 +02:00 · 2024-10-29 19:30:26 +02:00
543 changed files with 106776 additions and 77093 deletions
--- a/.devops/cublas.Dockerfile
+++ b/.devops/cublas.Dockerfile
@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential git cmake
+    apt-get install -y build-essential git cmake libsdl2-dev

 WORKDIR /app

@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
-ENV WHISPER_CUBLAS=1
+ENV GGML_CUDA=1

 RUN make

--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -14,10 +14,10 @@ ARG CUDA_DOCKER_ARCH=all
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
-ENV WHISPER_CUBLAS=1
+ENV GGML_CUDA=1

 RUN apt-get update && \
-    apt-get install -y build-essential \
+    apt-get install -y build-essential libsdl2-dev \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 # Ref: https://stackoverflow.com/a/53464012
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -12,7 +12,7 @@ FROM ubuntu:22.04 AS runtime
 WORKDIR /app

 RUN apt-get update && \
-  apt-get install -y curl ffmpeg \
+  apt-get install -y curl ffmpeg libsdl2-dev \
  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 COPY --from=build /app /app
--- a/.github/workflows/bindings-go.yml
+++ b/.github/workflows/bindings-go.yml
@ -13,10 +13,10 @@ jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/setup-go@v3
+      - uses: actions/setup-go@v5
        with:
-          go-version: '^1.19'
-      - uses: actions/checkout@v1
+          go-version: '^1.23'
+      - uses: actions/checkout@v4
      - run: |
          cd bindings/go
          make test
--- a/.github/workflows/bindings-ruby.yml
+++ b/.github/workflows/bindings-ruby.yml
@ -3,20 +3,73 @@ on:
  push:
    paths:
      - bindings/ruby/**
-      - whisper.h
+      - src/whisper.cpp
+      - include/whisper.h
+      - ggml/src/ggml.c
+      - ggml/src/ggml-impl.h
+      - ggml/src/ggml-aarch64.h
+      - ggml/src/ggml-aarch64.c
+      - ggml/src/ggml-alloc.c
+      - ggml/src/ggml-backend-impl.h
+      - ggml/src/ggml-backend.cpp
+      - ggml/src/ggml-common.h
+      - ggml/src/ggml-quants.h
+      - ggml/src/ggml-quants.c
+      - ggml/src/ggml-cpu-impl.h
+      - ggml/src/ggml-metal.m
+      - ggml/src/ggml-metal.metal
+      - ggml/src/ggml-blas.cpp
+      - ggml/include/ggml.h
+      - ggml/include/ggml-alloc.h
+      - ggml/include/ggml-backend.h
+      - ggml/include/ggml-cuda.h
+      - ggml/include/ggml-kompute.h
+      - ggml/include/ggml-metal.h
+      - ggml/include/ggml-sycl.h
+      - ggml/include/ggml-vulkan.h
+      - ggml/include/ggml-blas.h
+      - scripts/get-flags.mk
+      - examples/dr_wav.h
  pull_request:
    paths:
      - bindings/ruby/**
-      - whisper.h
+      - src/whisper.cpp
+      - include/whisper.h
+      - ggml/src/ggml.c
+      - ggml/src/ggml-impl.h
+      - ggml/src/ggml-aarch64.h
+      - ggml/src/ggml-aarch64.c
+      - ggml/src/ggml-alloc.c
+      - ggml/src/ggml-backend-impl.h
+      - ggml/src/ggml-backend.cpp
+      - ggml/src/ggml-common.h
+      - ggml/src/ggml-quants.h
+      - ggml/src/ggml-quants.c
+      - ggml/src/ggml-cpu-impl.h
+      - ggml/src/ggml-metal.m
+      - ggml/src/ggml-metal.metal
+      - ggml/src/ggml-blas.cpp
+      - ggml/include/ggml.h
+      - ggml/include/ggml-alloc.h
+      - ggml/include/ggml-backend.h
+      - ggml/include/ggml-cuda.h
+      - ggml/include/ggml-kompute.h
+      - ggml/include/ggml-metal.h
+      - ggml/include/ggml-sycl.h
+      - ggml/include/ggml-vulkan.h
+      - ggml/include/ggml-blas.h
+      - scripts/get-flags.mk
+      - examples/dr_wav.h

 jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: bindings/ruby
    steps:
      - uses: ruby/setup-ruby@v1
        with:
          ruby-version: '3.0'
-      - uses: actions/checkout@v1
-      - run: |
-          cd bindings/ruby/ext
-          ruby extconf.rb && make
+      - uses: actions/checkout@v4
+      - run: rake test
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -59,7 +59,7 @@ jobs:
        uses: cross-platform-actions/action@v0.24.0
        with:
          operating_system: freebsd
-          version: '13.2'
+          version: '13.3'
          run: |
            sudo pkg update
            sudo pkg install -y gmake sdl2
@ -101,7 +101,10 @@ jobs:
      fail-fast: false
      matrix:
        build: [Debug, Release]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
+        #arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
+        # TODO: arm/v7 disabled due to clang bug
+        #       https://github.com/ggerganov/whisper.cpp/actions/runs/9657764109/job/26637633042?pr=2256#step:4:1990
+        arch: [linux/amd64, linux/arm64, linux/ppc64le]

    steps:
      - name: Clone
@ -197,7 +200,7 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
@ -247,7 +250,7 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake -DGGML_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)

  windows-msys2:
@ -289,7 +292,7 @@ jobs:
      - name: Build using make w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            make WHISPER_OPENBLAS=1 -j $(nproc)
+            make GGML_OPENBLAS=1 -j $(nproc)

      - name: Build using CMake
        shell: msys2 {0}
@ -305,7 +308,7 @@ jobs:
      - name: Build using CMake w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            cmake -B build -DWHISPER_OPENBLAS=ON
+            cmake -B build -DGGML_OPENBLAS=ON
            cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows:
@ -381,12 +384,9 @@ jobs:
          - arch: Win32
            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
            s2arc: x86
-            clblast: OFF
          - arch: x64
            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
            s2arc: x64
-            clblast: ON
-            clver: 1.6.1
          - sdl2: ON
            s2ver: 2.28.5

@ -413,26 +413,13 @@ jobs:
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

-      - name: Install OpenCL
-        if: matrix.clblast == 'ON'
-        run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
-
-      - name: Fetch CLBlast and set CLBlast_DIR
-        if: matrix.clblast == 'ON'
-        run: |
-          C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
-          7z x clblast.zip
-          7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
-          echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
-
      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_OPENBLAS=${{ matrix.blas }}
+          -DGGML_OPENBLAS=${{ matrix.blas }}
          -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
          -DWHISPER_SDL2=${{ matrix.sdl2 }}
-          -DWHISPER_CLBLAST=${{ matrix.clblast }}

      - name: Build
        run: |
@ -447,19 +434,15 @@ jobs:
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

-      - name: Copy clblast.dll
-        if: matrix.clblast == 'ON'
-        run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
-
      - name: Upload binaries
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v4
        with:
-          name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
+          name: whisper-blas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-cublas:
-    runs-on: windows-latest
+    runs-on: windows-2019

    strategy:
      matrix:
@ -498,7 +481,7 @@ jobs:
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_CUDA=${{ matrix.cublas }}
+          -DGGML_CUDA=${{ matrix.cublas }}
          -DWHISPER_SDL2=${{ matrix.sdl2 }}

      - name: Build ${{ matrix.cuda-toolkit }}
@ -603,73 +586,75 @@ jobs:
          cd whisper/examples/whisper.android
          ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML

-  android_java:
-    runs-on: ubuntu-latest
+# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
+#  android_java:
+#    runs-on: ubuntu-latest
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: set up JDK 11
+#        uses: actions/setup-java@v4
+#        with:
+#          java-version: '11'
+#          distribution: 'temurin'
+#          cache: gradle
+#
+#      - name: Setup Android SDK
+#        uses: android-actions/setup-android@v3
+#        with:
+#          cmdline-tools-version: 9.0
+#
+#      - name: Build
+#        run: |
+#          cd examples/whisper.android.java
+#          chmod +x ./gradlew
+#          ./gradlew assembleRelease

-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: set up JDK 11
-        uses: actions/setup-java@v4
-        with:
-          java-version: '11'
-          distribution: 'temurin'
-          cache: gradle
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          cmdline-tools-version: 9.0
-
-      - name: Build
-        run: |
-          cd examples/whisper.android.java
-          chmod +x ./gradlew
-          ./gradlew assembleRelease
-
-  java:
-    needs: [ 'windows' ]
-    runs-on: windows-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Java
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: 20
-
-      - name: Download Windows lib
-        uses: actions/download-artifact@v4
-        with:
-          name: win32-x86-64_whisper.dll
-          path: bindings/java/build/generated/resources/main/win32-x86-64
-
-      - name: Build
-        run: |
-          models\download-ggml-model.cmd tiny.en
-          cd bindings/java
-          chmod +x ./gradlew
-          ./gradlew build
-
-      - name: Upload jar
-        uses: actions/upload-artifact@v4
-        with:
-          name: whispercpp.jar
-          path: bindings/java/build/libs/whispercpp-*.jar
-
-      - name: Publish package
-        if: ${{ github.ref == 'refs/heads/master' }}
-        uses: gradle/gradle-build-action@v2.4.2
-        with:
-          arguments: publish
-          build-root-directory: bindings/java
-        env:
-          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
-          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
-          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
-          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+# TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
+#  java:
+#    needs: [ 'windows' ]
+#    runs-on: windows-latest
+#    steps:
+#      - uses: actions/checkout@v4
+#
+#      - name: Install Java
+#        uses: actions/setup-java@v4
+#        with:
+#          distribution: zulu
+#          java-version: 20
+#
+#      - name: Download Windows lib
+#        uses: actions/download-artifact@v4
+#        with:
+#          name: win32-x86-64_whisper.dll
+#          path: bindings/java/build/generated/resources/main/win32-x86-64
+#
+#      - name: Build
+#        run: |
+#          models\download-ggml-model.cmd tiny.en
+#          cd bindings/java
+#          chmod +x ./gradlew
+#          ./gradlew build
+#
+#      - name: Upload jar
+#        uses: actions/upload-artifact@v4
+#        with:
+#          name: whispercpp.jar
+#          path: bindings/java/build/libs/whispercpp-*.jar
+#
+#      - name: Publish package
+#        if: ${{ github.ref == 'refs/heads/master' }}
+#        uses: gradle/gradle-build-action@v2.4.2
+#        with:
+#          arguments: publish
+#          build-root-directory: bindings/java
+#        env:
+#          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
+#          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
+#          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
+#          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}

  quantize:
    runs-on: ubuntu-latest
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -18,7 +18,9 @@ jobs:
      matrix:
        config:
          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
-          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+          #TODO: the cuda image keeps failing - disable for now
+          #      https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
+          #- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }

    steps:
      - name: Check out the repo
--- a/.gitignore
+++ b/.gitignore
@ -3,23 +3,16 @@
 .cache/
 .coreml/
 .test/
+.venv/
 .vs/
 .vscode/
 .DS_Store
 .vimspector.json
 /CMakeSettings.json
+/talk-llama.dSYM/

 build/
-build-coreml/
-build-em/
-build-debug/
-build-release/
-build-rwdi/
-build-static/
-build-cublas/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build-*/

 # SPM
 .build/
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +0,0 @@
-[submodule "bindings/ios"]
-	path = bindings/ios
-	url = https://github.com/ggerganov/whisper.spm
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,25 +1,31 @@
-cmake_minimum_required (VERSION 3.5)
+cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
+project("whisper.cpp" C CXX)
+project("whisper.cpp" VERSION 1.7.1)
+include(CheckIncludeFileCXX)

-# Allow for the creation of solution folders.
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-
-project(whisper.cpp VERSION 1.6.2)
 set(SOVERSION 1)

+#set(CMAKE_WARN_DEPRECATED YES)
+set(CMAKE_WARN_UNUSED_CLI YES)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(WHISPER_STANDALONE ON)
-    include(GitVars)
-    include(BuildTypes)
+
+    include(git-vars)

    # configure project version
-    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
-        configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
-    endif()
    configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
 else()
    set(WHISPER_STANDALONE OFF)
@ -29,6 +35,11 @@ if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

    option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
+
+    # TODO: without these, we get the following error:
+    #       wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -37,756 +48,136 @@ else()
    endif()
 endif()

-# options
+option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})

-if (APPLE)
-    set(WHISPER_METAL_DEFAULT ON)
-else()
-    set(WHISPER_METAL_DEFAULT OFF)
-endif()
+#
+# option list
+#

-option(BUILD_SHARED_LIBS              "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
+# general
+option(WHISPER_CCACHE "whisper: use ccache if available" ON)

+# debug
 option(WHISPER_ALL_WARNINGS           "whisper: enable all compiler warnings"                   ON)
 option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)

-option(WHISPER_SANITIZE_THREAD        "whisper: enable thread sanitizer"    OFF)
-option(WHISPER_SANITIZE_ADDRESS       "whisper: enable address sanitizer"   OFF)
-option(WHISPER_SANITIZE_UNDEFINED     "whisper: enable undefined sanitizer" OFF)
-
-option(WHISPER_BUILD_TESTS            "whisper: build tests"    ${WHISPER_STANDALONE})
-option(WHISPER_BUILD_EXAMPLES         "whisper: build examples" ${WHISPER_STANDALONE})
-
-option(WHISPER_SDL2                   "whisper: support for libSDL2" OFF)
-
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    option(WHISPER_FFMPEG                 "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
-endif()
-
-option(WHISPER_NO_AVX                 "whisper: disable AVX"         OFF)
-option(WHISPER_NO_AVX2                "whisper: disable AVX2"        OFF)
-option(WHISPER_NO_AVX512              "whisper: disable AVX512"      ON)
-option(WHISPER_NO_AVX512_VBMI         "whisper: disable AVX512-VBMI" ON)
-option(WHISPER_NO_AVX512_VNNI         "whisper: disable AVX512-VNNI" ON)
-option(WHISPER_NO_FMA                 "whisper: disable FMA"         OFF)
-option(WHISPER_NO_F16C                "whisper: disable F16c"        OFF)
-
-option(WHISPER_OPENVINO               "whisper: support for OpenVINO" OFF)
-
-if (APPLE)
-    option(WHISPER_NO_ACCELERATE         "whisper: disable Accelerate framework" OFF)
-    option(WHISPER_METAL                 "whisper: use Metal"                    ${WHISPER_METAL_DEFAULT})
-    option(WHISPER_METAL_NDEBUG          "whisper: disable Metal debugging"      OFF)
-    option(WHISPER_COREML                "whisper: enable Core ML framework"     OFF)
-    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback"    OFF)
-    option(WHISPER_METAL_EMBED_LIBRARY   "whisper: embed Metal library"          OFF)
-else()
-    option(WHISPER_BLAS                  "whisper: use BLAS libraries"                        OFF)
-    option(WHISPER_BLAS_VENDOR           "whisper: BLAS library vendor"                       Generic)
-    option(WHISPER_OPENBLAS              "whisper: prefer OpenBLAS"                           OFF)
-    option(WHISPER_OPENBLAS_INTERFACE64  "whisper: use OpenBLAS w/ 64-bit interface"          OFF)
-    option(WHISPER_CUDA                  "whisper: support for CUDA"                          OFF)
-    option(WHISPER_CUBLAS                "whisper: support for CUDA (deprecated)"             OFF)
-    option(WHISPER_HIPBLAS               "whisper: support for hipBLAS"                       OFF)
-    option(WHISPER_CLBLAST               "whisper: use CLBlast"                               OFF)
-    option(WHISPER_MKL                   "whisper: use Intel Math Kernel Library (MKL)"       OFF)
-    option(WHISPER_SYCL                  "whisper: use SYCL"                                  OFF)
-    option(WHISPER_SYCL_F16              "whisper: use 16 bit floats for sycl calculations"   OFF)
-endif()
-
-option(WHISPER_PERF "whisper: enable perf timings" OFF)
+# build
+option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)

 # sanitizers
+option(WHISPER_SANITIZE_THREAD    "whisper: enable thread sanitizer"    OFF)
+option(WHISPER_SANITIZE_ADDRESS   "whisper: enable address sanitizer"   OFF)
+option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)

-if (NOT MSVC)
-    if (WHISPER_SANITIZE_THREAD)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
-    endif()
+# extra artifacts
+option(WHISPER_BUILD_TESTS    "whisper: build tests"          ${WHISPER_STANDALONE})
+option(WHISPER_BUILD_EXAMPLES "whisper: build examples"       ${WHISPER_STANDALONE})
+option(WHISPER_BUILD_SERVER   "whisper: build server example" ${WHISPER_STANDALONE})

-    if (WHISPER_SANITIZE_ADDRESS)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}     -fsanitize=address -fno-omit-frame-pointer")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-    endif()
+# 3rd party libs
+option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
+option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)

-    if (WHISPER_SANITIZE_UNDEFINED)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}     -fsanitize=undefined")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-    endif()
-endif()
-
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-
-# dependencies
-
-find_package(Threads REQUIRED)
-
-#compile flag sycl
-if (WHISPER_SYCL)
-    set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
-endif()
-
-if (WHISPER_FFMPEG)
-    # As of cmake 3.27, there is no official cmake support for FindFFmpeg.
-    # Consequnelty we added a FindFFmpeg.cmake script the cmake subfolder:
-    # whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE
-    # libswresample  performs highly optimized audio resampling, rematrixing and sample format conversion operations
-    # libavcodec provides a generic encoding/decoding framework and contains multiple decoders and encoders for audio, video and subtitle streams, and several bitstream filters.
-    # libavformat provides a generic framework for multiplexing and demultiplexing (muxing and demuxing) audio, video and subtitle streams.
-    find_package(FFmpeg REQUIRED)
-    if (NOT ${FFMPEG_FOUND})
-        message(FATAL_ERROR "Cannot find ffmpeg libs/headers")
-    endif()
-    message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
-    message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
-    message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
-    message(STATUS "Found avformat ${AVFORMAT_VERSION}")
-    include_directories(${FFMPEG_INCLUDE_DIRS})
-    add_compile_definitions(WHISPER_FFMPEG)
-    set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS} ${FFMPEG_LIBRARIES})
-endif()
-
-# on APPLE
-if (APPLE)
-    # include Accelerate framework
-    if (NOT WHISPER_NO_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
-
-        if (ACCELERATE_FRAMEWORK)
-            message(STATUS "Accelerate framework found")
-
-            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64)
-        else()
-            message(FATAL_ERROR "Accelerate framework not found")
-        endif()
-    endif()
-
-    if (WHISPER_METAL)
-        find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
-        find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
-        find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-
-        if (METAL_FRAMEWORK)
-            message(STATUS "Metal framework found")
-
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
-                ${FOUNDATION_LIBRARY}
-                ${METAL_FRAMEWORK}
-                ${METALKIT_FRAMEWORK}
-                )
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_METAL)
-
-            if (WHISPER_METAL_NDEBUG)
-                set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
-            endif()
-        else()
-            message(FATAL_ERROR "Metal framework not found")
-        endif()
-
-        set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
-
-        # copy ggml-common.h and ggml-metal.metal to bin directory
-        configure_file(ggml-common.h    bin/ggml-common.h    COPYONLY)
-        configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-        if (WHISPER_METAL_EMBED_LIBRARY)
-            enable_language(ASM)
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_EMBED_LIBRARY)
-
-            set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-            set(COMMON_HEADER   "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
-
-            file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-            set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
-            set(EMBED_METALLIB_SOURCE "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-combined.metal")
-
-            add_custom_command(
-                OUTPUT ${EMBED_METALLIB_SOURCE}
-                COMMAND sed -e "/^#include \\\"ggml-common.h\\\"/r ${COMMON_HEADER}" -e "/^#include \\\"ggml-common.h\\\"/d" ${METALLIB_SOURCE} > ${EMBED_METALLIB_SOURCE}
-                DEPENDS ${METALLIB_SOURCE} ${COMMON_HEADER}
-                COMMENT "Generating combined Metal library for embedding"
-            )
-
-            add_custom_command(
-                OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".incbin \\\"${EMBED_METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-                DEPENDS ${EMBED_METALLIB_SOURCE}
-                COMMENT "Generate assembly for embedded Metal library"
-            )
-
-            set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-        endif()
-    endif()
-
-    if (WHISPER_COREML)
-        find_library(FOUNDATION_FRAMEWORK Foundation)
-        find_library(COREML_FRAMEWORK CoreML)
-
-        if (COREML_FRAMEWORK)
-            message(STATUS "CoreML framework found")
-
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
-        else()
-            message(FATAL_ERROR "CoreML framework not found")
-        endif()
-
-        if (WHISPER_COREML_ALLOW_FALLBACK)
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_COREML_ALLOW_FALLBACK)
-        endif()
-    endif()
-endif()
-
-if (WHISPER_OPENBLAS)
-    set(WHISPER_BLAS_VENDOR "OpenBLAS")
-    set(WHISPER_BLAS ON)
-    # BLA_PKGCONFIG_BLAS is supported since CMake 3.25.
-    # FindBLAS.cmake pkg-config logic seems incomplete, because when
-    # BLA_SIZEOF_INTEGER is 8, then it should search for blas64 instead of blas.
-    # blas.pc/blas64.pc are not always provided, so let's be more specific
-    # and go with openblas.pc/openblas64.pc if WHISPER_OPENBLAS is on.
-    if (WHISPER_OPENBLAS_INTERFACE64)
-        set(WHISPER_BLAS_LIB "openblas64")
-    else ()
-        set(WHISPER_BLAS_LIB "openblas")
-    endif ()
-    set(BLA_PKGCONFIG_BLAS ${WHISPER_BLAS_LIB})
-    # OpenBLAS prebuilt libraries for Windows do not have "64" suffix in filename.
-    # (But .pc file has "64" suffix in filename for USE_64BITINT=1 Windows build.)
-    if (MSVC)
-        set(WHISPER_BLAS_LIB "openblas")
-    endif ()
-endif()
-
-if (WHISPER_BLAS)
-    if (NOT "$ENV{OPENBLAS_PATH}" STREQUAL "")
-        if (WHISPER_STATIC)
-            set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
-            set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
-        else ()
-            if (CMAKE_IMPORT_LIBRARY_SUFFIX)
-                set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_IMPORT_LIBRARY_PREFIX})
-                set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_IMPORT_LIBRARY_SUFFIX})
-            else ()
-                set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
-                set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
-            endif ()
-        endif ()
-        # OpenBLAS prebuilt libraries hardcode "lib" prefix in filename even on Windows
-        if (WHISPER_OPENBLAS)
-            set(WHISPER_BLAS_LIB_PREFIX "lib")
-        endif ()
-        message(STATUS "BLAS compatible library path provided")
-        set(BLAS_LIBRARIES "$ENV{OPENBLAS_PATH}/lib/${WHISPER_BLAS_LIB_PREFIX}${WHISPER_BLAS_LIB}${WHISPER_BLAS_LIB_SUFFIX}")
-        message(STATUS "Libraries ${BLAS_LIBRARIES}")
-        set(BLAS_INCLUDE_DIRS "$ENV{OPENBLAS_PATH}/include")
-        message(STATUS "Include dirs ${BLAS_INCLUDE_DIRS}")
-        if (NOT EXISTS "${BLAS_LIBRARIES}")
-            message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH misdefined.")
-        endif ()
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-        include_directories(${BLAS_INCLUDE_DIRS})
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
-    else ()
-        if (WHISPER_STATIC)
-            # FindBLAS.cmake pkg-config logic seems incomplete, because when
-            # BLA_STATIC is on, then it should use pkg_check_modules_static
-            # instead of pkg_check_modules.
-            # Some manual variable overriding may be necessary if you don't
-            # achieve desired results.
-            set(BLA_STATIC 1)
-        endif ()
-        set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
-        if (WHISPER_OPENBLAS_INTERFACE64)
-            set(BLA_SIZEOF_INTEGER 8)
-        else ()
-            set(BLA_SIZEOF_INTEGER 4)
-        endif()
-        set(BLA_PREFER_PKGCONFIG 1)
-        find_package(BLAS)
-
-        if(BLAS_FOUND)
-            message(STATUS "BLAS compatible library found")
-            message(STATUS "Libraries ${BLAS_LIBRARIES}")
-            if (NOT DEFINED BLAS_INCLUDE_DIRS)
-                if (PKGC_BLAS_FOUND)
-                    set(BLAS_INCLUDE_DIRS "${PKGC_BLAS_INCLUDE_DIRS}")
-                else ()
-                    find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas)
-                endif()
-            endif()
-            message(STATUS "Include dirs ${BLAS_INCLUDE_DIRS}")
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-            include_directories(${BLAS_INCLUDE_DIRS})
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        else()
-            message(FATAL_ERROR "BLAS library was not found")
-        endif()
-    endif ()
-endif ()
-
-if (WHISPER_MKL)
-    find_package(MKL CONFIG REQUIRED PATHS $ENV{MKLROOT})
-    message(STATUS "Imported oneMKL targets: ${MKL_IMPORTED_TARGETS}")
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_BLAS_USE_MKL)
-endif()
-
-if (WHISPER_CUBLAS)
-    message(WARNING "WHISPER_CUBLAS is deprecated and will be removed in the future.\nUse WHISPER_CUDA instead")
-    set(WHISPER_CUDA ON)
-endif()
-
-if (WHISPER_CUDA)
-    cmake_minimum_required(VERSION 3.17)
-
-    find_package(CUDAToolkit)
-
-    if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
-
-        enable_language(CUDA)
-
-        file(GLOB   GGML_SOURCES_CUDA "ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_CUDA  ggml-cuda.h)
-        list(APPEND GGML_SOURCES_CUDA  ggml-cuda.cu)
-
-        add_compile_definitions(GGML_USE_CUDA)
-
-        if (WHISPER_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt CUDA::cufft)
-            else ()
-                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static CUDA::cufft_static)
-            endif()
-        else()
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cufft)
-        endif()
-
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
-    else()
-        message(FATAL_ERROR "cuBLAS not found")
-    endif()
-endif()
-
-
-if (WHISPER_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
-
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
-
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        set(GGML_HEADERS_ROCM "ggml-cuda.h")
-
-        file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
-
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        if (WHISPER_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-    else()
-        message(FATAL_ERROR "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
-    endif()
-endif()
-
-if (WHISPER_CLBLAST)
-    find_package(CLBlast)
-    if (CLBlast_FOUND)
-        message(STATUS "CLBlast found")
-
-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
-
-        add_compile_definitions(GGML_USE_CLBLAST)
-
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
-    else()
-        message(FATAL_ERROR "CLBlast not found")
-    endif()
-endif()
-
-if( WHISPER_OPENVINO )
-    find_package(OpenVINO REQUIRED COMPONENTS Runtime)
-endif()
-
-if (WHISPER_SYCL)
-    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
-    endif()
-    #todo: AOT
-
-    find_package(IntelSYCL REQUIRED)
-    if (WHISPER_SYCL_F16)
-        add_compile_definitions(GGML_SYCL_F16)
-    endif()
-    add_compile_definitions(GGML_USE_SYCL)
-
-    add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})
-
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
-
-    set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
-
-    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-endif()
-# compiler flags
-
-if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
-endif ()
-
-if (WHISPER_ALL_WARNINGS)
-    if (NOT MSVC)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-            -Wall                           \
-            -Wextra                         \
-            -Wpedantic                      \
-            -Wshadow                        \
-            -Wcast-qual                     \
-            -Wstrict-prototypes             \
-            -Wpointer-arith                 \
-            -Wno-unused-function            \
-        ")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-            -Wall                           \
-            -Wextra                         \
-            -Wpedantic                      \
-            -Wcast-qual                     \
-        ")
-    else()
-        # todo : msvc
-    endif()
-endif()
-
-if (NOT MSVC)
-    # TODO: temporary disabled until we figure out ggml-metal.m
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
-endif()
-
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-    message(STATUS "ARM detected")
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-    message(STATUS "PowerPC detected")
-else()
-    message(STATUS "x86 detected")
-    if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
-        if(NOT WHISPER_NO_AVX512)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX512")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512")
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (NOT WHISPER_NO_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (NOT WHISPER_NO_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-        elseif(NOT WHISPER_NO_AVX2)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-        elseif(NOT WHISPER_NO_AVX)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
-        endif()
-    else()
-        if (EMSCRIPTEN)
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
-        else()
-            if(NOT WHISPER_NO_AVX)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-            endif()
-            if(NOT WHISPER_NO_AVX2)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-            endif()
-            if(NOT WHISPER_NO_AVX512)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw")
-                if(NOT WHISPER_NO_AVX512_VBMI)
-                    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vbmi")
-                endif()
-                if(NOT WHISPER_NO_AVX512_VNNI)
-                    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vnni")
-                endif()
-            endif()
-            if(NOT WHISPER_NO_FMA)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-            endif()
-            if(NOT WHISPER_NO_F16C)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-            endif()
-        endif()
-    endif()
-endif()
-
-#
-# POSIX conformance
-#
-
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
-    add_compile_definitions(_XOPEN_SOURCE=700)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity
-# are available on Linux through GNU extensions in libc
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions(_GNU_SOURCE)
+    option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
 endif()

-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
+option(WHISPER_COREML                "whisper: enable Core ML framework"  OFF)
+option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
+option(WHISPER_OPENVINO              "whisper: support for OpenVINO"      OFF)

-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
+# Required for relocatable CMake package
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

-if (WHISPER_PERF)
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
-endif()
+# override ggml options
+set(GGML_CCACHE             ${WHISPER_CCACHE})
+set(GGML_SANITIZE_THREAD    ${WHISPER_SANITIZE_THREAD})
+set(GGML_SANITIZE_ADDRESS   ${WHISPER_SANITIZE_ADDRESS})
+set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
+set(GGML_ALL_WARNINGS       ${WHISPER_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS     ${WHISPER_FATAL_WARNINGS})

-#
-# whisper.coreml - Core ML support
-#
-
-if (WHISPER_COREML)
-    set(TARGET whisper.coreml)
-
-    add_library(${TARGET}
-        coreml/whisper-encoder.h
-        coreml/whisper-encoder.mm
-        coreml/whisper-encoder-impl.h
-        coreml/whisper-encoder-impl.m
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
-
-    set_target_properties(${TARGET} PROPERTIES
-        COMPILE_FLAGS "-fobjc-arc"
-        )
-    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
-endif()
-
-if (WHISPER_OPENVINO)
-    set(TARGET whisper.openvino)
-
-    add_library(${TARGET} OBJECT
-        openvino/whisper-openvino-encoder.h
-        openvino/whisper-openvino-encoder.cpp
-        )
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO)
-
-    target_link_libraries(${TARGET} PRIVATE openvino::runtime)
-    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
-endif()
-
-#
-# whisper - this is the main library of the project
-#
-
-set(TARGET whisper)
-
-add_library(${TARGET}
-    ggml.h
-    ggml.c
-    ggml-alloc.h
-    ggml-alloc.c
-    ggml-backend.h
-    ggml-backend.c
-    ggml-quants.h
-    ggml-quants.c
-    ${GGML_SOURCES_METAL}
-    ${GGML_SOURCES_CUDA}
-    ${GGML_SOURCES_OPENCL}
-    ${GGML_SOURCES_SYCL}        ${GGML_HEADERS_SYCL}
-    ${GGML_SOURCES_ROCM}        ${GGML_HEADERS_ROCM}
-    whisper.h
-    whisper.cpp
-    )
-
-if (WHISPER_CUDA)
-    target_sources(${TARGET} PRIVATE whisper-mel-cuda.cu)
-endif()
-
-include_directories (
-    .
-)
-# Set the version numbers
-set_target_properties(whisper PROPERTIES
-    VERSION ${PROJECT_VERSION}
-    SOVERSION ${SOVERSION}
-)
-
-include(DefaultTargetOptions)
-
-target_include_directories(${TARGET} PUBLIC
-    .
-    )
-
-if (WHISPER_COREML)
-    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
-endif()
-
-if (WHISPER_OPENVINO)
-    target_link_libraries(${TARGET} PRIVATE whisper.openvino)
-endif()
-
-if (WHISPER_MKL)
-    target_link_libraries(${TARGET} PUBLIC MKL::MKL)
-endif()
-
-if (MSVC)
-    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
-else()
-    target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-endif()
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(${TARGET} PUBLIC
-        ${CMAKE_DL_LIBS}
-        )
-
-    target_compile_definitions(${TARGET} PUBLIC
-        WHISPER_SHARED
-        GGML_SHARED
-        )
-
-    target_compile_definitions(${TARGET} PRIVATE
-        WHISPER_BUILD
-        GGML_BUILD
-        )
-
-    if (WHISPER_METAL)
-        # TODO: I think this should make ggml-metal.m "see" the ggml-metal.metal file from the "bin" directory
-        #       but for some reason it does not work here like it does in llama.cpp
-        set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+# transition helpers
+function (whisper_option_depr TYPE OLD NEW)
+    if (${OLD})
+        message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
+        set(${NEW} ON)
    endif()
+endfunction()
+
+whisper_option_depr(FATAL_ERROR WHISPER_CUBLAS              GGML_CUDA)
+whisper_option_depr(WARNING     WHISPER_CUDA                GGML_CUDA)
+whisper_option_depr(WARNING     WHISPER_KOMPUTE             GGML_KOMPUTE)
+whisper_option_depr(WARNING     WHISPER_METAL               GGML_METAL)
+whisper_option_depr(WARNING     WHISPER_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
+whisper_option_depr(WARNING     WHISPER_NATIVE              GGML_NATIVE)
+whisper_option_depr(WARNING     WHISPER_OPENMP              GGML_OPENMP)
+whisper_option_depr(WARNING     WHISPER_RPC                 GGML_RPC)
+whisper_option_depr(WARNING     WHISPER_SYCL                GGML_SYCL)
+whisper_option_depr(WARNING     WHISPER_SYCL_F16            GGML_SYCL_F16)
+
+#
+# build the library
+#
+
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
+add_subdirectory(src)

-if (GGML_SOURCES_CUDA)
-    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    # Only configure gmml CUDA architectures is not globally set
-    if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
-        # Not overriden by user, so set defaults
-        set(GGML_CUDA_ARCHITECTURES 52 61 70)
-    endif()
-    message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
-    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
-    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-endif()
-
-if (EMSCRIPTEN)
-    set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
-endif()
-
-target_compile_definitions(${TARGET} PUBLIC
-    ${WHISPER_EXTRA_FLAGS}
-    )
-
-set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "ggml.h;whisper.h")
-set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
+#
+# install
+#

 include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)

-install(TARGETS ${TARGET}
-    LIBRARY  DESTINATION lib
-    ARCHIVE  DESTINATION lib/static
-    RUNTIME  DESTINATION bin
-    RESOURCE DESTINATION bin
-    PUBLIC_HEADER DESTINATION include
-    )
+set(WHISPER_BUILD_NUMBER        ${BUILD_NUMBER})
+set(WHISPER_BUILD_COMMIT        ${BUILD_COMMIT})
+set(WHISPER_INSTALL_VERSION     ${CMAKE_PROJECT_VERSION})

-#
-# bindings
-#
+set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
+set(WHISPER_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
+set(WHISPER_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")

-add_subdirectory(bindings)
+get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
+
+set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
+install(TARGETS whisper LIBRARY PUBLIC_HEADER)
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper
+    PATH_VARS
+    WHISPER_INCLUDE_INSTALL_DIR
+    WHISPER_LIB_INSTALL_DIR
+    WHISPER_BIN_INSTALL_DIR )
+
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
+    VERSION ${WHISPER_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
+
+configure_file(cmake/whisper.pc.in
+        "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
+        @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
+        DESTINATION lib/pkgconfig)

 #
 # programs, examples and tests
 #

 if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    enable_testing()
-    add_subdirectory(tests)
+    #include(CTest)
+    #add_subdirectory(tests)
 endif ()

 if (WHISPER_BUILD_EXAMPLES)
--- a/1292
+++ b/1292
--- a/Package.swift
+++ b/Package.swift
@ -27,17 +27,16 @@ let package = Package(
               "samples",
               "tests",
               "CMakeLists.txt",
-               "ggml-cuda.cu",
-               "ggml-cuda.h",
               "Makefile"
            ],
            sources: [
-                "ggml.c",
-                "whisper.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m"
+                "ggml/src/ggml.c",
+                "src/whisper.cpp",
+                "ggml/src/ggml-aarch64.c",
+                "ggml/src/ggml-alloc.c",
+                "ggml/src/ggml-backend.cpp",
+                "ggml/src/ggml-quants.c",
+                "ggml/src/ggml-metal.m"
            ],
            resources: [.process("ggml-metal.metal")],
            publicHeadersPath: "spm-headers",
--- a/README.md
+++ b/README.md
@ -7,22 +7,23 @@
 [![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

 - Plain C/C++ implementation without dependencies
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
+- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support)
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
+- [4-bit and 5-bit integer quantization support](#quantization)
 - Zero memory allocations at runtime
+- [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
+- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
+- [OpenVINO Support](#openvino-support)
+- [Ascend NPU Support](#ascend-npu-support)
+- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)

 Supported platforms:

@ -34,9 +35,9 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
+- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

-The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
+The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
 The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.

 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
@ -56,8 +57,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

 ## Implementation details

- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
+- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
+- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
 - Sample usage is demonstrated in [main.cpp](examples/main)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder
@ -72,17 +73,23 @@ First clone the repository:
 git clone https://github.com/ggerganov/whisper.cpp.git
 ```

+Navigate into the directory:
+
+```
+cd whisper.cpp
+```
+
 Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:

 ```bash
-bash ./models/download-ggml-model.sh base.en
+sh ./models/download-ggml-model.sh base.en
 ```

 Now build the [main](examples/main) example and transcribe an audio file like this:

 ```bash
 # build the main example
-make
+make -j

 # transcribe an audio file
 ./main -f samples/jfk.wav
@ -93,7 +100,7 @@ make
 For a quick demo, simply run `make base.en`:

 ```text
-$ make base.en
+$ make -j base.en

 cc  -I.              -O3 -std=c11   -pthread -DGGML_USE_ACCELERATE   -c ggml.c -o ggml.o
 c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
@ -146,7 +153,7 @@ options:
  -ng,       --no-gpu            [false  ] disable GPU


-bash ./models/download-ggml-model.sh base.en
+sh ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
 ggml-base.en.bin               100%[========================>] 141.11M  6.34MB/s    in 24s
 Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
@ -217,7 +224,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
 If you want some extra audio samples to play with, simply run:

 ```
-make samples
+make -j samples
 ```

 This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.
@ -225,17 +232,18 @@ This will download a few more audio files from Wikipedia and convert them to 16-
 You can download and run the other models as follows:

 ```
-make tiny.en
-make tiny
-make base.en
-make base
-make small.en
-make small
-make medium.en
-make medium
-make large-v1
-make large-v2
-make large-v3
+make -j tiny.en
+make -j tiny
+make -j base.en
+make -j base
+make -j small.en
+make -j small
+make -j medium.en
+make -j medium
+make -j large-v1
+make -j large-v2
+make -j large-v3
+make -j large-v3-turbo
 ```

 ## Memory usage
@ -257,7 +265,7 @@ Here are the steps for creating and using a quantized model:

 ```bash
 # quantize a model with Q5_0 method
-make quantize
+make -j quantize
 ./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0

 # run the examples as usual, specifying the quantized model file
@ -419,31 +427,19 @@ Now build `whisper.cpp` with CUDA support:

 ```
 make clean
-WHISPER_CUDA=1 make -j
+GGML_CUDA=1 make -j
 ```

-## OpenCL GPU support via CLBlast
-
-For cards and integrated GPUs that support OpenCL, the Encoder processing can be largely offloaded to the GPU through CLBlast. This is especially useful for users with AMD APUs or low end devices for up to ~2x speedup.
-
-First, make sure you have installed `CLBlast` for your OS or Distribution: https://github.com/CNugteren/CLBlast
-
-Now build `whisper.cpp` with CLBlast support:
+## Vulkan GPU support
+Cross-vendor solution which allows you to accelerate workload on your GPU.
+First, make sure your graphics card driver provides support for Vulkan API.

+Now build `whisper.cpp` with Vulkan support:
 ```
-Makefile:
-cd whisper.cpp
 make clean
-WHISPER_CLBLAST=1 make -j
-
-CMake:
-cd whisper.cpp
-cmake -B build -DWHISPER_CLBLAST=ON
-cmake --build build -j --config Release
+make GGML_VULKAN=1 -j
 ```

-Run all the examples as usual.
-
 ## BLAS CPU support via OpenBLAS

 Encoder processing can be accelerated on the CPU via OpenBLAS.
@ -453,7 +449,7 @@ Now build `whisper.cpp` with OpenBLAS support:

 ```
 make clean
-WHISPER_OPENBLAS=1 make -j
+GGML_OPENBLAS=1 make -j
 ```

 ## BLAS CPU support via Intel MKL
@ -471,6 +467,39 @@ cmake -DWHISPER_MKL=ON ..
 WHISPER_MKL=1 make -j
 ```

+## Ascend NPU support
+
+Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores. 
+
+First, check if your Ascend NPU device is supported:
+
+**Verified devices**
+| Ascend NPU                    | Status  |
+|:-----------------------------:|:-------:|
+| Atlas 300T A2                 | Support |
+
+Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community) . The lasted version of CANN is recommanded.
+
+Now build `whisper.cpp` with CANN support:
+
+```
+mkdir build
+cd build
+cmake .. -D GGML_CANN=on
+make -j
+```
+
+Run the inference examples as usual, for example:
+
+```
+./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
+```
+
+*Notes:*
+
+- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
+- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
+
 ## Docker

 ### Prerequisites
@ -607,7 +636,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

 ```bash
-make stream
+make stream -j
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@ -774,7 +803,7 @@ took to execute it. The results are summarized in the following Github issue:

 [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)

-Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).
+Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).

 You can run it with the following command, by default it will run against any standard model in the models folder.

@ -821,6 +850,7 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
+  - [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
 - [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
 - [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)

--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -14,9 +14,14 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR := build
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
-INCLUDE_PATH := $(abspath ../..)
+INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
 LIBRARY_PATH := $(abspath ../..)

+ifeq ($(GGML_CUDA),1)
+	LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
+	BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
+endif
+
 ifeq ($(UNAME_S),Darwin)
 	EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
 endif
--- a/bindings/go/README.md
+++ b/bindings/go/README.md
@ -62,6 +62,12 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model
 make examples
 ```

+To build using cuda support add `GGML_CUDA=1`:
+
+```bash
+GGML_CUDA=1 make examples
+```
+
 The examples are placed in the `build` directory. Once built, you can download all the models with the following command:

 ```bash
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -24,7 +24,7 @@ const (

 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
 )

 var (
--- a/bindings/go/go.mod
+++ b/bindings/go/go.mod
@ -1,10 +1,10 @@
 module github.com/ggerganov/whisper.cpp/bindings/go

-go 1.19
+go 1.23

 require (
 	github.com/go-audio/wav v1.1.0
-	github.com/stretchr/testify v1.8.1
+	github.com/stretchr/testify v1.9.0
 )

 require (
--- a/bindings/go/go.sum
+++ b/bindings/go/go.sum
@ -1,4 +1,3 @@
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@ -9,15 +8,9 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
-github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
-github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
-github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
-github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -119,6 +119,28 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }

+func (p *Params) SetMaxContext(n int) {
+	p.n_max_text_ctx = C.int(n)
+}
+
+func (p *Params) SetBeamSize(n int) {
+	p.beam_search.beam_size = C.int(n)
+}
+
+func (p *Params) SetEntropyThold(t float32) {
+	p.entropy_thold = C.float(t)
+}
+
+func (p *Params) SetTemperature(t float32) {
+	p.temperature = C.float(t)
+}
+
+// Sets the fallback temperature incrementation
+// Pass -1.0 to disable this feature
+func (p *Params) SetTemperatureFallback(t float32) {
+	p.temperature_inc = C.float(t)
+}
+
 // Set initial prompt
 func (p *Params) SetInitialPrompt(prompt string) {
 	p.initial_prompt = C.CString(prompt)
@ -149,6 +171,10 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
 	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
+	str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
+	str += fmt.Sprintf(" temperature=%f", p.temperature)
+	str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
+	str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
 	if p.translate {
 		str += " translate"
 	}
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -125,6 +125,32 @@ func (context *context) SetAudioCtx(n uint) {
 	context.params.SetAudioCtx(int(n))
 }

+// Set maximum number of text context tokens to store
+func (context *context) SetMaxContext(n int) {
+	context.params.SetMaxContext(n)
+}
+
+// Set Beam Size
+func (context *context) SetBeamSize(n int) {
+	context.params.SetBeamSize(n)
+}
+
+// Set Entropy threshold
+func (context *context) SetEntropyThold(t float32) {
+	context.params.SetEntropyThold(t)
+}
+
+// Set Temperature
+func (context *context) SetTemperature(t float32) {
+	context.params.SetTemperature(t)
+}
+
+// Set the fallback temperature incrementation
+// Pass -1.0 to disable this feature
+func (context *context) SetTemperatureFallback(t float32) {
+	context.params.SetTemperatureFallback(t)
+}
+
 // Set initial prompt
 func (context *context) SetInitialPrompt(prompt string) {
 	context.params.SetInitialPrompt(prompt)
--- a/bindings/go/pkg/whisper/context_test.go
+++ b/bindings/go/pkg/whisper/context_test.go
@ -4,52 +4,90 @@ import (
 	"os"
 	"testing"

-	// Packages
-	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-audio/wav"
 	assert "github.com/stretchr/testify/assert"
 )

-const (
-	ModelPath  = "../../models/ggml-tiny.bin"
-	SamplePath = "../../samples/jfk.wav"
-)
-
-func Test_Whisper_000(t *testing.T) {
+func TestSetLanguage(t *testing.T) {
 	assert := assert.New(t)
-	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
-		t.Skip("Skipping test, model not found:", ModelPath)
-	}
-	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
-		t.Skip("Skipping test, sample not found:", SamplePath)
-	}

-	// Load model
-	model, err := whisper.New(ModelPath)
-	assert.NoError(err)
-	assert.NotNil(model)
-	assert.NoError(model.Close())
-
-	t.Log("languages=", model.Languages())
-}
-
-func Test_Whisper_001(t *testing.T) {
-	assert := assert.New(t)
-	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
-		t.Skip("Skipping test, model not found:", ModelPath)
-	}
-	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
-		t.Skip("Skipping test, sample not found:", SamplePath)
-	}
-
-	// Load model
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	defer model.Close()

-	// Get context for decoding
-	ctx, err := model.NewContext()
+	context, err := model.NewContext()
 	assert.NoError(err)
-	assert.NotNil(ctx)

+	// This returns an error since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	err = context.SetLanguage("en")
+	assert.Error(err)
+}
+
+func TestContextModelIsMultilingual(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	isMultilingual := context.IsMultilingual()
+
+	// This returns false since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	assert.False(isMultilingual)
+}
+
+func TestLanguage(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	// This always returns en since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	expectedLanguage := "en"
+	actualLanguage := context.Language()
+	assert.Equal(expectedLanguage, actualLanguage)
+}
+
+func TestProcess(t *testing.T) {
+	assert := assert.New(t)
+
+	fh, err := os.Open(SamplePath)
+	assert.NoError(err)
+	defer fh.Close()
+
+	// Decode the WAV file - load the full buffer
+	dec := wav.NewDecoder(fh)
+	buf, err := dec.FullPCMBuffer()
+	assert.NoError(err)
+	assert.Equal(uint16(1), dec.NumChans)
+
+	data := buf.AsFloat32Buffer().Data
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	err = context.Process(data, nil, nil)
+	assert.NoError(err)
 }
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -38,17 +38,22 @@ type Context interface {
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language

-	SetOffset(time.Duration)        // Set offset
-	SetDuration(time.Duration)      // Set duration
-	SetThreads(uint)                // Set number of threads to use
-	SetSplitOnWord(bool)            // Set split on word flag
-	SetTokenThreshold(float32)      // Set timestamp token probability threshold
-	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
-	SetMaxSegmentLength(uint)       // Set max segment length in characters
-	SetTokenTimestamps(bool)        // Set token timestamps flag
-	SetMaxTokensPerSegment(uint)    // Set max tokens per segment (0 = no limit)
-	SetAudioCtx(uint)               // Set audio encoder context
-	SetInitialPrompt(prompt string) // Set initial prompt
+	SetOffset(time.Duration)          // Set offset
+	SetDuration(time.Duration)        // Set duration
+	SetThreads(uint)                  // Set number of threads to use
+	SetSplitOnWord(bool)              // Set split on word flag
+	SetTokenThreshold(float32)        // Set timestamp token probability threshold
+	SetTokenSumThreshold(float32)     // Set timestamp token sum probability threshold
+	SetMaxSegmentLength(uint)         // Set max segment length in characters
+	SetTokenTimestamps(bool)          // Set token timestamps flag
+	SetMaxTokensPerSegment(uint)      // Set max tokens per segment (0 = no limit)
+	SetAudioCtx(uint)                 // Set audio encoder context
+	SetMaxContext(n int)              // Set maximum number of text context tokens to store
+	SetBeamSize(n int)                // Set Beam Size
+	SetEntropyThold(t float32)        // Set Entropy threshold
+	SetInitialPrompt(prompt string)   // Set initial prompt
+	SetTemperature(t float32)         // Set temperature
+	SetTemperatureFallback(t float32) // Set temperature incrementation

 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
--- a/bindings/go/pkg/whisper/model_test.go
+++ b/bindings/go/pkg/whisper/model_test.go
@ -0,0 +1,91 @@
+package whisper_test
+
+import (
+	"testing"
+
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	assert "github.com/stretchr/testify/assert"
+)
+
+func TestNew(t *testing.T) {
+	assert := assert.New(t)
+	t.Run("valid model path", func(t *testing.T) {
+		model, err := whisper.New(ModelPath)
+		assert.NoError(err)
+		assert.NotNil(model)
+		defer model.Close()
+
+	})
+
+	t.Run("invalid model path", func(t *testing.T) {
+		invalidModelPath := "invalid-model-path.bin"
+		model, err := whisper.New(invalidModelPath)
+		assert.Error(err)
+		assert.Nil(model)
+	})
+}
+
+func TestClose(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+
+	err = model.Close()
+	assert.NoError(err)
+}
+
+func TestNewContext(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+	assert.NotNil(context)
+}
+
+func TestIsMultilingual(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	isMultilingual := model.IsMultilingual()
+
+	// This returns false since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	assert.False(isMultilingual)
+}
+
+func TestLanguages(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	expectedLanguages := []string{
+		"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
+		"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
+		"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
+		"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
+		"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
+		"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
+		"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
+		"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
+		"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
+	}
+
+	actualLanguages := model.Languages()
+
+	assert.Equal(expectedLanguages, actualLanguages)
+}
--- a/bindings/go/pkg/whisper/util_test.go
+++ b/bindings/go/pkg/whisper/util_test.go
@ -0,0 +1,6 @@
+package whisper_test
+
+const (
+	ModelPath  = "../../models/ggml-small.en.bin"
+	SamplePath = "../../samples/jfk.wav"
+)
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -9,7 +9,7 @@ import (
 // CGO

 /*
-#cgo LDFLAGS: -lwhisper -lm -lstdc++
+#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
 #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
 #include <whisper.h>
 #include <stdlib.h>
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.6.2",
+  "version": "1.7.1",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/.gitignore
+++ b/bindings/ruby/.gitignore
@ -0,0 +1,3 @@
+LICENSE
+pkg/
+lib/whisper.*
--- a/bindings/ruby/README.md
+++ b/bindings/ruby/README.md
@ -0,0 +1,111 @@
+whispercpp
+==========
+
+![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
+
+Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
+
+Installation
+------------
+
+Install the gem and add to the application's Gemfile by executing:
+
+    $ bundle add whispercpp
+
+If bundler is not being used to manage dependencies, install the gem by executing:
+
+    $ gem install whispercpp
+
+Usage
+-----
+
+```ruby
+require "whisper"
+
+whisper = Whisper::Context.new("path/to/model.bin")
+
+params = Whisper::Params.new
+params.language = "en"
+params.offset = 10_000
+params.duration = 60_000
+params.max_text_tokens = 300
+params.translate = true
+params.print_timestamps = false
+params.prompt = "Initial prompt here."
+
+whisper.transcribe("path/to/audio.wav", params) do |whole_text|
+  puts whole_text
+end
+
+```
+
+### Preparing model ###
+
+Use script to download model file(s):
+
+```bash
+git clone https://github.com/ggerganov/whisper.cpp.git
+cd whisper.cpp
+sh ./models/download-ggml-model.sh base.en
+```
+
+There are some types of models. See [models][] page for details.
+
+### Preparing audio file ###
+
+Currently, whisper.cpp accepts only 16-bit WAV files.
+
+### API ###
+
+Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
+
+```ruby
+def format_time(time_ms)
+  sec, decimal_part = time_ms.divmod(1000)
+  min, sec = sec.divmod(60)
+  hour, min = min.divmod(60)
+  "%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
+end
+
+whisper.transcribe("path/to/audio.wav", params)
+
+whisper.each_segment.with_index do |segment, index|
+  line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
+    nth: index + 1,
+    st: format_time(segment.start_time),
+    ed: format_time(segment.end_time),
+    text: segment.text
+  }
+  line << " (speaker turned)" if segment.speaker_next_turn?
+  puts line
+end
+
+```
+
+You can also add hook to params called on new segment:
+
+```ruby
+def format_time(time_ms)
+  sec, decimal_part = time_ms.divmod(1000)
+  min, sec = sec.divmod(60)
+  hour, min = min.divmod(60)
+  "%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
+end
+
+# Add hook before calling #transcribe
+params.on_new_segment do |segment|
+  line = "[%{st} --> %{ed}] %{text}" % {
+    st: format_time(segment.start_time),
+    ed: format_time(segment.end_time),
+    text: segment.text
+  }
+  line << " (speaker turned)" if segment.speaker_next_turn?
+  puts line
+end
+
+whisper.transcribe("path/to/audio.wav", params)
+
+```
+
+[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
+[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models
--- a/bindings/ruby/Rakefile
+++ b/bindings/ruby/Rakefile
@ -1,12 +1,59 @@
 require 'rake/clean'
-  require 'rubygems/package'
+require "bundler/gem_tasks"
+require "pathname"
+require "yaml"
+require "rake/testtask"

-desc 'Build gem'
-task :package do
-  spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
-  spec = nil
-  # see: http://gist.github.com/16215
-  Thread.new { spec = eval("#{spec_source}") }.join
-  spec.validate
-  Gem::Package.build(spec)
+extsources = YAML.load_file("extsources.yaml")
+SOURCES = FileList[]
+extsources.each do |src|
+  basename = src.pathmap("%f")
+  dest = basename == "LICENSE" ? basename : basename.pathmap("ext/%f")
+  file src
+  file dest => src do |t|
+    cp t.source, t.name
+  end
+  SOURCES.include dest
+end
+CLEAN.include SOURCES
+CLEAN.include FileList[
+                "ext/*.o",
+                "ext/*.metal",
+                "ext/whisper.{so,bundle,dll}",
+                "ext/depend"
+              ]
+
+task build: SOURCES + FileList[
+                        "ext/extconf.rb",
+                        "ext/ruby_whisper.h",
+                        "ext/ruby_whisper.cpp",
+                        "whispercpp.gemspec",
+                      ]
+
+directory "pkg"
+CLOBBER.include "pkg"
+
+TEST_MODEL = "../../models/ggml-base.en.bin"
+LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
+LIB_FILE = File.join("lib", LIB_NAME)
+
+directory "lib"
+task LIB_FILE => SOURCES + ["lib"] do |t|
+  Dir.chdir "ext" do
+    sh "ruby extconf.rb"
+    sh "make"
+  end
+  mv "ext/#{LIB_NAME}", t.name
+end
+CLEAN.include LIB_FILE
+
+Rake::TestTask.new do |t|
+  t.test_files = FileList["tests/test_*.rb"]
+end
+task test: [TEST_MODEL, LIB_FILE]
+
+file TEST_MODEL do
+  Dir.chdir "../.." do
+    sh "./models/download-ggml-model.sh base.en"
+  end
 end
--- a/bindings/ruby/ext/.gitignore
+++ b/bindings/ruby/ext/.gitignore
@ -3,7 +3,33 @@ ggml.c
 ggml.h
 ggml-alloc.c
 ggml-alloc.h
-whisper.bundle
+ggml-aarch64.c
+ggml-aarch64.h
+ggml-backend.cpp
+ggml-backend-impl.h
+ggml-backend.c
+ggml-backend.h
+ggml-common.h
+ggml-cpu-impl.h
+ggml-metal.m
+ggml-metal.metal
+ggml-metal-embed.metal
+ggml-blas.cpp
+ggml-cuda.h
+ggml-impl.h
+ggml-kompute.h
+ggml-metal.h
+ggml-opencl.h
+ggml-quants.c
+ggml-quants.h
+ggml-sycl.h
+ggml-vulkan.h
+ggml-blas.h
+get-flags.mk
 whisper.cpp
 whisper.h
 dr_wav.h
+depend
+whisper.bundle
+whisper.so
+whisper.dll
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -1,20 +1,4 @@
 require 'mkmf'
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
-

 # need to use c++ compiler flags
 $CXXFLAGS << ' -std=c++11'
@ -28,4 +12,219 @@ if enable_config('march-tune-native', false)
  $CXXFLAGS << ' -march=native -mtune=native'
 end

-create_makefile('whisper')
+def with_disabling_unsupported_files
+  disabled_files = []
+
+  unless $GGML_METAL
+    disabled_files << 'ggml-metal.h' << 'ggml-metal.m'
+  end
+
+  unless $GGML_METAL_EMBED_LIBRARY
+    disabled_files << 'ggml-metal.metal'
+  end
+
+  unless $OBJ_ALL&.include? 'ggml-blas.o'
+    disabled_files << 'ggml-blas.h' << 'ggml-blas.cpp'
+  end
+
+  disabled_files.filter! {|file| File.exist? file}
+
+  disabled_files.each do |file|
+    File.rename file, "#{file}.disabled"
+  end
+
+  yield
+
+  disabled_files.each do |file|
+    File.rename "#{file}.disabled", file
+  end
+end
+
+if ENV['WHISPER_METAL']
+  $GGML_METAL ||= true
+  $DEPRECATE_WARNING ||= true
+end
+
+$UNAME_S = `uname -s`.chomp
+$UNAME_P = `uname -p`.chomp
+$UNAME_M = `uname -m`.chomp
+
+if $UNAME_S == 'Darwin'
+  unless ENV['GGML_NO_METAL']
+    $GGML_METAL ||= true
+  end
+  $GGML_NO_OPENMP ||= true
+end
+
+if $GGML_METAL
+  $GGML_METAL_EMBED_LIBRARY = true
+end
+
+$MK_CPPFLAGS = ''
+$MK_CFLAGS   = '-std=c11   -fPIC'
+$MK_CXXFLAGS = '-std=c++11 -fPIC'
+$MK_NVCCFLAGS = '-std=c++11'
+$MK_LDFLAGS = ''
+
+$OBJ_GGML = ''
+$OBJ_WHISPER = ''
+$OBJ_COMMON = ''
+$OBJ_SDL = ''
+
+$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
+
+if $UNAME_S == 'Linux'
+  $MK_CPPFLAGS << ' -D_GNU_SOURCE'
+end
+
+if $UNAME_S == 'Darwin'
+  $MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
+end
+
+if ENV['WHISPER_DEBUG']
+  $MK_CFLAGS    << ' -O0 -g'
+  $MK_CXXFLAGS  << ' -O0 -g'
+  $MK_LDFLAGS   << ' -g'
+  $MK_NVCCFLAGS << ' -O0 -g'
+else
+  $MK_CPPFLAGS   << ' -DNDEBUG'
+  $MK_CFLAGS     << ' -O3'
+  $MK_CXXFLAGS   << ' -O3'
+  $MK_NVCCFLAGS  << ' -O3'
+end
+
+$WARN_FLAGS =
+  ' -Wall' <<
+  ' -Wextra' <<
+  ' -Wpedantic' <<
+  ' -Wcast-qual' <<
+  ' -Wno-unused-function'
+
+$MK_CFLAGS <<
+  $WARN_FLAGS <<
+  ' -Wshadow' <<
+  ' -Wstrict-prototypes' <<
+  ' -Wpointer-arith' <<
+  ' -Wmissing-prototypes' <<
+  ' -Werror=implicit-int' <<
+  ' -Werror=implicit-function-declaration'
+
+$MK_CXXFLAGS <<
+  $WARN_FLAGS <<
+  ' -Wmissing-declarations' <<
+  ' -Wmissing-noreturn'
+
+unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
+  $MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
+end
+
+if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
+  $MK_CFLAGS   << ' -pthread'
+  $MK_CXXFLAGS << ' -pthread'
+end
+
+unless $_WIN32
+  $DSO_EXT = '.so'
+else
+  $DSO_EXT = '.dll'
+end
+
+unless ENV['RISCV']
+  if %w[x86_64 i686 amd64].include? $UNAME_M
+    $HOST_CXXFLAGS ||= ''
+
+    $MK_CFLAGS     << ' -march=native -mtune=native'
+    $HOST_CXXFLAGS << ' -march=native -mtune=native'
+  end
+
+  if $UNAME_M.match? /aarch64.*/
+    $MK_CFLAGS   << ' -mcpu=native'
+    $MK_CXXFLAGS << ' -mcpu=native'
+  end
+else
+  $MK_CFLAGS   << ' -march=rv64gcv -mabi=lp64d'
+  $MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
+end
+
+unless ENV['GGML_NO_ACCELERATE']
+  if $UNAME_S == 'Darwin'
+    $MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS'
+    $MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
+    $MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
+    $MK_LDFLAGS  << ' -framework Accelerate'
+    $OBJ_GGML    << ' ggml-blas.o'
+  end
+end
+
+if ENV['GGML_OPENBLAS']
+  $MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
+  $MK_CFLAGS   << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
+  $MK_LDFLAGS  << " #{`pkg-config --libs openblas`}"
+  $OBJ_GGML    << ' ggml-blas.o'
+end
+
+if ENV['GGML_OPENBLAS64']
+  $MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
+  $MK_CFLAGS   << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
+  $MK_LDFLAGS  << " #{`pkg-config --libs openblas64`}"
+  $OBJ_GGML    << ' ggml-blas.o'
+end
+
+if $GGML_METAL
+  $MK_CPPFLAGS << ' -DGGML_USE_METAL'
+  $MK_LDFLAGS  << ' -framework Foundation -framework Metal -framework MetalKit'
+  $OBJ_GGML    << ' ggml-metal.o'
+
+  if ENV['GGML_METAL_NDEBUG']
+    $MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
+  end
+
+  if $GGML_METAL_EMBED_LIBRARY
+    $MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
+    $OBJ_GGML    << ' ggml-metal-embed.o'
+  end
+end
+
+$OBJ_GGML <<
+  ' ggml.o' <<
+  ' ggml-alloc.o' <<
+  ' ggml-backend.o' <<
+  ' ggml-quants.o' <<
+  ' ggml-aarch64.o'
+
+$OBJ_WHISPER <<
+  ' whisper.o'
+
+$OBJ_ALL = "#{$OBJ_GGML} #{$OBJ_WHISPER} #{$OBJ_COMMON} #{$OBJ_SDL}"
+
+$CPPFLAGS  = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
+$CFLAGS    = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
+$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
+$CXXFLAGS  = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
+$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
+$LDFLAGS   = "#{$MK_LDFLAGS} #{$LDFLAGS}"
+
+if $GGML_METAL_EMBED_LIBRARY
+  File.write 'depend', "$(OBJS): $(OBJS) ggml-metal-embed.o\n"
+end
+
+with_disabling_unsupported_files do
+
+  create_makefile('whisper')
+
+end
+
+File.open 'Makefile', 'a' do |file|
+  file.puts 'include get-flags.mk'
+
+  if $GGML_METAL
+    if $GGML_METAL_EMBED_LIBRARY
+      # mkmf determines object files to compile dependent on existing *.{c,cpp,m} files
+      # but ggml-metal-embed.c doesn't exist on creating Makefile.
+      file.puts "objs := $(OBJS)"
+      file.puts "OBJS = $(objs) 'ggml-metal-embed.o'"
+
+      file.puts 'include metal-embed.mk'
+    end
+  end
+end
--- a/bindings/ruby/ext/ggml-backend-impl.h
+++ b/bindings/ruby/ext/ggml-backend-impl.h
@ -1,141 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
-        void (*GGML_CALL free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // (optional) event synchronization
-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
-    };
-
-    struct ggml_backend {
-        ggml_guid_t guid;
-
-        struct ggml_backend_i iface;
-        ggml_backend_context_t context;
-    };
-
-    struct ggml_backend_event {
-        ggml_backend_t backend;
-        void * context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ggml-backend.c
+++ b/bindings/ruby/ext/ggml-backend.c
--- a/bindings/ruby/ext/ggml-backend.h
+++ b/bindings/ruby/ext/ggml-backend.h
@ -1,233 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend_event * ggml_backend_event_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
-
-    // buffer
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-    };
-
-    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
-
-    //
-    // Backend
-    //
-
-    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
-    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // asynchronous copy
-    // the copy is performed after all the currently queued operations in backend_src
-    // backend_dst will wait for the copy to complete before performing other operations
-    // automatic fallback to sync copy if async is not supported
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // events
-    GGML_API ggml_backend_event_t   ggml_backend_event_new        (ggml_backend_t backend);
-    GGML_API void                   ggml_backend_event_free       (ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_record     (ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
-
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    // Create a backend buffer from an existing pointer
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backends to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-        // preferrably to run on the same backend as the buffer
-        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
-
-        // initialize buffers from a max size graph (optional)
-        reserve_graph = build_graph(sched, max_batch_size);
-
-        // manually assign nodes to a backend (optional, should not be needed in most cases)
-        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
-
-        ggml_backend_sched_reserve(sched, reserve_graph);
-
-        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
-
-        // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
-    }
-    */
-
-    struct ggml_backend_sched;
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // when ask == true, the scheduler wants to know if the user wants to observe this node
-    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
-    //
-    // when ask == false, the scheduler is passing the node tensor to the user for observation
-    // if the user returns false, the scheduler will cancel the graph compute
-    //
-    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
-    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
-    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
-
-    // Initialize backend buffers from a measure graph
-    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-
-    // Get the number of splits of the last graph
-    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
-
-    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-
-    // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    // Set a callback to be called for each resulting node during graph compute
-    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-
-
-#ifdef  __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ggml-cuda.h
+++ b/bindings/ruby/ext/ggml-cuda.h
@ -1,43 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-// device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ggml-impl.h
+++ b/bindings/ruby/ext/ggml-impl.h
@ -1,272 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-// GGML internal header
-
-#include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
-
-struct ggml_hash_set ggml_hash_set_new(size_t size);
-
-bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// return index, asserts if table is full
-size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-#ifdef __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ggml-metal.h
+++ b/bindings/ruby/ext/ggml-metal.h
@ -1,66 +0,0 @@
-// An interface allowing to compute ggml_cgraph with Metal
-//
-// This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
-//
-// How it works?
-//
-// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
-// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
-// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
-//
-// You only need to make sure that all memory buffers that you used during the graph creation
-// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
-// used during the graph evaluation to determine the arguments of the compute kernels.
-//
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stddef.h>
-#include <stdbool.h>
-
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//
-// backend API
-// user-code should use only these functions
-//
-
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
-
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
-
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
-
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
-// helper to check if the device supports a specific family
-// ideally, the user code should be doing these checks
-// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
-
-// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
-
-#ifdef __cplusplus
-}
-#endif
-
--- a/bindings/ruby/ext/ggml-opencl.h
+++ b/bindings/ruby/ext/ggml-opencl.h
@ -1,36 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-GGML_API void ggml_cl_init(void);
-
-GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
-GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-
-// GGML_API void * ggml_cl_host_malloc(size_t size);
-// GGML_API void   ggml_cl_host_free(void * ptr);
-
-GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
-
-GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-// backend API
-
-// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
-
-// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
-// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ggml-quants.c
+++ b/bindings/ruby/ext/ggml-quants.c
--- a/bindings/ruby/ext/ggml-sycl.h
+++ b/bindings/ruby/ext/ggml-sycl.h
@ -1,49 +0,0 @@
-//
-//  MIT license
-//  Copyright (C) 2024 Intel Corporation
-//  SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_SYCL_MAX_DEVICES       48
-#define GGML_SYCL_NAME "SYCL"
-
-// backend API
-GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
-
-// devide buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-
-GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
-GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-GGML_API GGML_CALL int   ggml_backend_sycl_get_device_count();
-GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
-GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
-
-// TODO: these are temporary
-//       ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
-GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
-GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
-
-// SYCL doesn't support registering host memory, keep here for reference
-// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
-#ifdef  __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/ggml-vulkan.h
+++ b/bindings/ruby/ext/ggml-vulkan.h
@ -1,29 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_VK_NAME "Vulkan"
-#define GGML_VK_MAX_DEVICES 16
-
-GGML_API void ggml_vk_instance_init(void);
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
-
-GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/bindings/ruby/ext/metal-embed.mk
+++ b/bindings/ruby/ext/metal-embed.mk
@ -0,0 +1,14 @@
+ggml-metal-embed.o: \
+	ggml-metal.metal \
+	ggml-common.h
+	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib"            >  $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start"                 >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:"                       >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\""          >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end"                   >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:"                         >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
--- a/bindings/ruby/ext/ruby_whisper.h
+++ b/bindings/ruby/ext/ruby_whisper.h
@ -3,6 +3,13 @@

 #include "whisper.h"

+typedef struct {
+  VALUE *context;
+  VALUE user_data;
+  VALUE callback;
+  VALUE callbacks;
+} ruby_whisper_callback_container;
+
 typedef struct {
  struct whisper_context *context;
 } ruby_whisper;
@ -10,6 +17,9 @@ typedef struct {
 typedef struct {
  struct whisper_full_params params;
  bool diarize;
+  ruby_whisper_callback_container *new_segment_callback_container;
+  ruby_whisper_callback_container *progress_callback_container;
+  ruby_whisper_callback_container *abort_callback_container;
 } ruby_whisper_params;

 #endif
--- a/bindings/ruby/extsources.yaml
+++ b/bindings/ruby/extsources.yaml
@ -0,0 +1,29 @@
+---
+- ../../src/whisper.cpp
+- ../../include/whisper.h
+- ../../ggml/src/ggml.c
+- ../../ggml/src/ggml-impl.h
+- ../../ggml/src/ggml-aarch64.h
+- ../../ggml/src/ggml-aarch64.c
+- ../../ggml/src/ggml-alloc.c
+- ../../ggml/src/ggml-backend-impl.h
+- ../../ggml/src/ggml-backend.cpp
+- ../../ggml/src/ggml-common.h
+- ../../ggml/src/ggml-quants.h
+- ../../ggml/src/ggml-quants.c
+- ../../ggml/src/ggml-cpu-impl.h
+- ../../ggml/src/ggml-metal.m
+- ../../ggml/src/ggml-metal.metal
+- ../../ggml/src/ggml-blas.cpp
+- ../../ggml/include/ggml.h
+- ../../ggml/include/ggml-alloc.h
+- ../../ggml/include/ggml-backend.h
+- ../../ggml/include/ggml-cuda.h
+- ../../ggml/include/ggml-kompute.h
+- ../../ggml/include/ggml-metal.h
+- ../../ggml/include/ggml-sycl.h
+- ../../ggml/include/ggml-vulkan.h
+- ../../ggml/include/ggml-blas.h
+- ../../scripts/get-flags.mk
+- ../../examples/dr_wav.h
+- ../../LICENSE
--- a/bindings/ruby/tests/test_callback.rb
+++ b/bindings/ruby/tests/test_callback.rb
@ -0,0 +1,163 @@
+require "test/unit"
+require "whisper"
+
+class TestCallback < Test::Unit::TestCase
+  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+
+  def setup
+    GC.start
+    @params = Whisper::Params.new
+    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
+    @audio = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
+  end
+
+  def test_new_segment_callback
+    @params.new_segment_callback = ->(context, state, n_new, user_data) {
+      assert_kind_of Integer, n_new
+      assert n_new > 0
+      assert_same @whisper, context
+
+      n_segments = context.full_n_segments
+      n_new.times do |i|
+        i_segment = n_segments - 1 + i
+        start_time = context.full_get_segment_t0(i_segment) * 10
+        end_time = context.full_get_segment_t1(i_segment) * 10
+        text = context.full_get_segment_text(i_segment)
+
+        assert_kind_of Integer, start_time
+        assert start_time >= 0
+        assert_kind_of Integer, end_time
+        assert end_time > 0
+        assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
+      end
+    }
+
+    @whisper.transcribe(@audio, @params)
+  end
+
+  def test_new_segment_callback_closure
+    search_word = "what"
+    @params.new_segment_callback = ->(context, state, n_new, user_data) {
+      n_segments = context.full_n_segments
+      n_new.times do |i|
+        i_segment = n_segments - 1 + i
+        text = context.full_get_segment_text(i_segment)
+        if text.include?(search_word)
+          t0 = context.full_get_segment_t0(i_segment)
+          t1 = context.full_get_segment_t1(i_segment)
+          raise "search word '#{search_word}' found at between #{t0} and #{t1}"
+        end
+      end
+    }
+
+    assert_raise RuntimeError do
+      @whisper.transcribe(@audio, @params)
+    end
+  end
+
+  def test_new_segment_callback_user_data
+    udata = Object.new
+    @params.new_segment_callback_user_data = udata
+    @params.new_segment_callback = ->(context, state, n_new, user_data) {
+      assert_same udata, user_data
+    }
+
+    @whisper.transcribe(@audio, @params)
+  end
+
+  def test_new_segment_callback_user_data_gc
+    @params.new_segment_callback_user_data = "My user data"
+    @params.new_segment_callback = ->(context, state, n_new, user_data) {
+      assert_equal "My user data", user_data
+    }
+    GC.start
+
+    assert_same @whisper, @whisper.transcribe(@audio, @params)
+  end
+
+  def test_progress_callback
+    first = nil
+    last = nil
+    @params.progress_callback = ->(context, state, progress, user_data) {
+      assert_kind_of Integer, progress
+      assert 0 <= progress && progress <= 100
+      assert_same @whisper, context
+      first = progress if first.nil?
+      last = progress
+    }
+    @whisper.transcribe(@audio, @params)
+    assert_equal 0, first
+    assert_equal 100, last
+  end
+
+  def test_progress_callback_user_data
+    udata = Object.new
+    @params.progress_callback_user_data = udata
+    @params.progress_callback = ->(context, state, n_new, user_data) {
+      assert_same udata, user_data
+    }
+
+    @whisper.transcribe(@audio, @params)
+  end
+
+  def test_on_progress
+    first = nil
+    last = nil
+    @params.on_progress do |progress|
+      assert_kind_of Integer, progress
+      assert 0 <= progress && progress <= 100
+      first = progress if first.nil?
+      last = progress
+    end
+    @whisper.transcribe(@audio, @params)
+    assert_equal 0, first
+    assert_equal 100, last
+  end
+
+  def test_abort_callback
+    i = 0
+    @params.abort_callback = ->(user_data) {
+      assert_nil user_data
+      i += 1
+      return false
+    }
+    @whisper.transcribe(@audio, @params)
+    assert i > 0
+  end
+
+  def test_abort_callback_abort
+    i = 0
+    @params.abort_callback = ->(user_data) {
+      i += 1
+      return i == 3
+    }
+    @whisper.transcribe(@audio, @params)
+    assert_equal 3, i
+  end
+
+  def test_abort_callback_user_data
+    udata = Object.new
+    @params.abort_callback_user_data = udata
+    yielded = nil
+    @params.abort_callback = ->(user_data) {
+      yielded = user_data
+    }
+    @whisper.transcribe(@audio, @params)
+    assert_same udata, yielded
+  end
+
+  def test_abort_on
+    do_abort = false
+    aborted_from_callback = false
+    @params.on_new_segment do |segment|
+      do_abort = true if segment.text.match? /ask/
+    end
+    i = 0
+    @params.abort_on do
+      i += 1
+      do_abort
+    end
+    @whisper.transcribe(@audio, @params)
+    assert i > 0
+  end
+end
--- a/bindings/ruby/tests/test_package.rb
+++ b/bindings/ruby/tests/test_package.rb
@ -0,0 +1,31 @@
+require 'test/unit'
+require 'tempfile'
+require 'tmpdir'
+require 'shellwords'
+
+class TestPackage < Test::Unit::TestCase
+  def test_build
+    Tempfile.create do |file|
+      assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
+      assert file.size > 0
+      assert_path_exist file.to_path
+    end
+  end
+
+  sub_test_case "Building binary on installation" do
+    def setup
+      system "rake", "build", exception: true
+    end
+
+    def test_install
+      match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
+      filename = match_data[1]
+      version = match_data[2]
+      basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
+      Dir.mktmpdir do |dir|
+        system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
+        assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
+      end
+    end
+  end
+end
--- a/bindings/ruby/tests/test_params.rb
+++ b/bindings/ruby/tests/test_params.rb
@ -0,0 +1,155 @@
+require 'test/unit'
+require 'whisper'
+
+class TestParams < Test::Unit::TestCase
+  def setup
+    @params  = Whisper::Params.new
+  end
+
+  def test_language
+    @params.language = "en"
+    assert_equal @params.language, "en"
+    @params.language = "auto"
+    assert_equal @params.language, "auto"
+  end
+
+  def test_offset
+    @params.offset = 10_000
+    assert_equal @params.offset, 10_000
+    @params.offset = 0
+    assert_equal @params.offset, 0
+  end
+
+  def test_duration
+    @params.duration = 60_000
+    assert_equal @params.duration, 60_000
+    @params.duration = 0
+    assert_equal @params.duration, 0
+  end
+
+  def test_max_text_tokens
+    @params.max_text_tokens = 300
+    assert_equal @params.max_text_tokens, 300
+    @params.max_text_tokens = 0
+    assert_equal @params.max_text_tokens, 0
+  end
+
+  def test_translate
+    @params.translate = true
+    assert @params.translate
+    @params.translate = false
+    assert !@params.translate
+  end
+
+  def test_no_context
+    @params.no_context = true
+    assert @params.no_context
+    @params.no_context = false
+    assert !@params.no_context
+  end
+
+  def test_single_segment
+    @params.single_segment = true
+    assert @params.single_segment
+    @params.single_segment = false
+    assert !@params.single_segment
+  end
+
+  def test_print_special
+    @params.print_special = true
+    assert @params.print_special
+    @params.print_special = false
+    assert !@params.print_special
+  end
+
+  def test_print_progress
+    @params.print_progress = true
+    assert @params.print_progress
+    @params.print_progress = false
+    assert !@params.print_progress
+  end
+
+  def test_print_realtime
+    @params.print_realtime = true
+    assert @params.print_realtime
+    @params.print_realtime = false
+    assert !@params.print_realtime
+  end
+
+  def test_print_timestamps
+    @params.print_timestamps = true
+    assert @params.print_timestamps
+    @params.print_timestamps = false
+    assert !@params.print_timestamps
+  end
+
+  def test_suppress_blank
+    @params.suppress_blank = true
+    assert @params.suppress_blank
+    @params.suppress_blank = false
+    assert !@params.suppress_blank
+  end
+
+  def test_suppress_non_speech_tokens
+    @params.suppress_non_speech_tokens = true
+    assert @params.suppress_non_speech_tokens
+    @params.suppress_non_speech_tokens = false
+    assert !@params.suppress_non_speech_tokens
+  end
+
+  def test_token_timestamps
+    @params.token_timestamps = true
+    assert @params.token_timestamps
+    @params.token_timestamps = false
+    assert !@params.token_timestamps
+  end
+
+  def test_split_on_word
+    @params.split_on_word = true
+    assert @params.split_on_word
+    @params.split_on_word = false
+    assert !@params.split_on_word
+  end
+
+  def test_initial_prompt
+    assert_nil @params.initial_prompt
+    @params.initial_prompt = "You are a polite person."
+    assert_equal "You are a polite person.", @params.initial_prompt
+  end
+
+  def test_temperature
+    assert_equal 0.0, @params.temperature
+    @params.temperature = 0.5
+    assert_equal 0.5, @params.temperature
+  end
+
+  def test_max_initial_ts
+    assert_equal 1.0, @params.max_initial_ts
+    @params.max_initial_ts = 600.0
+    assert_equal 600.0, @params.max_initial_ts
+  end
+
+  def test_length_penalty
+    assert_equal -1.0, @params.length_penalty
+    @params.length_penalty = 0.5
+    assert_equal 0.5, @params.length_penalty
+  end
+
+  def test_temperature_inc
+    assert_in_delta 0.2, @params.temperature_inc
+    @params.temperature_inc = 0.5
+    assert_in_delta 0.5, @params.temperature_inc
+  end
+
+  def test_entropy_thold
+    assert_in_delta 2.4, @params.entropy_thold
+    @params.entropy_thold = 3.0
+    assert_in_delta 3.0, @params.entropy_thold
+  end
+
+  def test_logprob_thold
+    assert_in_delta -1.0, @params.logprob_thold
+    @params.logprob_thold = -0.5
+    assert_in_delta -0.5, @params.logprob_thold
+  end
+end
--- a/bindings/ruby/tests/test_segment.rb
+++ b/bindings/ruby/tests/test_segment.rb
@ -0,0 +1,87 @@
+require "test/unit"
+require "whisper"
+
+class TestSegment < Test::Unit::TestCase
+  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+
+  class << self
+    attr_reader :whisper
+
+    def startup
+      @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
+      params = Whisper::Params.new
+      params.print_timestamps = false
+      jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
+      @whisper.transcribe(jfk, params)
+    end
+  end
+
+  def test_iteration
+    whisper.each_segment do |segment|
+      assert_instance_of Whisper::Segment, segment
+    end
+  end
+
+  def test_enumerator
+    enum = whisper.each_segment
+    assert_instance_of Enumerator, enum
+    enum.to_a.each_with_index do |segment, index|
+      assert_instance_of Whisper::Segment, segment
+      assert_kind_of Integer, index
+    end
+  end
+
+  def test_start_time
+    i = 0
+    whisper.each_segment do |segment|
+      assert_equal 0, segment.start_time if i == 0
+      i += 1
+    end
+  end
+
+  def test_end_time
+    i = 0
+    whisper.each_segment do |segment|
+      assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
+      i += 1
+    end
+  end
+
+  def test_on_new_segment
+    params = Whisper::Params.new
+    seg = nil
+    index = 0
+    params.on_new_segment do |segment|
+      assert_instance_of Whisper::Segment, segment
+      if index == 0
+        seg = segment
+        assert_equal 0, segment.start_time
+        assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
+      end
+      index += 1
+    end
+    whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
+    assert_equal 0, seg.start_time
+    assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
+  end
+
+  def test_on_new_segment_twice
+    params = Whisper::Params.new
+    seg = nil
+    params.on_new_segment do |segment|
+      seg = segment
+      return
+    end
+    params.on_new_segment do |segment|
+      assert_same seg, segment
+      return
+    end
+    whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
+  end
+
+  private
+
+  def whisper
+    self.class.whisper
+  end
+end
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@ -1,122 +1,13 @@
-TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
-EXTDIR = File.join(TOPDIR, 'ext')
-#$LIBDIR = File.join(TOPDIR, 'lib')
-#$:.unshift(LIBDIR)
-$:.unshift(EXTDIR)
-
 require 'whisper'
 require 'test/unit'

 class TestWhisper < Test::Unit::TestCase
+  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+
  def setup
    @params  = Whisper::Params.new
  end

-  def test_language
-    @params.language = "en"
-    assert_equal @params.language, "en"
-    @params.language = "auto"
-    assert_equal @params.language, "auto"
-  end
-
-  def test_offset
-    @params.offset = 10_000
-    assert_equal @params.offset, 10_000
-    @params.offset = 0
-    assert_equal @params.offset, 0
-  end
-
-  def test_duration
-    @params.duration = 60_000
-    assert_equal @params.duration, 60_000
-    @params.duration = 0
-    assert_equal @params.duration, 0
-  end
-
-  def test_max_text_tokens
-    @params.max_text_tokens = 300
-    assert_equal @params.max_text_tokens, 300
-    @params.max_text_tokens = 0
-    assert_equal @params.max_text_tokens, 0
-  end
-
-  def test_translate
-    @params.translate = true
-    assert @params.translate
-    @params.translate = false
-    assert !@params.translate
-  end
-
-  def test_no_context
-    @params.no_context = true
-    assert @params.no_context
-    @params.no_context = false
-    assert !@params.no_context
-  end
-
-  def test_single_segment
-    @params.single_segment = true
-    assert @params.single_segment
-    @params.single_segment = false
-    assert !@params.single_segment
-  end
-
-  def test_print_special
-    @params.print_special = true
-    assert @params.print_special
-    @params.print_special = false
-    assert !@params.print_special
-  end
-
-  def test_print_progress
-    @params.print_progress = true
-    assert @params.print_progress
-    @params.print_progress = false
-    assert !@params.print_progress
-  end
-
-  def test_print_realtime
-    @params.print_realtime = true
-    assert @params.print_realtime
-    @params.print_realtime = false
-    assert !@params.print_realtime
-  end
-
-  def test_print_timestamps
-    @params.print_timestamps = true
-    assert @params.print_timestamps
-    @params.print_timestamps = false
-    assert !@params.print_timestamps
-  end
-
-  def test_suppress_blank
-    @params.suppress_blank = true
-    assert @params.suppress_blank
-    @params.suppress_blank = false
-    assert !@params.suppress_blank
-  end
-
-  def test_suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = true
-    assert @params.suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = false
-    assert !@params.suppress_non_speech_tokens
-  end
-
-  def test_token_timestamps
-    @params.token_timestamps = true
-    assert @params.token_timestamps
-    @params.token_timestamps = false
-    assert !@params.token_timestamps
-  end
-
-  def test_split_on_word
-    @params.split_on_word = true
-    assert @params.split_on_word
-    @params.split_on_word = false
-    assert !@params.split_on_word
-  end
-
  def test_whisper
    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
    params  = Whisper::Params.new
@ -128,4 +19,81 @@ class TestWhisper < Test::Unit::TestCase
    }
  end

+  sub_test_case "After transcription" do
+    class << self
+      attr_reader :whisper
+
+      def startup
+        @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
+        params = Whisper::Params.new
+        params.print_timestamps = false
+        jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
+        @whisper.transcribe(jfk, params)
+      end
+    end
+
+    def whisper
+      self.class.whisper
+    end
+
+    def test_full_n_segments
+      assert_equal 1, whisper.full_n_segments
+    end
+
+    def test_full_lang_id
+      assert_equal 0, whisper.full_lang_id
+    end
+
+    def test_full_get_segment_t0
+      assert_equal 0, whisper.full_get_segment_t0(0)
+      assert_raise IndexError do
+        whisper.full_get_segment_t0(whisper.full_n_segments)
+      end
+      assert_raise IndexError do
+        whisper.full_get_segment_t0(-1)
+      end
+    end
+
+    def test_full_get_segment_t1
+      t1 = whisper.full_get_segment_t1(0)
+      assert_kind_of Integer, t1
+      assert t1 > 0
+      assert_raise IndexError do
+        whisper.full_get_segment_t1(whisper.full_n_segments)
+      end
+    end
+
+    def test_full_get_segment_speaker_turn_next
+      assert_false whisper.full_get_segment_speaker_turn_next(0)
+    end
+
+    def test_full_get_segment_text
+      assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
+    end
+  end
+
+  def test_lang_max_id
+    assert_kind_of Integer, Whisper.lang_max_id
+  end
+
+  def test_lang_id
+    assert_equal 0, Whisper.lang_id("en")
+    assert_raise ArgumentError do
+      Whisper.lang_id("non existing language")
+    end
+  end
+
+  def test_lang_str
+    assert_equal "en", Whisper.lang_str(0)
+    assert_raise IndexError do
+      Whisper.lang_str(Whisper.lang_max_id + 1)
+    end
+  end
+
+  def test_lang_str_full
+    assert_equal "english", Whisper.lang_str_full(0)
+    assert_raise IndexError do
+      Whisper.lang_str_full(Whisper.lang_max_id + 1)
+    end
+  end
 end
--- a/bindings/ruby/whispercpp.gemspec
+++ b/bindings/ruby/whispercpp.gemspec
@ -1,3 +1,5 @@
+require "yaml"
+
 Gem::Specification.new do |s|
  s.name    = "whispercpp"
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@ -7,10 +9,16 @@ Gem::Specification.new do |s|
  s.email   = 'todd.fisher@gmail.com'
  s.extra_rdoc_files = ['LICENSE', 'README.md']
  
-  s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
+  s.files = `git ls-files . -z`.split("\x0") +
+              YAML.load_file("extsources.yaml").collect {|file|
+                basename = File.basename(file)
+                if s.extra_rdoc_files.include?(basename)
+                  basename
+                else
+                  File.join("ext", basename)
+                end
+              }

-  #### Load-time details
-  s.require_paths = ['lib','ext']
  s.summary = %q{Ruby whisper.cpp bindings}
  s.test_files = ["tests/test_whisper.rb"]
  
--- a/cmake/BuildTypes.cmake
+++ b/cmake/BuildTypes.cmake
@ -1,54 +0,0 @@
-# Add new build types
-
-# ReleaseGG - Release with enabled asserts
-
-SET(CMAKE_CXX_FLAGS_RELEASEGG
-    "-O3"
-    CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_C_FLAGS_RELEASEGG
-    "-O3"
-    CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
-    ""
-    CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
-    FORCE )
-MARK_AS_ADVANCED(
-    CMAKE_CXX_FLAGS_RELEASEGG
-    CMAKE_C_FLAGS_RELEASEGG
-    CMAKE_EXE_LINKER_FLAGS_RELEASEGG
-    CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
-
-# RelWithDebInfoGG - RelWithDebInfo with enabled asserts
-
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
-    "-O2 -g"
-    CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
-    "-O2 -g"
-    CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
-    ""
-    CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
-    FORCE )
-MARK_AS_ADVANCED(
-    CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
-    CMAKE_C_FLAGS_RELWITHDEBINFOGG
-    CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
-    CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
-endif()
--- a/cmake/DefaultTargetOptions.cmake
+++ b/cmake/DefaultTargetOptions.cmake
@ -13,5 +13,5 @@ set_target_properties(${TARGET}
    PROPERTIES
        EXPORT_COMPILE_COMMANDS ON
        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-        INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
+        INSTALL_RPATH            "${CMAKE_INSTALL_PREFIX}/lib"
 )
--- a/cmake/FindFFmpeg.cmake
+++ b/cmake/FindFFmpeg.cmake
@ -36,7 +36,7 @@ include(FindPackageHandleStandardArgs)

 # The default components were taken from a survey over other FindFFMPEG.cmake files
 if (NOT FFmpeg_FIND_COMPONENTS)
-  set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE) 
+  set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
 endif()

 #
@ -84,7 +84,7 @@ macro(find_component _component _pkgconfig _library _header)

  # CMake's default is to search first for shared libraries and then for static libraries.
  # Todo later: add option to prefer static libs over dynamic:
-  find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a  
+  find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
      HINTS
      ${PC_${_component}_LIBDIR}
      ${PC_${_component}_LIBRARY_DIRS}
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@ -0,0 +1,58 @@
+set(BUILD_NUMBER 0)
+set(BUILD_COMMIT "unknown")
+set(BUILD_COMPILER "unknown")
+set(BUILD_TARGET "unknown")
+
+# Look for git
+find_package(Git)
+if(NOT Git_FOUND)
+    find_program(GIT_EXECUTABLE NAMES git git.exe)
+    if(GIT_EXECUTABLE)
+        set(Git_FOUND TRUE)
+        message(STATUS "Found Git: ${GIT_EXECUTABLE}")
+    else()
+        message(WARNING "Git not found. Build info will not be accurate.")
+    endif()
+endif()
+
+# Get the commit count and hash
+if(Git_FOUND)
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE HEAD
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
+    )
+    if (RES EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+    endif()
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE COUNT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
+    )
+    if (RES EQUAL 0)
+        set(BUILD_NUMBER ${COUNT})
+    endif()
+endif()
+
+if(MSVC)
+    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+    execute_process(
+        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    set(BUILD_COMPILER ${OUT})
+    execute_process(
+        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    set(BUILD_TARGET ${OUT})
+endif()
--- a/cmake/git-vars.cmake
+++ b/cmake/git-vars.cmake
--- a/cmake/whisper-config.cmake.in
+++ b/cmake/whisper-config.cmake.in
@ -0,0 +1,65 @@
+set(WHISPER_VERSION      @WHISPER_INSTALL_VERSION@)
+set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
+set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
+set(WHISPER_SHARED_LIB   @BUILD_SHARED_LIBS@)
+
+set(GGML_BLAS       @GGML_BLAS@)
+set(GGML_CUDA       @GGML_CUDA@)
+set(GGML_METAL      @GGML_METAL@)
+set(GGML_HIPBLAS    @GGML_HIPBLAS@)
+set(GGML_ACCELERATE @GGML_ACCELERATE@)
+
+@PACKAGE_INIT@
+
+set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@")
+set_and_check(WHISPER_LIB_DIR     "@PACKAGE_WHISPER_LIB_INSTALL_DIR@")
+set_and_check(WHISPER_BIN_DIR     "@PACKAGE_WHISPER_BIN_INSTALL_DIR@")
+
+# Ensure transient dependencies satisfied
+
+find_package(Threads REQUIRED)
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+endif()
+
+if (GGML_BLAS)
+    find_package(BLAS REQUIRED)
+endif()
+
+if (GGML_CUDA)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (GGML_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+endif()
+
+if (GGML_HIPBLAS)
+    find_package(hip REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
+endif()
+
+find_library(whisper_LIBRARY whisper
+    REQUIRED
+    HINTS ${WHISPER_LIB_DIR})
+
+set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@")
+set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@")
+
+add_library(whisper UNKNOWN IMPORTED)
+
+set_target_properties(whisper
+    PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}"
+        INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}"
+        INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}"
+        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+        IMPORTED_LOCATION "${whisper_LIBRARY}"
+        INTERFACE_COMPILE_FEATURES cxx_std_11
+        POSITION_INDEPENDENT_CODE ON )
+
+check_required_components(whisper)
--- a/cmake/whisper.pc.in
+++ b/cmake/whisper.pc.in
@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=${prefix}/include
+
+Name: whisper
+Description: Port of OpenAI's Whisper model in C/C++
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -lwhisper
+Cflags: -I${includedir}
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -11,7 +11,7 @@ if (WHISPER_SDL2)
    string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)

    message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
-    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
+    message(STATUS "SDL2_LIBRARIES    = ${SDL2_LIBRARIES}")
 endif()

 if (WHISPER_CLBLAST)
@ -22,10 +22,35 @@ endif()

 set(TARGET common)

+unset(COMMON_EXTRA_LIBS)
+
 if (WHISPER_FFMPEG)
+    # As of cmake 3.27, there is no official cmake support for FindFFmpeg.
+    # Consequnelty we added a FindFFmpeg.cmake script the cmake subfolder:
+    # whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE
+    # libswresample  performs highly optimized audio resampling, rematrixing and sample format conversion operations
+    # libavcodec provides a generic encoding/decoding framework and contains multiple decoders and encoders for audio, video and subtitle streams, and several bitstream filters.
+    # libavformat provides a generic framework for multiplexing and demultiplexing (muxing and demuxing) audio, video and subtitle streams.
+    find_package(FFmpeg REQUIRED)
+
+    if (NOT ${FFMPEG_FOUND})
+        message(FATAL_ERROR "Cannot find ffmpeg libs/headers")
+    endif()
+
+    message(STATUS "Found ffmpeg libs:       ${FFMPEG_LIBRARIES}")
+    message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
+    message(STATUS "ffmpeg definitions:      ${FFMPEG_DEFINITIONS}")
+    message(STATUS "Found avformat           ${AVFORMAT_VERSION}")
+
+    include_directories(${FFMPEG_INCLUDE_DIRS})
+    add_compile_definitions(WHISPER_FFMPEG)
+
+    list(APPEND COMMON_EXTRA_LIBS ${FFMPEG_LIBRARIES})
+
    set(COMMON_SOURCES_FFMPEG ffmpeg-transcode.cpp)
 endif()

+
 add_library(${TARGET} STATIC
    common.h
    common.cpp
@ -38,7 +63,7 @@ add_library(${TARGET} STATIC

 include(DefaultTargetOptions)

-target_link_libraries(${TARGET} PRIVATE whisper)
+target_link_libraries(${TARGET} PRIVATE whisper ${COMMON_EXTRA_LIBS})

 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
@ -55,8 +80,8 @@ if (WHISPER_SDL2)

    include(DefaultTargetOptions)

-    target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
+    target_include_directories(${TARGET} PUBLIC  ${SDL2_INCLUDE_DIRS})
+    target_link_libraries     (${TARGET} PRIVATE ${SDL2_LIBRARIES})

    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
@ -77,8 +102,8 @@ if (EMSCRIPTEN)
    set_target_properties(libstream PROPERTIES FOLDER "libs")
    add_subdirectory(command.wasm)
    set_target_properties(libcommand PROPERTIES FOLDER "libs")
-    add_subdirectory(talk.wasm)
-    set_target_properties(libtalk PROPERTIES FOLDER "libs")
+    #add_subdirectory(talk.wasm)
+    #set_target_properties(libtalk PROPERTIES FOLDER "libs")
    add_subdirectory(bench.wasm)
    set_target_properties(libbench PROPERTIES FOLDER "libs")
 elseif(CMAKE_JS_VERSION)
@ -102,13 +127,15 @@ endif (WHISPER_SDL2)
    add_subdirectory(quantize)
    set_target_properties(quantize PROPERTIES FOLDER "examples")
 if (WHISPER_SDL2)
-    add_subdirectory(talk)
-    set_target_properties(talk PROPERTIES FOLDER "examples")
+    # TODO: disabled until update
+    #       https://github.com/ggerganov/whisper.cpp/issues/1818
+    #add_subdirectory(talk)
+    #set_target_properties(talk PROPERTIES FOLDER "examples")
    add_subdirectory(talk-llama)
    set_target_properties(talk-llama PROPERTIES FOLDER "examples")
    add_subdirectory(lsp)
    set_target_properties(lsp PROPERTIES FOLDER "examples")
-    if (LLAMA_SYCL)
+    if (GGML_SYCL)
        add_subdirectory(sycl)
        set_target_properties(sycl PROPERTIES FOLDER "examples")
    endif()
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -18,7 +18,7 @@ struct whisper_params {

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

@ -58,7 +58,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

-int whisper_bench_full(const whisper_params & params) {
+static int whisper_bench_full(const whisper_params & params) {
    // whisper init

    struct whisper_context_params cparams = whisper_context_default_params();
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -59,7 +59,7 @@ struct whisper_params {

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

@ -130,7 +130,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

-std::string transcribe(
+static std::string transcribe(
                 whisper_context * ctx,
            const whisper_params & params,
        const std::vector<float> & pcmf32,
@ -216,7 +216,7 @@ std::string transcribe(
    return result;
 }

-std::vector<std::string> read_allowed_commands(const std::string & fname) {
+static std::vector<std::string> read_allowed_commands(const std::string & fname) {
    std::vector<std::string> allowed_commands;

    std::ifstream ifs(fname);
@ -238,7 +238,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
    return allowed_commands;
 }

-std::vector<std::string> get_words(const std::string &txt) {
+static std::vector<std::string> get_words(const std::string &txt) {
    std::vector<std::string> words;

    std::istringstream iss(txt);
@ -252,7 +252,7 @@ std::vector<std::string> get_words(const std::string &txt) {

 // command-list mode
 // guide the transcription to match the most likely command from a provided list
-int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
+static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: guided mode\n", __func__);

@ -463,7 +463,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const

 // always-prompt mode
 // transcribe the voice into text after valid prompt
-int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
+static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
    bool is_running = true;
    bool ask_prompt = true;

@ -543,7 +543,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi

 // general-purpose mode
 // freely transcribe the voice into text
-int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
+static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
    bool is_running  = true;
    bool have_prompt = false;
    bool ask_prompt  = true;
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -72,6 +72,9 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_MOSTLY_IQ4_XS:
        case GGML_FTYPE_MOSTLY_IQ1_M:
        case GGML_FTYPE_MOSTLY_BF16:
+        case GGML_FTYPE_MOSTLY_Q4_0_4_4:
+        case GGML_FTYPE_MOSTLY_Q4_0_4_8:
+        case GGML_FTYPE_MOSTLY_Q4_0_8_8:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
@ -209,6 +212,11 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_IQ4_XS:
                case GGML_TYPE_IQ1_M:
                case GGML_TYPE_BF16:
+                case GGML_TYPE_Q4_0_4_4:
+                case GGML_TYPE_Q4_0_4_8:
+                case GGML_TYPE_Q4_0_8_8:
+                case GGML_TYPE_TQ1_0:
+                case GGML_TYPE_TQ2_0:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -219,7 +219,7 @@ bool sdl_poll_events() {
            case SDL_QUIT:
                {
                    return false;
-                } break;
+                }
            default:
                break;
        }
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -30,7 +30,7 @@ extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t>
 #endif

 // Function to check if the next argument exists
-std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
+static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
    if (i + 1 < argc && argv[i + 1][0] != '-') {
        return argv[++i];
    } else {
@ -147,7 +147,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
-        default: return "To";
    }

    return "The";
@ -346,7 +345,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
    return tokens;
 }

-std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
+static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
    std::vector<gpt_vocab::id> output;
    std::stringstream ss(input);
    std::string token;
@ -358,7 +357,7 @@ std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, ch
    return output;
 }

-std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
+static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return std::map<std::string, std::vector<gpt_vocab::id>>();
--- a/examples/common.h
+++ b/examples/common.h
@ -9,6 +9,7 @@
 #include <thread>
 #include <ctime>
 #include <fstream>
+#include <sstream>

 #define COMMON_SAMPLE_RATE 16000

@ -21,7 +22,7 @@ struct gpt_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict    = 200;  // new tokens to predict
    int32_t n_parallel   = 1;    // number of parallel streams
-    int32_t n_batch      = 8;    // batch size for prompt processing
+    int32_t n_batch      = 32;   // batch size for prompt processing
    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
    int32_t n_gpu_layers = 0;    // number of layers to offlload to the GPU

@ -286,12 +287,43 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
 // Terminal utils
 //

+#define SQR(X)    ((X) * (X))
+#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40

-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
+/**
+ * Quantizes 24-bit RGB to xterm256 code range [16,256).
+ */
+static int rgb2xterm256(int r, int g, int b) {
+    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
+    int av, ir, ig, ib, il, qr, qg, qb, ql;
+    av = r * .299 + g * .587 + b * .114 + .5;
+    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
+    qr = cube[(ir = UNCUBE(r))];
+    qg = cube[(ig = UNCUBE(g))];
+    qb = cube[(ib = UNCUBE(b))];
+    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
+        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
+        return ir * 36 + ig * 6 + ib + 020;
+    return il + 0350;
+}
+
+static std::string set_xterm256_foreground(int r, int g, int b) {
+    int x = rgb2xterm256(r, g, b);
+    std::ostringstream oss;
+    oss << "\033[38;5;" << x << "m";
+    return oss.str();
+}
+
+// Lowest is red, middle is yellow, highest is green. Color scheme from
+// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
 const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
+    set_xterm256_foreground(220,   5,  12),
+    set_xterm256_foreground(232,  96,  28),
+    set_xterm256_foreground(241, 147,  45),
+    set_xterm256_foreground(246, 193,  65),
+    set_xterm256_foreground(247, 240,  86),
+    set_xterm256_foreground(144, 201, 135),
+    set_xterm256_foreground( 78, 178, 101),
 };

 //
--- a/examples/dr_wav.h
+++ b/examples/dr_wav.h
--- a/examples/ffmpeg-transcode.cpp
+++ b/examples/ffmpeg-transcode.cpp
@ -321,7 +321,7 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
        LOG("Couldn't map input file %s\n", ifname.c_str());
        return err;
    }
-    LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
+    LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size);
    struct audio_buffer inaudio_buf;
    inaudio_buf.ptr = ibuf;
    inaudio_buf.size = ibuf_size;
--- a/examples/grammar-parser.cpp
+++ b/examples/grammar-parser.cpp
@ -9,7 +9,7 @@
 namespace grammar_parser {
    // NOTE: assumes valid utf8 (but checks for overrun)
    // copied from whisper.cpp
-    std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
        uint8_t  first_byte = static_cast<uint8_t>(*src);
        uint8_t  highbits   = first_byte >> 4;
@ -24,19 +24,19 @@ namespace grammar_parser {
        return std::make_pair(value, pos);
    }

-    uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
        return result.first->second;
    }

-    uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
        return next_id;
    }

-    void add_rule(
+    static void add_rule(
            parse_state & state,
            uint32_t      rule_id,
            const std::vector<whisper_grammar_element> & rule) {
@ -46,11 +46,11 @@ namespace grammar_parser {
        state.rules[rule_id] = rule;
    }

-    bool is_word_char(char c) {
+    static bool is_word_char(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
    }

-    std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
        const char * pos   = src;
        const char * end   = src + size;
        uint32_t     value = 0;
@ -73,7 +73,7 @@ namespace grammar_parser {
        return std::make_pair(value, pos);
    }

-    const char * parse_space(const char * src, bool newline_ok) {
+    static const char * parse_space(const char * src, bool newline_ok) {
        const char * pos = src;
        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
@ -88,7 +88,7 @@ namespace grammar_parser {
        return pos;
    }

-    const char * parse_name(const char * src) {
+    static const char * parse_name(const char * src) {
        const char * pos = src;
        while (is_word_char(*pos)) {
            pos++;
@ -99,7 +99,7 @@ namespace grammar_parser {
        return pos;
    }

-    std::pair<uint32_t, const char *> parse_char(const char * src) {
+    static std::pair<uint32_t, const char *> parse_char(const char * src) {
        if (*src == '\\') {
            switch (src[1]) {
                case 'x': return parse_hex(src + 2, 2);
@ -122,14 +122,14 @@ namespace grammar_parser {
        throw std::runtime_error("unexpected end of input");
    }

-    const char * parse_alternates(
+    static const char * parse_alternates(
            parse_state       & state,
            const char        * src,
            const std::string & rule_name,
            uint32_t            rule_id,
            bool                is_nested);

-    const char * parse_sequence(
+    static const char * parse_sequence(
            parse_state                        & state,
            const char                         * src,
            const std::string                  & rule_name,
@ -229,7 +229,7 @@ namespace grammar_parser {
        return pos;
    }

-    const char * parse_alternates(
+    static const char * parse_alternates(
            parse_state       & state,
            const char        * src,
            const std::string & rule_name,
@ -247,7 +247,7 @@ namespace grammar_parser {
        return pos;
    }

-    const char * parse_rule(parse_state & state, const char * src) {
+    static const char * parse_rule(parse_state & state, const char * src) {
        const char * name_end = parse_name(src);
        const char * pos      = parse_space(name_end, false);
        size_t       name_len = name_end - src;
@ -285,7 +285,7 @@ namespace grammar_parser {
        }
    }

-    void print_grammar_char(FILE * file, uint32_t c) {
+    static void print_grammar_char(FILE * file, uint32_t c) {
        if (0x20 <= c && c <= 0x7f) {
            fprintf(file, "%c", static_cast<char>(c));
        } else {
@ -294,7 +294,7 @@ namespace grammar_parser {
        }
    }

-    bool is_char_element(whisper_grammar_element elem) {
+    static bool is_char_element(whisper_grammar_element elem) {
        switch (elem.type) {
            case WHISPER_GRETYPE_CHAR:           return true;
            case WHISPER_GRETYPE_CHAR_NOT:       return true;
@ -304,7 +304,7 @@ namespace grammar_parser {
        }
    }

-    void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
+    static void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
        for (auto elem : rule) {
            switch (elem.type) {
                case WHISPER_GRETYPE_END:            fprintf(file, "END");            break;
@ -334,7 +334,7 @@ namespace grammar_parser {
        fprintf(file, "\n");
    }

-    void print_rule(
+    static void print_rule(
            FILE     * file,
            uint32_t   rule_id,
            const std::vector<whisper_grammar_element> & rule,
@ -413,7 +413,7 @@ namespace grammar_parser {
        }
    }

-    std::vector<const whisper_grammar_element *> parse_state::c_rules() const{
+    std::vector<const whisper_grammar_element *> parse_state::c_rules() const {
        std::vector<const whisper_grammar_element *> ret;
        for (const auto & rule : rules) {
            ret.push_back(rule.data());
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" )

 # list available models
 function list_models {
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -53,7 +53,7 @@ struct commandset {

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

@ -109,7 +109,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -m FNAME,   --model FNAME    [%-7s] model path\n",                                  params.model.c_str());
    fprintf(stderr, "\n");
 }
-uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
+static uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
    using namespace std::chrono;
    uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
    uint64_t start_time = time_now;
@ -153,7 +153,7 @@ uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params &
    return time_now;
 }

-json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
+static json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
    std::vector<whisper_token> prompt_tokens;
    std::vector<float> pcmf32;
    uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
@ -199,7 +199,7 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js

 // command-list mode
 // guide the transcription to match the most likely command from a provided list
-json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
+static json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
    struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
    std::vector<float> pcmf32;
    uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
@ -285,7 +285,7 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
    }
 }

-json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
+static json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
    // TODO: check for token collision
    struct commandset cs;

@ -325,7 +325,8 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
    commandset_list.push_back(cs);
    return json{{"index",index}};
 }
-json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
+
+static json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
    // whisper_state has the pertinent offsets, but there also seem to be a large
    // number of scratch buffers that would prevent rewinding context in a manner similar to llama
    // I'll give this a another pass once everything else is implemented,
@ -335,7 +336,8 @@ json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*para
            {"message", "Seeking is not yet supported."}
    };
 }
-json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
+
+static json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
    // See: https://www.jsonrpc.org/specification
    json id = body.at("id");
    try {
@ -375,7 +377,7 @@ json parse_job(const json &body, struct whisper_context * ctx, audio_async &audi
    }
 }

-void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
+static void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
    std::deque<json> jobqueue;
    std::vector<struct commandset> commandset_list;
    while (true) {
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -17,7 +17,7 @@
 #endif

 // helper function to replace substrings
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; ; pos += replace.length()) {
        pos = s.find(search, pos);
        if (pos == std::string::npos) break;
@ -94,17 +94,17 @@ struct whisper_params {
    grammar_parser::parse_state grammar_parsed;
 };

-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-char* whisper_param_turn_lowercase(char* in){
+static char * whisper_param_turn_lowercase(char * in){
    int string_len = strlen(in);
-    for(int i = 0; i < string_len; i++){
+    for (int i = 0; i < string_len; i++){
        *(in+i) = tolower((unsigned char)*(in+i));
    }
    return in;
 }

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

@ -182,7 +182,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    return true;
 }

-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
+static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
    fprintf(stderr, "\n");
@ -248,7 +248,7 @@ struct whisper_print_user_data {
    int progress_prev;
 };

-std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
+static std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
    std::string speaker = "";
    const int64_t n_samples = pcmf32s[0].size();

@ -280,7 +280,8 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s

    return speaker;
 }
-void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
+
+static void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
    int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
    int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
    if (progress >= *progress_prev + progress_step) {
@ -289,7 +290,7 @@ void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct wh
    }
 }

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
+static void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -358,7 +359,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
    }
 }

-bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+static bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -385,7 +386,7 @@ bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+static bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -417,7 +418,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+static bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -446,7 +447,7 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-char *escape_double_quotes_and_backslashes(const char *str) {
+static char * escape_double_quotes_and_backslashes(const char * str) {
    if (str == NULL) {
        return NULL;
    }
@ -459,7 +460,7 @@ char *escape_double_quotes_and_backslashes(const char *str) {
        }
    }

-    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
+    char * escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
    if (escaped == NULL) {
        return NULL;
    }
@ -478,7 +479,7 @@ char *escape_double_quotes_and_backslashes(const char *str) {
 }

 // double quote should be escaped by another double quote. (rfc4180)
-char *escape_double_quotes_in_csv(const char *str) {
+static char * escape_double_quotes_in_csv(const char * str) {
    if (str == NULL) {
        return NULL;
    }
@ -509,7 +510,7 @@ char *escape_double_quotes_in_csv(const char *str) {
    return escaped;
 }

-bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+static bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -544,7 +545,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
+static bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
    std::ofstream fout(fname);
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

@ -563,7 +564,7 @@ bool output_score(struct whisper_context * ctx, const char * fname, const whispe
    return true;
 }

-bool output_json(
+static bool output_json(
             struct whisper_context * ctx,
                         const char * fname,
               const whisper_params & params,
@ -734,7 +735,7 @@ bool output_json(
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
+static bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
    std::ofstream fout(fname);

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -859,7 +860,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
    return true;
 }

-bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+static bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -900,7 +901,7 @@ bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_
 }


-void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
+static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }

 int main(int argc, char ** argv) {
    whisper_params params;
@ -996,6 +997,7 @@ int main(int argc, char ** argv) {
        if (params.dtw == "large.v1")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
        if (params.dtw == "large.v2")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
        if (params.dtw == "large.v3")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
+        if (params.dtw == "large.v3.turbo")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;

        if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
            fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
--- a/examples/python/whisper_processor.py
+++ b/examples/python/whisper_processor.py
@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
    if not os.path.exists(wav_file):
        raise FileNotFoundError(f"WAV file not found: {wav_file}")

-    full_command = f"./main -m {model} -f {wav_file} -np -nt"
+    full_command = f"./main -m {model} -f {wav_file} -nt"

    # Execute the command
    process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -36,7 +36,7 @@ struct whisper_filters {
 };

 // quantize a model
-bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+static bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
    gpt_vocab vocab;

    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -34,6 +34,7 @@ struct server_params
    std::string hostname = "127.0.0.1";
    std::string public_path = "examples/server/public";
    std::string request_path = "";
+    std::string inference_path = "/inference";

    int32_t port          = 8080;
    int32_t read_timeout  = 600;
@ -132,6 +133,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
    fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
    fprintf(stderr, "  --request-path PATH,           [%-7s] Request path for all requests\n", sparams.request_path.c_str());
+    fprintf(stderr, "  --inference-path PATH,         [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
    fprintf(stderr, "\n");
 }
@ -182,6 +184,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
        else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
        else if (                  arg == "--request-path")    { sparams.request_path = argv[++i]; }
+        else if (                  arg == "--inference-path")  { sparams.inference_path = argv[++i]; }
        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -216,7 +219,7 @@ void check_ffmpeg_availibility() {
 bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
    std::ostringstream cmd_stream;
    std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
    std::string cmd = cmd_stream.str();

    int status = std::system(cmd.c_str());
@ -644,10 +647,10 @@ int main(int argc, char ** argv) {
        return false;
    });

-    svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
+    svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
    });

-    svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
+    svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
        // acquire whisper model mutex lock
        std::lock_guard<std::mutex> lock(whisper_mutex);

@ -674,7 +677,8 @@ int main(int argc, char ** argv) {
        if (sparams.ffmpeg_converter) {
            // if file is not wav, convert to wav
            // write to temporary file
-            const std::string temp_filename = "whisper_server_temp_file.wav";
+            const std::string temp_filename_base = std::tmpnam(nullptr);
+            const std::string temp_filename = temp_filename_base + ".wav";
            std::ofstream temp_file{temp_filename, std::ios::binary};
            temp_file << audio_file.content;
            temp_file.close();
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -44,7 +44,7 @@ struct whisper_params {

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,7 +1,13 @@
 if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
+    add_executable(${TARGET} talk-llama.cpp
+        llama.cpp
+        llama-vocab.cpp
+        llama-grammar.cpp
+        llama-sampling.cpp
+        unicode.cpp
+        unicode-data.cpp)
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})

    if (WHISPER_CLBLAST)
--- a/examples/talk-llama/llama-grammar.cpp
+++ b/examples/talk-llama/llama-grammar.cpp
--- a/examples/talk-llama/llama-grammar.h
+++ b/examples/talk-llama/llama-grammar.h
@ -0,0 +1,144 @@
+#pragma once
+
+#include "llama-impl.h"
+
+#include <map>
+
+struct llama_vocab;
+
+// grammar element type
+enum llama_gretype {
+    // end of rule definition
+    LLAMA_GRETYPE_END            = 0,
+
+    // start of alternate definition for rule
+    LLAMA_GRETYPE_ALT            = 1,
+
+    // non-terminal element: reference to rule
+    LLAMA_GRETYPE_RULE_REF       = 2,
+
+    // terminal element: character (code point)
+    LLAMA_GRETYPE_CHAR           = 3,
+
+    // inverse char(s) ([^a], [^a-b] [^abc])
+    LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+    // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+    // be an inclusive range ([a-z])
+    LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+    // modifies a preceding LLAMA_GRETYPE_CHAR or
+    // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+    LLAMA_GRETYPE_CHAR_ALT       = 6,
+
+    // any character (.)
+    LLAMA_GRETYPE_CHAR_ANY       = 7,
+};
+
+typedef struct llama_grammar_element {
+    enum llama_gretype type;
+    uint32_t           value; // Unicode code point or rule ID
+} llama_grammar_element;
+
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
+using llama_grammar_rule  = std::vector<      llama_grammar_element>;
+using llama_grammar_stack = std::vector<const llama_grammar_element *>;
+
+using llama_grammar_rules      = std::vector<llama_grammar_rule>;
+using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
+using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
+
+const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
+      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+void llama_grammar_accept(
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stacks & stacks,
+                          uint32_t   chr,
+              llama_grammar_stacks & stacks_new);
+
+std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const llama_grammar_rules      & rules,
+        const llama_grammar_stack      & stack,
+        const llama_grammar_candidates & candidates);
+
+struct llama_grammar_parser {
+    std::map<std::string, uint32_t> symbol_ids;
+
+    llama_grammar_rules rules;
+
+    llama_grammar_stack c_rules() const;
+
+    uint32_t get_symbol_id(const char * src, size_t len);
+    uint32_t generate_symbol_id(const std::string & base_name);
+
+    void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
+
+    const char * parse_alternates(
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested);
+
+    const char * parse_sequence(
+            const char         * src,
+            const std::string  & rule_name,
+            llama_grammar_rule & rule,
+            bool               is_nested);
+
+    const char * parse_rule(const char * src);
+
+    bool parse(const char * src);
+    void print(FILE * file);
+};
+
+struct llama_grammar {
+    // note: allow null vocab for testing (not great)
+    const llama_vocab * vocab;
+
+    const llama_grammar_rules  rules;  // TODO: shared ptr
+          llama_grammar_stacks stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+//
+// internal API
+//
+
+// note: needed for tests (not great)
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index);
+
+struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
+
+void llama_grammar_free_impl(struct llama_grammar * grammar);
+
+struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
+
+// TODO: move the API below as member functions of llama_grammar
+void llama_grammar_apply_impl(
+        const struct llama_grammar & grammar,
+            llama_token_data_array * cur_p);
+
+void llama_grammar_accept_impl(
+              struct llama_grammar & grammar,
+                       llama_token   token);
--- a/examples/talk-llama/llama-impl.h
+++ b/examples/talk-llama/llama-impl.h
@ -0,0 +1,181 @@
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+//
+// logging
+//
+
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+void llama_log_internal        (ggml_log_level level, const char * format, ...);
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+
+#define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
+//
+// helpers
+//
+
+struct time_meas {
+    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+    ~time_meas() {
+        if (t_start_us >= 0) {
+            t_acc += ggml_time_us() - t_start_us;
+        }
+    }
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+);
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    //T & operator[](size_t i) {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    //const T & at(size_t i) const {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+    std::vector<T> data;
+};
--- a/examples/talk-llama/llama-sampling.cpp
+++ b/examples/talk-llama/llama-sampling.cpp
--- a/examples/talk-llama/llama-sampling.h
+++ b/examples/talk-llama/llama-sampling.h
@ -0,0 +1,29 @@
+#pragma once
+
+// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
+
+#include "llama-grammar.h"
+
+#include <unordered_map>
+
+struct llama_vocab;
+struct llama_grammar;
+
+// sampler chain
+
+struct llama_sampler_chain {
+    llama_sampler_chain_params params;
+
+    std::vector<struct llama_sampler *> samplers;
+
+    // timing
+
+    mutable int64_t t_sample_us;
+
+    mutable int32_t n_sample;
+};
+
+struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab & vocab,
+                      const char * grammar_str,
+                      const char * grammar_root);
--- a/examples/talk-llama/llama-vocab.cpp
+++ b/examples/talk-llama/llama-vocab.cpp
--- a/examples/talk-llama/llama-vocab.h
+++ b/examples/talk-llama/llama-vocab.h
@ -0,0 +1,146 @@
+#pragma once
+
+#include "llama-impl.h"
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <map>
+#include <set>
+
+struct llm_tokenizer;
+
+struct llama_vocab {
+    using id    = llama_token;
+    using token = std::string;
+    using tattr = llama_token_attr;
+
+    struct token_data {
+        token text;
+        float score;
+        tattr attr;
+    };
+
+    uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
+
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+
+    int max_token_len = 0; // used for optimizing longest token search
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id  = 1;
+    id special_eos_id  = 2;
+    id special_unk_id  = 0;
+    id special_sep_id  = -1;
+    id special_pad_id  = -1;
+    id special_cls_id  = -1;
+    id special_mask_id = -1;
+
+    id linefeed_id       = 13;
+    id special_prefix_id = -1;
+    id special_suffix_id = -1;
+    id special_middle_id = -1;
+    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
+
+    // set of all tokens that cause "end of generation"
+    std::set<id> special_eog_ids;
+
+    // tokenizer flags
+    bool tokenizer_add_space_prefix           = false;
+    bool tokenizer_add_bos                    = false;
+    bool tokenizer_add_eos                    = false;
+    bool tokenizer_ignore_merges              = false;
+    bool tokenizer_clean_spaces               = false;  // clean_up_tokenization_spaces
+    bool tokenizer_remove_extra_whitespaces   = false;
+    bool tokenizer_escape_whitespaces         = true;
+    bool tokenizer_treat_whitespace_as_suffix = false;
+
+    std::vector<char> precompiled_charsmap;
+
+    llm_tokenizer * tokenizer = nullptr;
+
+    llama_vocab() = default;
+    ~llama_vocab();
+
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
+};
+
+//
+// internal API
+//
+
+// TODO: rename to llama_tokenize_impl
+// TODO: This should probably be in llama.h
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special = false);
+
+// TODO: move the API below as member functions of llama_vocab
+llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
+
+const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
+
+float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
+llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
+llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
+
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
+
+llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
+llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
+
+int32_t llama_tokenize_impl(
+        const struct llama_vocab & vocab,
+                      const char * text,
+                         int32_t   text_len,
+                     llama_token * tokens,
+                         int32_t   n_tokens_max,
+                            bool   add_special,
+                            bool   parse_special);
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token,
+                            char * buf,
+                         int32_t   length,
+                         int32_t   lstrip,
+                            bool   special);
+
+int32_t llama_detokenize_impl(
+        const struct llama_vocab & vocab,
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special);
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -16,7 +16,7 @@
 #include <regex>
 #include <sstream>

-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    auto * model = llama_get_model(ctx);

    // upper limit for the number of tokens
@ -33,12 +33,12 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
    return result;
 }

-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -83,7 +83,7 @@ struct whisper_params {

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

@ -168,7 +168,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

-std::string transcribe(
+static std::string transcribe(
        whisper_context * ctx,
        const whisper_params & params,
        const std::vector<float> & pcmf32,
@ -235,7 +235,7 @@ std::string transcribe(
    return result;
 }

-std::vector<std::string> get_words(const std::string &txt) {
+static std::vector<std::string> get_words(const std::string &txt) {
    std::vector<std::string> words;

    std::istringstream iss(txt);
@ -314,7 +314,6 @@ int main(int argc, char ** argv) {

    // tune these to your liking
    lcparams.n_ctx      = 2048;
-    lcparams.seed       = 1;
    lcparams.n_threads  = params.n_threads;
    lcparams.flash_attn = params.flash_attn;

@ -402,6 +401,26 @@ int main(int argc, char ** argv) {

    llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);

+    // init sampler
+    const float top_k = 5;
+    const float top_p = 0.80f;
+    const float temp  = 0.30f;
+
+    const int seed = 0;
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    if (temp > 0.0f) {
+        llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
+        llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
+        llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
+        llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
+    } else {
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+    }
+
    // init session
    std::string path_session = params.path_session;
    std::vector<llama_token> session_tokens;
@ -417,7 +436,7 @@ int main(int argc, char ** argv) {

            session_tokens.resize(llama_n_ctx(ctx_llama));
            size_t n_token_count_out = 0;
-            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+            if (!llama_state_load_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
@ -700,54 +719,13 @@ int main(int argc, char ** argv) {

                    {
                        // out of user input, sample next token
-                        const float top_k          = 5;
-                        const float top_p          = 0.80f;
-                        const float temp           = 0.30f;
-                        const float repeat_penalty = 1.1764f;
-
-                        const int repeat_last_n    = 256;

                        if (!path_session.empty() && need_to_save_session) {
                            need_to_save_session = false;
-                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                            llama_state_save_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
                        }

-                        llama_token id = 0;
-
-                        {
-                            auto logits = llama_get_logits(ctx_llama);
-                            auto n_vocab = llama_n_vocab(model_llama);
-
-                            logits[llama_token_eos(model_llama)] = 0;
-
-                            std::vector<llama_token_data> candidates;
-                            candidates.reserve(n_vocab);
-                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-                            }
-
-                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                            // apply repeat penalty
-                            const float nl_logit = logits[llama_token_nl(model_llama)];
-
-                            llama_sample_repetition_penalties(ctx_llama, &candidates_p,
-                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                                    repeat_last_n, repeat_penalty, 0.0, 0.0f);
-
-                            logits[llama_token_nl(model_llama)] = nl_logit;
-
-                            if (temp <= 0) {
-                                // Greedy sampling
-                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
-                            } else {
-                                // Temperature sampling
-                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
-                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-                                llama_sample_temp (ctx_llama, &candidates_p, temp);
-                                id = llama_sample_token(ctx_llama, &candidates_p);
-                            }
-                        }
+                        const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);

                        if (id != llama_token_eos(model_llama)) {
                            // add it to the context
@ -797,8 +775,14 @@ int main(int argc, char ** argv) {
    whisper_print_timings(ctx_wsp);
    whisper_free(ctx_wsp);

-    llama_print_timings(ctx_llama);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx_llama);
+
+    llama_sampler_free(smpl);
+    llama_batch_free(batch);
    llama_free(ctx_llama);

+    llama_backend_free();
+
    return 0;
 }
--- a/examples/talk-llama/unicode-data.cpp
+++ b/examples/talk-llama/unicode-data.cpp
--- a/examples/talk-llama/unicode-data.h
+++ b/examples/talk-llama/unicode-data.h
@ -1,17 +1,20 @@
 #pragma once

 #include <cstdint>
-#include <map>
-#include <utility>
 #include <vector>
+#include <unordered_map>
+#include <unordered_set>

-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
-extern const std::multimap<uint32_t, uint32_t>          unicode_map_nfd;
-extern const std::map<char32_t, char32_t>               unicode_map_lowercase;
+struct range_nfd {
+    uint32_t first;
+    uint32_t last;
+    uint32_t nfd;
+};
+
+static const uint32_t MAX_CODEPOINTS = 0x110000;
+
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
--- a/examples/talk-llama/unicode.cpp
+++ b/examples/talk-llama/unicode.cpp
@ -1,6 +1,11 @@
-#include "unicode.h"
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
+#include "unicode.h"
 #include "unicode-data.h"

+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@ -15,6 +20,12 @@
 #include <locale>
 #include <codecvt>

+size_t unicode_len_utf8(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    std::string result;
    for (size_t i = 0; i < cps.size(); ++i) {
@ -23,7 +34,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    return result;
 }

-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];
@ -109,57 +120,49 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 //    return result;
 //}

-static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
-    std::unordered_map<uint32_t, int> cpt_types;
-    for (auto p : unicode_ranges_number) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_NUMBER;
+static std::vector<codepoint_flags> unicode_cpt_flags_array() {
+    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
+
+    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
+    for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
+        const auto range_ini = unicode_ranges_flags.begin()[i-1];  // codepoint_ini, flags
+        const auto range_end = unicode_ranges_flags.begin()[i];    // codepoint_end, flags
+        for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
+            cpt_flags[cpt] = range_ini.second;
        }
    }
-    for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_LETTER;
-        }
+
+    for (auto cpt : unicode_set_whitespace) {
+        cpt_flags[cpt].is_whitespace = true;
    }
-    for (auto p : unicode_ranges_separator) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
-        }
+
+    for (auto p : unicode_map_lowercase) {
+        cpt_flags[p.second].is_lowercase = true;
    }
-    for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
-        }
+
+    for (auto p : unicode_map_uppercase) {
+        cpt_flags[p.second].is_uppercase = true;
    }
-    for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
-        }
+
+    for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
+        cpt_flags[range.nfd].is_nfd = true;
    }
-    for  (auto p : unicode_ranges_symbol) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_SYMBOL;
-        }
-    }
-    for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_CONTROL;
-        }
-    }
-    return cpt_types;
+
+    return cpt_flags;
 }

 static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
    std::unordered_map<uint8_t, std::string> map;
-    for (int ch = u'!'; ch <= u'~'; ++ch) {
+    for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
        assert(0 <= ch && ch < 256);
        map[ch] = unicode_cpt_to_utf8(ch);
    }
-    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
+    for (int ch = 0xA1; ch <= 0xAC; ++ch) {  // u'¡' to u'¬'
        assert(0 <= ch && ch < 256);
        map[ch] = unicode_cpt_to_utf8(ch);
    }
-    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
+    for (int ch = 0xAE; ch <= 0xFF; ++ch) {  // u'®' to u'ÿ'
        assert(0 <= ch && ch < 256);
        map[ch] = unicode_cpt_to_utf8(ch);
    }
@ -175,15 +178,15 @@ static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {

 static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
    std::unordered_map<std::string, uint8_t> map;
-    for (int ch = u'!'; ch <= u'~'; ++ch) {
+    for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
        assert(0 <= ch && ch < 256);
        map[unicode_cpt_to_utf8(ch)] = ch;
    }
-    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
+    for (int ch = 0xA1; ch <= 0xAC; ++ch) {  // u'¡' to u'¬'
        assert(0 <= ch && ch < 256);
        map[unicode_cpt_to_utf8(ch)] = ch;
    }
-    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
+    for (int ch = 0xAE; ch <= 0xFF; ++ch) {  // u'®' to u'ÿ'
        assert(0 <= ch && ch < 256);
        map[unicode_cpt_to_utf8(ch)] = ch;
    }
@ -234,12 +237,13 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
        assert(offset_end <= cpts.size());
        start = offset_end;

-        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

-        auto _get_cpt_type = [&] (const size_t pos) -> int {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
        };

        size_t _prev_end = offset_ini;
@ -260,18 +264,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const char32_t cpt = _get_cpt(pos);
-            const int cpt_type = _get_cpt_type(pos);
+            const uint32_t cpt = _get_cpt(pos);
+            const auto flags = _get_flags(pos);

            // regex: 's|'t|'re|'ve|'m|'ll|'d
            if (cpt == '\'' && pos+1 < offset_end) {
-                char32_t cpt_next = _get_cpt(pos+1);
+                uint32_t cpt_next = _get_cpt(pos+1);
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
-                    char32_t cpt_next_next = _get_cpt(pos+2);
+                    uint32_t cpt_next_next = _get_cpt(pos+2);
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
@ -281,44 +285,42 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
                }
            }

-            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
-            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
            // regex: <space>?\p{L}+
-            if (cpt2_type == CODEPOINT_TYPE_LETTER) {
+            if (flags2.is_letter) {
                pos += (cpt == ' ');
-                while (cpt2_type == CODEPOINT_TYPE_LETTER) {
-                    cpt2_type = _get_cpt_type(++pos);
+                while (flags2.is_letter) {
+                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
                continue;
            }
            // regex: <space>?\p{N}+
-            if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+            if (flags2.is_number) {
                pos += (cpt == ' ');
-                while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
-                    cpt2_type = _get_cpt_type(++pos);
+                while (flags2.is_number) {
+                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
                continue;
            }
            // regex: <space>?[^\s\p{L}\p{N}]+
-            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                pos += (cpt == ' ');
-                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
-                    cpt2_type = _get_cpt_type(++pos);
-                    cpt2 = _get_cpt(pos);
+                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
                continue;
            }

            size_t num_whitespaces = 0;
-            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+            while (_get_flags(pos+num_whitespaces).is_whitespace) {
                num_whitespaces++;
            }

            // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
@ -353,12 +355,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
        assert(offset_end <= cpts.size());
        start = offset_end;

-        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

-        auto _get_cpt_type = [&] (const size_t pos) -> int {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
        };

        size_t _prev_end = offset_ini;
@ -379,18 +382,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const char32_t cpt = _get_cpt(pos);
-            const int cpt_type = _get_cpt_type(pos);
+            const uint32_t cpt = _get_cpt(pos);
+            const auto flags = _get_flags(pos);

            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
            if (cpt == '\'' && pos+1 < offset_end) {
-                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+                uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
-                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                    uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
@ -400,11 +403,11 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
                }
            }

-            // regex: [^\r\n\p{L}\p{N}]?\p{L}+  //####FIXME: the first \p{L} is correct?
-            if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
-                if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) {  // one or more letters
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+            if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
+                if (flags.is_letter || _get_flags(pos+1).is_letter) {  // one or more letters
                    pos++;
-                    while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
+                    while (_get_flags(pos).is_letter) {
                        pos++;
                    }
                    _add_token(pos);
@ -413,9 +416,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            }

            // regex: \p{N}{1,3}
-            if (cpt_type == CODEPOINT_TYPE_NUMBER) {
+            if (flags.is_number) {
                size_t ini = pos;
-                while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
+                while (_get_flags(pos).is_number) {
                    if (++pos - ini >= 3 ) {
                        _add_token(pos);
                        ini = pos;
@ -426,14 +429,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            }

            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
-            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
-            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
-            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
+            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
                pos += (cpt == ' ');
-                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
-                    cpt2_type = _get_cpt_type(++pos);
-                    cpt2 = _get_cpt(pos);
+                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                    flags2 = _get_flags(++pos);
                }
+                uint32_t cpt2 = _get_cpt(pos);
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
@ -443,8 +445,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &

            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
-            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
-                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+            while (_get_flags(pos+num_whitespaces).is_whitespace) {
+                uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
@ -459,7 +461,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            }

            // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
@ -589,21 +591,21 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
 }

 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
-    std::vector<uint32_t> result;
-    result.reserve(cpts.size());
+    auto comp = [] (const uint32_t cpt, const range_nfd & range) {
+        return cpt < range.first;
+    };
+    std::vector<uint32_t> result(cpts.size());
    for (size_t i = 0; i < cpts.size(); ++i) {
-        auto it = unicode_map_nfd.find(cpts[i]);
-        if (it == unicode_map_nfd.end()) {
-            result.push_back(cpts[i]);
-        } else {
-            result.push_back(it->second);
-        }
+        const uint32_t cpt = cpts[i];
+        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
+        result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
    }
    return result;
 }

 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
+    result.reserve(utf8.size());
    size_t offset = 0;
    while (offset < utf8.size()) {
        result.push_back(unicode_cpt_from_utf8(utf8, offset));
@ -611,31 +613,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
    return result;
 }

-int unicode_cpt_type(uint32_t cp) {
-    static std::unordered_map<uint32_t, int> cpt_types = unicode_cpt_type_map();
-    const auto it = cpt_types.find(cp);
-    return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
+codepoint_flags unicode_cpt_flags(const uint32_t cp) {
+    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+    static const auto cpt_flags = unicode_cpt_flags_array();
+    return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
 }

-int unicode_cpt_type(const std::string & utf8) {
-    if (utf8.length() == 0) {
-        return CODEPOINT_TYPE_UNIDENTIFIED;
+codepoint_flags unicode_cpt_flags(const std::string & utf8) {
+    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+    if (utf8.empty()) {
+        return undef;  // undefined
    }
    size_t offset = 0;
-    return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
-}
-
-bool unicode_cpt_is_whitespace(uint32_t cp) {
-    static const std::unordered_set<uint32_t> is_whitespace = [] {
-        std::unordered_set<uint32_t> is_whitespace;
-        for (auto p : unicode_ranges_whitespace) {
-            for (auto i = p.first; i <= p.second; ++i) {
-                is_whitespace.insert(i);
-            }
-        }
-        return is_whitespace;
-    }();
-    return (bool)is_whitespace.count(cp);
+    return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
 }

 std::string unicode_byte_to_utf8(uint8_t byte) {
@ -648,29 +638,36 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
    return map.at(utf8);
 }

-char32_t unicode_tolower(char32_t cp) {
-    auto it = unicode_map_lowercase.find(cp);
-    return it == unicode_map_lowercase.end() ? cp : it->second;
+uint32_t unicode_tolower(uint32_t cp) {
+    // binary search
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
+            return pair.first < value;
+        });
+    if (it != unicode_map_lowercase.end() && it->first == cp) {
+        return it->second;
+    }
+    return cp;  // Return the original code point if no lowercase mapping is found
 }

 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {
-        { "\\p{N}", CODEPOINT_TYPE_NUMBER },
-        { "\\p{L}", CODEPOINT_TYPE_LETTER },
-        { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
+        { "\\p{N}", codepoint_flags::NUMBER },
+        { "\\p{L}", codepoint_flags::LETTER },
+        { "\\p{P}", codepoint_flags::PUNCTUATION },
    };

    static const std::map<int, int> k_ucat_cpt = {
-        { CODEPOINT_TYPE_NUMBER,        0xD1 },
-        { CODEPOINT_TYPE_LETTER,        0xD2 },
-        { CODEPOINT_TYPE_PUNCTUATION,   0xD3 },
+        { codepoint_flags::NUMBER,        0xD1 },
+        { codepoint_flags::LETTER,        0xD2 },
+        { codepoint_flags::PUNCTUATION,   0xD3 },
    };

    static const std::map<int, std::string> k_ucat_map = {
-        { CODEPOINT_TYPE_NUMBER,        "\x30-\x39" }, // 0-9
-        { CODEPOINT_TYPE_LETTER,        "\x41-\x5A\x61-\x7A" }, // A-Za-z
-        { CODEPOINT_TYPE_PUNCTUATION,   "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { codepoint_flags::NUMBER,        "\x30-\x39" }, // 0-9
+        { codepoint_flags::LETTER,        "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { codepoint_flags::PUNCTUATION,   "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
    };

    // compute collapsed codepoints only if needed by at least one regex
@ -701,10 +698,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                continue;
            }

-            const int cpt_type = unicode_cpt_type(cpts[i]);
+            const auto flags = unicode_cpt_flags(cpts[i]);

-            if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) {
-                text_collapsed[i] = k_ucat_cpt.at(cpt_type);
+            if (flags.is_whitespace) {
+                //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
+                //text_collapsed[i] = (char) 0x85;  // <Next Line> as whitespace fallback
+                text_collapsed[i] = (char) 0x0B;    // <vertical tab> as whitespace fallback
+            } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
+                text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
            } else {
                text_collapsed[i] = (char) 0xD0; // fallback
            }
@ -788,9 +789,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
            } else {
                // no unicode category used, we can use std::wregex directly
-                const std::wstring wtext       = unicode_wstring_from_utf8(text);
                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);

+                // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
+                std::wstring wtext(cpts.begin(), cpts.end());
+                for (size_t i = 0; i < wtext.size(); ++i) {
+                    if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+                        wtext[i] = 0x0B;
+                    }
+                }
+
                //printf("text: %s\n", text.c_str());
                //printf("regex_expr: %s\n", regex_expr.c_str());
                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
--- a/examples/talk-llama/unicode.h
+++ b/examples/talk-llama/unicode.h
@ -4,28 +4,64 @@
 #include <string>
 #include <vector>

-#define CODEPOINT_TYPE_UNIDENTIFIED 0
-#define CODEPOINT_TYPE_NUMBER       1
-#define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_SEPARATOR    3
-#define CODEPOINT_TYPE_ACCENT_MARK  4
-#define CODEPOINT_TYPE_PUNCTUATION  5
-#define CODEPOINT_TYPE_SYMBOL       6
-#define CODEPOINT_TYPE_CONTROL      7
+// TODO: prefix all symbols with "llama_"
+
+struct codepoint_flags {
+    enum {
+        UNDEFINED       = 0x0001,
+        NUMBER          = 0x0002,  // regex: \p{N}
+        LETTER          = 0x0004,  // regex: \p{L}
+        SEPARATOR       = 0x0008,  // regex: \p{Z}
+        ACCENT_MARK     = 0x0010,  // regex: \p{M}
+        PUNCTUATION     = 0x0020,  // regex: \p{P}
+        SYMBOL          = 0x0040,  // regex: \p{S}
+        CONTROL         = 0x0080,  // regex: \p{C}
+        MASK_CATEGORIES = 0x00FF,
+    };
+
+    // codepoint type
+    uint16_t is_undefined   : 1;
+    uint16_t is_number      : 1;  // regex: \p{N}
+    uint16_t is_letter      : 1;  // regex: \p{L}
+    uint16_t is_separator   : 1;  // regex: \p{Z}
+    uint16_t is_accent_mark : 1;  // regex: \p{M}
+    uint16_t is_punctuation : 1;  // regex: \p{P}
+    uint16_t is_symbol      : 1;  // regex: \p{S}
+    uint16_t is_control     : 1;  // regex: \p{C}
+    // helper flags
+    uint16_t is_whitespace  : 1;  // regex: \s
+    uint16_t is_lowercase   : 1;
+    uint16_t is_uppercase   : 1;
+    uint16_t is_nfd         : 1;
+
+    // decode from uint16
+    inline codepoint_flags(const uint16_t flags=0) {
+        *reinterpret_cast<uint16_t*>(this) = flags;
+    }
+
+    inline uint16_t as_uint() const {
+        return *reinterpret_cast<const uint16_t*>(this);
+    }
+
+    inline uint16_t category_flag() const {
+        return this->as_uint() & MASK_CATEGORIES;
+    }
+};
+
+size_t unicode_len_utf8(char src);

 std::string unicode_cpt_to_utf8(uint32_t cp);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

-int unicode_cpt_type(uint32_t cp);
-int unicode_cpt_type(const std::string & utf8);
-
-bool unicode_cpt_is_whitespace(uint32_t cp);
+codepoint_flags unicode_cpt_flags(const uint32_t cp);
+codepoint_flags unicode_cpt_flags(const std::string & utf8);

 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);

-char32_t unicode_tolower(char32_t cp);
+uint32_t unicode_tolower(uint32_t cp);

 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -72,7 +72,7 @@ struct gpt2_model {
 };

 // load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
+static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
    printf("%s: loading model from '%s'\n", __func__, fname.c_str());

    auto fin = std::ifstream(fname, std::ios::binary);
@ -380,7 +380,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 //   - embd_w:    the predicted logits for the next token
 //
 // TODO: sync latest version from ggml repo
-bool gpt2_eval(
+static bool gpt2_eval(
        const gpt2_model & model,
        const int n_threads,
        const int n_past,
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -44,7 +44,7 @@ struct whisper_params {

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

@ -109,7 +109,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
 }

-std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
+static std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    prob = 0.0f;
--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@ -21,7 +21,7 @@ help()
    echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
    echo "options:"
    echo "-s       Step in seconds (default is $step)."
-    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
+    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' 'large-v3-turbo' (default is '$model')."
    echo "-t       Number of threads to use."
    echo "-h       Print this help page."
    echo
--- a/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
@ -5,15 +5,15 @@ project(whisper.cpp)
 set(CMAKE_CXX_STANDARD 11)
 set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)

-set(
-        SOURCE_FILES
-        ${WHISPER_LIB_DIR}/ggml.c
-        ${WHISPER_LIB_DIR}/ggml-alloc.c
-        ${WHISPER_LIB_DIR}/ggml-backend.c
-        ${WHISPER_LIB_DIR}/ggml-quants.c
-        ${WHISPER_LIB_DIR}/whisper.cpp
-        ${CMAKE_SOURCE_DIR}/jni.c
-)
+set(SOURCE_FILES
+    ${WHISPER_LIB_DIR}/ggml/src/ggml.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
+    ${WHISPER_LIB_DIR}/src/whisper.cpp
+    ${CMAKE_SOURCE_DIR}/jni.c
+    )

 find_library(LOG_LIB log)

@ -41,7 +41,6 @@ function(build_library target_name)
        #target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
        #target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
        #target_link_options(${target_name} PRIVATE -flto)
-
    endif ()
 endfunction()

@ -54,3 +53,7 @@ elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
 endif ()

 include_directories(${WHISPER_LIB_DIR})
+include_directories(${WHISPER_LIB_DIR}/src)
+include_directories(${WHISPER_LIB_DIR}/include)
+include_directories(${WHISPER_LIB_DIR}/ggml/include)
+include_directories(${WHISPER_LIB_DIR}/ggml/src)
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -12,47 +12,3 @@ To use:
 (PS: Do not move this android project folder individually to other folders, because this android project folder depends on the files of the whole project.)

 <img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
-
-## CLBlast
-
-> [!NOTE]
-> - OpenCL does not have the same level of support as CUDA or Metal.
-> - Turning on CLBlast may degrade OpenCL performance if your device isn't already tuned. See [tuning.md](https://github.com/CNugteren/CLBlast/blob/162783a414969464ce3aa5adf5c2554afa5ee93e/doc/tuning.md#already-tuned-for-devices) for a list of devices that are already tuned and what to do if yours is missing.
-
-Build CLBlast.
-
-```
-# In path/to/CLBlast (we assume OpenCL-Headers relative location)
-$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-    -DCMAKE_SYSTEM_NAME=Android \
-    -DCMAKE_SYSTEM_VERSION=33 \
-    -DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-    -DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-    -DCMAKE_ANDROID_STL_TYPE=c++_static \
-    -DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
-    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-# Build libclblast.so
-make -j4
-```
-
-Pull `libGLES_mali.so` to `libOpenCL.so`.
-
-```bash
-# In path/to/whisper.android
-mkdir lib/src/main/jniLibs/arm64-v8a
-adb pull /system/vendor/lib64/egl/libGLES_mali.so lib/src/main/jniLibs/arm64-v8a/libOpenCL.so
-```
-
-In gradle.properties, set `GGML_HOME` to the location of GGML, as well as
-required options for turning on CLBlast.
-
-```
-GGML_HOME=/path/to/ggml
-GGML_CLBLAST=ON
-CLBLAST_HOME=/path/to/CLBlast
-OPENCL_LIB=/path/to/libOpenCL.so
-OPENCL_ROOT=/path/to/OpenCL-Headers
-```
-
--- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
@ -10,7 +10,7 @@ option(GGML_HOME       "whisper: Path to external GGML source" OFF)

 set(
    SOURCE_FILES
-    ${WHISPER_LIB_DIR}/whisper.cpp
+    ${WHISPER_LIB_DIR}/src/whisper.cpp
    ${CMAKE_SOURCE_DIR}/jni.c
    )

@ -18,10 +18,11 @@ if (NOT GGML_HOME)
    set(
        SOURCE_FILES
        ${SOURCE_FILES}
-        ${WHISPER_LIB_DIR}/ggml.c
-        ${WHISPER_LIB_DIR}/ggml-alloc.c
-        ${WHISPER_LIB_DIR}/ggml-backend.c
-        ${WHISPER_LIB_DIR}/ggml-quants.c
+        ${WHISPER_LIB_DIR}/ggml/src/ggml.c
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
        )
 endif()

@ -75,3 +76,7 @@ endif ()
 build_library("whisper") # Default target

 include_directories(${WHISPER_LIB_DIR})
+include_directories(${WHISPER_LIB_DIR}/src)
+include_directories(${WHISPER_LIB_DIR}/include)
+include_directories(${WHISPER_LIB_DIR}/ggml/include)
+include_directories(${WHISPER_LIB_DIR}/ggml/src)
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -7,9 +7,9 @@
 	objects = {

 /* Begin PBXBuildFile section */
+		18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */; };
 		1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
 		1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
-		184447212AB21B43007D6BFE /* ggml-metal.metal in CopyFiles */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
 		18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
 		18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7D29052BDF00BD2A04 /* SceneDelegate.m */; };
 		18627C8129052BDF00BD2A04 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8029052BDF00BD2A04 /* ViewController.m */; };
@ -20,7 +20,9 @@
 		18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; };
 		18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; };
 		18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
-		18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
+		18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
+		18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
+		18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
 		18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
 		7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
 		7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@ -29,23 +31,26 @@
 /* End PBXBuildFile section */

 /* Begin PBXCopyFilesBuildPhase section */
-		184447202AB21B25007D6BFE /* CopyFiles */ = {
+		184447202AB21B25007D6BFE /* Copy Files */ = {
 			isa = PBXCopyFilesBuildPhase;
 			buildActionMask = 2147483647;
 			dstPath = "";
 			dstSubfolderSpec = 7;
 			files = (
-				184447212AB21B43007D6BFE /* ggml-metal.metal in CopyFiles */,
+				18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */,
 			);
+			name = "Copy Files";
 			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXCopyFilesBuildPhase section */

 /* Begin PBXFileReference section */
-		184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml-alloc.c"; sourceTree = "<group>"; };
-		184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml-alloc.h"; sourceTree = "<group>"; };
-		1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml-metal.m"; sourceTree = "<group>"; };
-		1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml-metal.metal"; sourceTree = "<group>"; };
+		18133C7E2C64E342005CEAAC /* ggml-aarch64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-aarch64.h"; path = "../../../ggml/src/ggml-aarch64.h"; sourceTree = "<group>"; };
+		18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
+		184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
+		184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
+		1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
+		1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = "<group>"; };
 		18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
 		18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -58,17 +63,19 @@
 		18627C8829052BE000BD2A04 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
 		18627C8A29052BE000BD2A04 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		18627C8B29052BE000BD2A04 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
-		18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../whisper.h; sourceTree = "<group>"; };
-		18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../whisper.cpp; sourceTree = "<group>"; };
-		18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
-		18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
+		18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../include/whisper.h; sourceTree = "<group>"; };
+		18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../src/whisper.cpp; sourceTree = "<group>"; };
+		18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml/src/ggml.c; sourceTree = "<group>"; };
+		18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml/include/ggml.h; sourceTree = "<group>"; };
 		18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
-		18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml-quants.h"; sourceTree = "<group>"; };
-		18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml-backend.h"; sourceTree = "<group>"; };
-		18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml-backend-impl.h"; sourceTree = "<group>"; };
-		18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml-backend.c"; sourceTree = "<group>"; };
-		18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml-impl.h"; sourceTree = "<group>"; };
-		18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml-quants.c"; sourceTree = "<group>"; };
+		18A275FE2C2A94DE001C8D37 /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../../ggml/include/ggml-metal.h"; sourceTree = "<group>"; };
+		18A275FF2C2A9563001C8D37 /* ggml-common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-common.h"; path = "../../../ggml/src/ggml-common.h"; sourceTree = "<group>"; };
+		18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
+		18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
+		18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
+		18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
+		18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
+		18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
 		7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
 		7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
 		7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
@ -108,8 +115,12 @@
 		18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 			isa = PBXGroup;
 			children = (
+				18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
+				18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
+				18A275FF2C2A9563001C8D37 /* ggml-common.h */,
+				18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
 				18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
-				18ABE1572AF556340044A204 /* ggml-backend.c */,
+				18ABE1572AF556340044A204 /* ggml-backend.cpp */,
 				18ABE1552AF556340044A204 /* ggml-backend.h */,
 				18ABE1582AF556340044A204 /* ggml-impl.h */,
 				18ABE1592AF556340044A204 /* ggml-quants.c */,
@ -151,7 +162,7 @@
 				7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */,
 			);
 			name = coreml;
-			path = ../../../coreml;
+			path = ../../../src/coreml;
 			sourceTree = "<group>";
 		};
 /* End PBXGroup section */
@ -164,7 +175,7 @@
 				18627C7229052BDF00BD2A04 /* Sources */,
 				18627C7329052BDF00BD2A04 /* Frameworks */,
 				18627C7429052BDF00BD2A04 /* Resources */,
-				184447202AB21B25007D6BFE /* CopyFiles */,
+				184447202AB21B25007D6BFE /* Copy Files */,
 			);
 			buildRules = (
 			);
@ -182,7 +193,7 @@
 			isa = PBXProject;
 			attributes = {
 				BuildIndependentTargetsInParallel = 1;
-				LastUpgradeCheck = 1400;
+				LastUpgradeCheck = 1540;
 				TargetAttributes = {
 					18627C7529052BDF00BD2A04 = {
 						CreatedOnToolsVersion = 14.0.1;
@ -212,6 +223,7 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */,
 				18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */,
 				7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */,
 				18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */,
@ -229,13 +241,14 @@
 			files = (
 				18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
 				18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
+				18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */,
 				7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
 				18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
 				18627C9629052C5800BD2A04 /* ggml.c in Sources */,
 				18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
 				7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
 				1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
-				18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
+				18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
 				18627C8C29052BE000BD2A04 /* main.m in Sources */,
 				18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
 				1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
@ -301,6 +314,7 @@
 				DEBUG_INFORMATION_FORMAT = dwarf;
 				ENABLE_STRICT_OBJC_MSGSEND = YES;
 				ENABLE_TESTABILITY = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
 				GCC_C_LANGUAGE_STANDARD = gnu11;
 				GCC_DYNAMIC_NO_PIC = NO;
 				GCC_NO_COMMON_BLOCKS = YES;
@ -359,6 +373,7 @@
 				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
 				ENABLE_NS_ASSERTIONS = NO;
 				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
 				GCC_C_LANGUAGE_STANDARD = gnu11;
 				GCC_NO_COMMON_BLOCKS = YES;
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
@ -400,6 +415,7 @@
 					"@executable_path/Frameworks",
 				);
 				MARKETING_VERSION = 1.0;
+				MTL_HEADER_SEARCH_PATHS = "";
 				PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
@ -428,6 +444,7 @@
 					"@executable_path/Frameworks",
 				);
 				MARKETING_VERSION = 1.0;
+				MTL_HEADER_SEARCH_PATHS = "";
 				PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
--- a/examples/whisper.swiftui/whisper.swiftui.demo/Models/WhisperState.swift
+++ b/examples/whisper.swiftui/whisper.swiftui.demo/Models/WhisperState.swift
@ -15,7 +15,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
    private var audioPlayer: AVAudioPlayer?
    
    private var modelUrl: URL? {
-        Bundle.main.url(forResource: "ggml-tiny.en", withExtension: "bin", subdirectory: "models")
+        Bundle.main.url(forResource: "ggml-base.en", withExtension: "bin", subdirectory: "models")
    }
    
    private var sampleUrl: URL? {
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@ -1,141 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
-        void (*GGML_CALL free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // (optional) event synchronization
-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
-    };
-
-    struct ggml_backend {
-        ggml_guid_t guid;
-
-        struct ggml_backend_i iface;
-        ggml_backend_context_t context;
-    };
-
-    struct ggml_backend_event {
-        ggml_backend_t backend;
-        void * context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/Show More
+++ b/Show More