whisper : try to fix the parallel whisper_state functionality (#1479)

* whisper : try to fix the parallel whisper_state functionality
* whisper : fix multi-state Metal
* whisper : free backend instances in whisper_state
whisper : fix UB with measure buffers
2025-06-24 17:15:19 +00:00 · 2023-11-12 14:52:38 +02:00 · 2023-11-11 18:35:23 +02:00 · 2023-11-11 17:39:30 +02:00 · 2023-11-11 17:06:21 +02:00 · 2023-11-11 13:04:58 +02:00
294 changed files with 18690 additions and 95745 deletions
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -1,40 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.3.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-WORKDIR /app
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV WHISPER_CUBLAS=1
-
-RUN apt-get update && \
-    apt-get install -y build-essential \
-    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-# Ref: https://stackoverflow.com/a/53464012
-ENV CUDA_MAIN_VERSION=12.3
-ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
-
-COPY .. .
-RUN make
-
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-ENV CUDA_MAIN_VERSION=12.3
-ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
-WORKDIR /app
-
-RUN apt-get update && \
-  apt-get install -y curl ffmpeg \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-COPY --from=build /app /app
-ENTRYPOINT [ "bash", "-c" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -1,19 +0,0 @@
-FROM ubuntu:22.04 AS build
-WORKDIR /app
-
-RUN apt-get update && \
-  apt-get install -y build-essential \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-COPY .. .
-RUN make
-
-FROM ubuntu:22.04 AS runtime
-WORKDIR /app
-
-RUN apt-get update && \
-  apt-get install -y curl ffmpeg \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-COPY --from=build /app /app
-ENTRYPOINT [ "bash", "-c" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -25,7 +25,6 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
            apt install -y build-essential libsdl2-dev
            make
@@ -87,7 +86,6 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
            apt install -y build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
@@ -115,9 +113,8 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
-            apt install -y clang build-essential cmake libsdl2-dev
+            apt install -y build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
            make
            ctest -L gh --output-on-failure'
@@ -143,171 +140,12 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
            apt install -y build-essential cmake
            cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
            make
            ctest -L gh --output-on-failure'

-  ubuntu-22-cmake-sycl:
-    runs-on: ubuntu-22.04
-
-    strategy:
-      fail-fast: false
-      matrix:
-        dwhisper_sycl: [ON]
-        dcmake_c_compiler: [icx]
-        dcmake_cxx_compiler: [icpx]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
-          cmake --build . --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl-fp16:
-    runs-on: ubuntu-22.04
-
-    strategy:
-      fail-fast: false
-      matrix:
-        dwhisper_sycl: [ON]
-        dcmake_c_compiler: [icx]
-        dcmake_cxx_compiler: [icpx]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
-          cmake --build . --config Release -j $(nproc)
-
-  windows-msys2:
-    runs-on: windows-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            base-devel
-            mingw-w64-${{matrix.env}}-toolchain
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-SDL2
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using make
-        shell: msys2 {0}
-        run: |
-            make -j $(nproc)
-
-      - name: Clean after building using make
-        shell: msys2 {0}
-        run: |
-            make clean
-
-      - name: Build using make w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            make WHISPER_OPENBLAS=1 -j $(nproc)
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DWHISPER_OPENBLAS=ON
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
  windows:
    runs-on: windows-latest

@@ -324,7 +162,7 @@ jobs:
            s2arc: x64
            jnaPath: win32-x86-64
          - sdl2: ON
-            s2ver: 2.28.5
+            s2ver: 2.26.0

    steps:
      - name: Clone
@@ -379,16 +217,13 @@ jobs:
        sdl2: [ON]
        include:
          - arch: Win32
-            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x86.zip
            s2arc: x86
-            clblast: OFF
          - arch: x64
-            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x64.zip
            s2arc: x64
-            clblast: ON
-            clver: 1.6.1
          - sdl2: ON
-            s2ver: 2.28.5
+            s2ver: 2.26.0

    steps:
      - name: Clone
@@ -413,18 +248,6 @@ jobs:
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

-      - name: Install OpenCL
-        if: matrix.clblast == 'ON'
-        run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
-
-      - name: Fetch CLBlast and set CLBlast_DIR
-        if: matrix.clblast == 'ON'
-        run: |
-          C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
-          7z x clblast.zip
-          7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
-          echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
-
      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
@@ -432,7 +255,6 @@ jobs:
          -DWHISPER_OPENBLAS=${{ matrix.blas }}
          -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
          -DWHISPER_SDL2=${{ matrix.sdl2 }}
-          -DWHISPER_CLBLAST=${{ matrix.clblast }}

      - name: Build
        run: |
@@ -447,15 +269,11 @@ jobs:
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

-      - name: Copy clblast.dll
-        if: matrix.clblast == 'ON'
-        run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
-
      - name: Upload binaries
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
-          name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
+          name: whisper-blas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-cublas:
@@ -467,12 +285,11 @@ jobs:
        arch: [x64]
        cublas: [ON]
        sdl2: [ON]
-        cuda-toolkit: [12.2.0, 11.8.0]
        include:
          - arch: x64
            s2arc: x64
          - sdl2: ON
-            s2ver: 2.28.5
+            s2ver: 2.26.0

    steps:
      - name: Clone
@@ -483,9 +300,7 @@ jobs:

      - name: Install CUDA Toolkit
        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.11
-        with:
-          cuda: '${{ matrix.cuda-toolkit }}'
+        uses: Jimver/cuda-toolkit@v0.2.10

      - name: Fetch SDL2 and set SDL2_DIR
        if: matrix.sdl2 == 'ON'
@@ -498,20 +313,12 @@ jobs:
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_CUBLAS=${{ matrix.cublas }}
-          -DWHISPER_SDL2=${{ matrix.sdl2 }}
+          -DWHISPER_CUBLAS=1

-      - name: Build ${{ matrix.cuda-toolkit }}
+      - name: Build
        run: |
          cd ./build
-          cmake --build . --config ${{ matrix.build }}
-
-      - name: Copy CUDA DLLs
-        run: >
-          Copy-Item -PassThru
-          -Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
-          -Include cudart64_*,cublas64_*,cublasLt64_*
-          -Destination build/bin/${{ matrix.build }}
+          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
@@ -521,7 +328,7 @@ jobs:
        if: matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
-          name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
+          name: whisper-cublas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  emscripten:
@@ -574,14 +381,6 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v3
-        with:
-          path: whisper
-
-      - name: Clone
-        uses: actions/checkout@v3
-        with:
-          repository: ggerganov/ggml
-          path: ggml

      - name: Install Java
        uses: actions/setup-java@v3
@@ -594,41 +393,9 @@ jobs:

      - name: Build
        run: |
-          cd whisper/examples/whisper.android
+          cd examples/whisper.android
          ./gradlew assembleRelease --no-daemon

-      - name: Build with external ggml
-        run: |
-          export PATH_TO_GGML=$PWD/ggml
-          cd whisper/examples/whisper.android
-          ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
-
-  android_java:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: set up JDK 11
-        uses: actions/setup-java@v3
-        with:
-          java-version: '11'
-          distribution: 'temurin'
-          cache: gradle
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v2
-        with:
-          api-level: 30
-          build-tools-version: 30.0.3
-
-      - name: Build
-        run: |
-          cd examples/whisper.android.java
-          chmod +x ./gradlew
-          ./gradlew assembleRelease
-
  java:
    needs: [ 'windows' ]
    runs-on: windows-latest
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -1,57 +0,0 @@
-name: Publish Docker image
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-
-jobs:
-  push_to_registry:
-    name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
-
-    runs-on: ubuntu-latest
-    env:
-      COMMIT_SHA: ${{ github.sha }}
-    strategy:
-      matrix:
-        config:
-          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
-          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
-
-    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v3
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build and push Docker image (versioned)
-        if: github.event_name == 'push'
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
-          file: ${{ matrix.config.dockerfile }}
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@
 .vs/
 .vscode/
 .DS_Store
-.vimspector.json

 build/
 build-coreml/
@@ -32,7 +31,6 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
-/server
 /lsp

 arm_neon.h
@@ -56,7 +54,3 @@ bindings/java/.idea/
 .idea/

 benchmark_results.csv
-cmake-build-debug/
-.cxx/
-.gradle/
-local.properties
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,301 +0,0 @@
-# date: Tue Apr  9 20:27:03 EEST 2024
-# this file is auto-generated by scripts/gen-authors.sh
-
-0/0 <zero@imaskeleton.me>
-0cc4m <picard12@live.de>
-0xsourcecode <134374803+0xsourcecode@users.noreply.github.com>
-AT <manyoso@users.noreply.github.com>
-Aarni Koskela <akx@iki.fi>
-Aaron Pham <29749331+aarnphm@users.noreply.github.com>
-Aaron Taylor <aaron@exphat.com>
-Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
-Abitofevrything <54505189+abitofevrything@users.noreply.github.com>
-AfryMask <AfryMask@163.com>
-Ahmad Bilal <ahmad.bilal@empglabs.com>
-AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
-Akash Mahajan <akash7190@gmail.com>
-Akash Mahajan <akashmjn@stanford.edu>
-Al Hoang <3811822-hoanga@users.noreply.gitlab.com>
-Alan <unknown>
-Aleksander Andrzejewski <18704749+aleksanderandrzejewski@users.noreply.github.com>
-Alex Azarov <alex@azarov.by>
-Alex Bacart <13940752+alex-bacart@users.noreply.github.com>
-Alex Evgrashin <aevgrashin@yandex.ru>
-Alexandr Graschenkov <alexandr.graschenkov91@gmail.com>
-Alexandru Mariuti <alex@mariuti.com>
-Alexey Kharlamov <alexey@kharlamov.biz>
-Alfredo Montesinos <alfredo.montesinos@g.austincc.edu>
-Ali Alameh <ali.alameh@isae.edu.lb>
-Ananta Bastola <anantarajbastola@gmail.com>
-Andreu Huguet <andreuhuguet@gmail.com>
-Andrew Huynh <a5thuynh@gmail.com>
-Andrew S <andrews54757@gmail.com>
-Andy Maloney <asmaloney@gmail.com>
-Anton Kostin <masguit42@users.noreply.github.com>
-Artyom Mezin <psycho.fading@gmail.com>
-Asad Memon <asad.lionpk@gmail.com>
-Ashraful Islam <ashraful.meche@gmail.com>
-AsukaMinato <asukaminato@nyan.eu.org>
-AustinMroz <austinmroz@utexas.edu>
-Avik Sengupta <avik@sengupta.net>
-Bader-eddine Ouaich <49657842+baderouaich@users.noreply.github.com>
-Baffin Lee <baffinlee@gmail.com>
-Ben Nortier <bjnortier@gmail.com>
-Benjamin Heiniger <benjamin.heiniger@bluewin.ch>
-Bo-Yi Wu <appleboy.tw@gmail.com>
-Boris Bliznioukov <blib@mail.com>
-Borislav Stanimirov <b.stanimirov@abv.bg>
-Brad Murray <59848399+bradmurray-dt@users.noreply.github.com>
-Brian Murray <brian@bmurray.ca>
-CRD716 <crd716@gmail.com>
-Canis Lupus <Canis-UK@users.noreply.github.com>
-Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
-ChangSeok Oh <shivamidow@users.noreply.github.com>
-Chaoqun <27287694+OpenWaygate@users.noreply.github.com>
-Chia-Hsiang Cheng <88014292+garychia@users.noreply.github.com>
-Chidi Williams <williamschidi1@gmail.com>
-Christian <12550267+iceychris@users.noreply.github.com>
-Clifford Heath <clifford.heath@gmail.com>
-Colin <github@whoisc.cc>
-DGdev91 <DGdev91@users.noreply.github.com>
-Damian Czaja <trojan295@protonmail.com>
-Daniel Bevenius <daniel.bevenius@gmail.com>
-David <dnhkng@gmail.com>
-David Thorpe <djt@mutablelogic.com>
-Davidson Francis <davidsondfgl@gmail.com>
-Dener Stassun <denerstassun@gmail.com>
-Didzis Gosko <didzis@users.noreply.github.com>
-Digipom <admin@digipom.com>
-Dimo <dimo@ieee.org>
-Dody Suria Wijaya <dodysw@gmail.com>
-Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
-Duncan McConnell <ddmcconnell4@gmail.com>
-Egor Egorov <me@egorfine.com>
-Elkana Bardugo <ttv200@gmail.com>
-Emmanuel Schmidbauer <eschmidbauer@gmail.com>
-Engininja2 <139037756+Engininja2@users.noreply.github.com>
-Eric Swanson <eswanson@alloscomp.com>
-Eric Tendian <erictendian@gmail.com>
-Erik Scholz <Green-Sky@users.noreply.github.com>
-Evan Jones <evan.q.jones@gmail.com>
-Evan Martin <evan.martin@gmail.com>
-Eve <139727413+netrunnereve@users.noreply.github.com>
-Evgeny Kuznetsov <evgeny@kuznetsov.md>
-F1L1P <78918286+F1L1Pv2@users.noreply.github.com>
-Fangjun Kuang <csukuangfj@gmail.com>
-Felix <stenbackfelix@gmail.com>
-Finn Voorhees <finnvoorhees@gmail.com>
-FlippFuzz <41221030+FlippFuzz@users.noreply.github.com>
-Gang Chen <goncha@gmail.com>
-Gavin Cai <gavin1818@hotmail.com>
-George Hindle <george@georgehindle.com>
-Georgi Gerganov <ggerganov@gmail.com>
-GitAritron <103900385+GitAritron@users.noreply.github.com>
-GiviMAD <GiviMAD@users.noreply.github.com>
-Gleicon Moraes <gleicon@gmail.com>
-Gregor Jasny <gjasny@googlemail.com>
-Guillaume Wenzek <gwenzek@users.noreply.github.com>
-HY. Kelvin Lee <34256578+hykelvinlee42@users.noreply.github.com>
-Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
-Hang <bebound@gmail.com>
-Herman Semenov <GermanAizek@yandex.ru>
-Hrishikesh Barman <geekodour@users.noreply.github.com>
-Ian Bicking <ian@ianbicking.org>
-Ian Bull <irbull@eclipsesource.com>
-Ikko Ashimine <eltociear@gmail.com>
-InconsolableCellist <23345188+InconsolableCellist@users.noreply.github.com>
-Ismatulla Mansurov <47342870+sapoepsilon@users.noreply.github.com>
-Ivan Gorin <ivangorin21@gmail.com>
-JJ <103335846+computerscienceiscool@users.noreply.github.com>
-Jack Mousseau <jmousseau@users.noreply.github.com>
-JacobLinCool <jacoblincool@gmail.com>
-Jakub Ráček <blizzcz@gmail.com>
-Jared Van Bortel <jared@nomic.ai>
-Jay Binks <jaybinks@gmail.com>
-Jhen-Jie Hong <developer@jhen.me>
-Jhen-Jie Hong <iainst0409@gmail.com>
-JidongZhang-THU <1119708529@qq.com>
-Jo Liss <joliss42@gmail.com>
-Johan <jr.raffin@gmail.com>
-Johannes Gäßler <johannesg@5d6.de>
-John Balis <phobossystems@gmail.com>
-Jonathan Soo <jcsoo@agora.com>
-Jonno <1160532+razodactyl@users.noreply.github.com>
-Joonas Pihlajamaa <joonas.pihlajamaa@iki.fi>
-Jose <34888496+Jerry-Master@users.noreply.github.com>
-Josh Bleecher Snyder <josharian@gmail.com>
-Judd <foldl@users.noreply.github.com>
-Jumper775 <78500318+jumpers775@users.noreply.github.com>
-Justine Tunney <jtunney@gmail.com>
-KP Kaiser <kirk@zothcorp.com>
-Kamilake <exjang0@gmail.com>
-Kartik Saranathan <278928+Kartiku@users.noreply.github.com>
-Kasumi <90275229+kasumi-1@users.noreply.github.com>
-Kawrakow <48489457+ikawrakow@users.noreply.github.com>
-Kevin Brothaler <admin@digipom.com>
-Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
-Kreijstal <rainb@tfwno.gf>
-Kylin <56434533+KyL0N@users.noreply.github.com>
-LBlue <153975653+lbluep@users.noreply.github.com>
-Larry Battle <larry.battle.tech@gmail.com>
-Laytan Laats <laytanlaats@hotmail.com>
-Leo Moll <leo.moll@yeasoft.com>
-Lexevolution <31176843+Lexevolution@users.noreply.github.com>
-LittleLoli <26589867+WhichWho@users.noreply.github.com>
-Lucas Zanek <57494138+LucasZNK@users.noreply.github.com>
-Luis Herrera <herrera-luis@users.noreply.github.com>
-Lukas Rist <glaslos@gmail.com>
-M. A. Ali <73258591+MightyStud@users.noreply.github.com>
-M. Eren Akbiyik <erenakbiyik@gmail.com>
-Maciek <maciek.mab122@gmail.com>
-Marcin Mielniczuk <marmistrz.dev@zoho.eu>
-Martin Warnaar <martinwarnaar@gmail.com>
-Matheus de Sousa <23645013+keyehzy@users.noreply.github.com>
-Mathijs de Bruin <mathijs@mathijsfietst.nl>
-Matija Pevec <mightymatth@users.noreply.github.com>
-Maximiliano Levi <8160966+maxilevi@users.noreply.github.com>
-Meng, Hengyu <hengyu.meng@intel.com>
-Michael Podvitskiy <podvitskiymichael@gmail.com>
-Michael Rienstra <mrienstra@gmail.com>
-Mikhail Grigorev <sleuthhound@gmail.com>
-Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
-Mohit Agarwal <mohit@sdf.org>
-Murilo Santana <mvrilo@gmail.com>
-Neil Chudleigh <nchudleigh@users.noreply.github.com>
-Neo Zhang Jianyu <jianyu.zhang@intel.com>
-Neuman Vong <neuman.vong@gmail.com>
-Nicholas Albion <nalbion@yahoo.com>
-Niels Mayer <Niels.Mayer@gmail.com>
-Okabintaro <103938900+Okabintaro@users.noreply.github.com>
-Oleg Sidorov <me@whitebox.io>
-Oleg Sidorov <oleg@sidorov.nl>
-Ondrej Kokes <ondrej.kokes@gmail.com>
-Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
-Paul Tsochantaris <ptsochantaris@icloud.com>
-Philipp Zabel <philipp.zabel@gmail.com>
-Philippe Normand <phil@base-art.net>
-Przemysław Pawełczyk <przemoc@gmail.com>
-Qianhe Chen <54462604+chenqianhe@users.noreply.github.com>
-Radosław Gryta <radek.gryta@gmail.com>
-Reinforce-II <fate@eastal.com>
-Reinis Muiznieks <muiznieks.reinis@gmail.com>
-RelatedTitle <r3latedtitle@gmail.com>
-RhinoDevel <RhinoDevel@users.noreply.github.com>
-Rich Jones <miserlou@gmail.com>
-Robin <robin.xw@hotmail.com>
-Roddur Dasgupta <roddurd@gmail.com>
-Roland Rabien <figbug@gmail.com>
-Rotem Dan <rotemdan@gmail.com>
-Ryan Hitchman <hitchmanr@gmail.com>
-Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com>
-RyanChang <ftes90015@gmail.com>
-Sam <49637763+Onlyartist9@users.noreply.github.com>
-Sam Pullara <spullara@gmail.com>
-Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
-Sergio López <slp@sinrega.org>
-Siddharth Ramakrishnan <srr2141@columbia.edu>
-Simon Moisselin <simon.moisstoll@gmail.com>
-Sindre Sorhus <sindresorhus@gmail.com>
-Slava Primenko <primenko.s@gmail.com>
-Syahmi Azhar <prsyahmi@gmail.com>
-Syed Jafri <syedjafri97@gmail.com>
-Sơn Phan Trung <phantrungson17@gmail.com>
-Taisei Mima <bhbstar.me@gmail.com>
-Takeshi Inoue <inoue.takeshi@gmail.com>
-Tamotsu Takahashi <ttakah+github@gmail.com>
-Taras Glek <taras@thegp.com>
-Tauseef Mohiuddin <35351464+tauseefmohammed2@users.noreply.github.com>
-Thijs Raymakers <thijs@raymakers.nl>
-Thomas Fitzsimmons <fitzsim@fitzsim.org>
-Tiago Fassoni <tiagofassoni@users.noreply.github.com>
-Tienshiao Ma <tienshiao@tienshiao.org>
-Timothy Cronin <40186632+4imothy@users.noreply.github.com>
-Tobrun <tobrun.van.nuland@gmail.com>
-Todd <taf2@users.noreply.github.com>
-Tong Li <31761981+litongjava@users.noreply.github.com>
-Topping1 <78745143+Topping1@users.noreply.github.com>
-Travis Cline <travis.cline@gmail.com>
-UEXTM.com <84163508+uextm@users.noreply.github.com>
-Vadim Peretokin <vperetokin@hey.com>
-Valentin Gosu <1454649+valenting@users.noreply.github.com>
-Vulcan <93451215+trholding@users.noreply.github.com>
-WhiteOlivierus <36532695+WhiteOlivierus@users.noreply.github.com>
-Xiang (Kevin) Li <kevinli020508@gmail.com>
-Xiao-Yong Jin <jinxiaoyong@gmail.com>
-XiaotaoChen <chenxiaotao1234@gmail.com>
-Yajing Tang <phillis@google.com>
-Yang Shen <aplshenyang@gmail.com>
-Yunès <jean.baptiste.yunes@free.fr>
-ZaBlazzingZephyrus <119159668+blazingzephyr@users.noreply.github.com>
-Zigfrid Zvezdin <ziggerZZ@gmail.com>
-Zollner <24618122+Zolliner@users.noreply.github.com>
-ai-at-home <149282006+ai-at-home@users.noreply.github.com>
-alonfaraj <alonfaraj@gmail.com>
-andypayne <apayne@gmail.com>
-ardfork <134447697+ardfork@users.noreply.github.com>
-automaticcat <daogiatuank54@gmail.com>
-be-next <jerome.ramette@gmail.com>
-bert hubert <bert@hubertnet.nl>
-bmwl <brian.marshall@tolko.com>
-bobqianic <129547291+bobqianic@users.noreply.github.com>
-bocytko <bocytko+github@gmail.com>
-boolemancer <48014766+boolemancer@users.noreply.github.com>
-boolemancer <boolemancer@gmail.com>
-bradmit <151883577+bradmit@users.noreply.github.com>
-brunofaustino <b.fa.amorim@gmail.com>
-bssrdf <merlintiger@hotmail.com>
-byte-6174 <88070277+byte-6174@users.noreply.github.com>
-cdosoftei <ciprian.dosoftei@gmail.com>
-clach04 <Chris.Clark@actian.com>
-compilade <113953597+compilade@users.noreply.github.com>
-conradg <conradjgodfrey@gmail.com>
-ddpasa <112642920+ddpasa@users.noreply.github.com>
-denersc <denerstassun@gmail.com>
-dscripka <dscripka@users.noreply.github.com>
-duthils <duthils@duthils.net>
-ecneladis <ecneladis@users.noreply.github.com>
-faker <nspyia2002@gmail.com>
-fitzsim <fitzsim@fitzsim.org>
-fraxy-v <65565042+fraxy-v@users.noreply.github.com>
-genevera (she/her) <genevera@users.noreply.github.com>
-geniusnut <geniusnut@gmail.com>
-greeshmay <greeshmay@gmail.com>
-hydai <z54981220@gmail.com>
-iamthad <thadeus.j.fleming@gmail.com>
-james wolf <contractorwolf@hotmail.com>
-joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
-jorismertz <35079666+jorismertz@users.noreply.github.com>
-junkfood <69683722+JunkFood02@users.noreply.github.com>
-jwijffels <jwijffels@bnosac.be>
-kamranjon <kamranjon@gmail.com>
-katsu560 <katsu560oo-@docomo.ne.jp>
-kennethge <57784063+kenneth-ge@users.noreply.github.com>
-keyehzy <msamuel@aluno.puc-rio.br>
-leejet <leejet714@gmail.com>
-litong <31761981+litongjava@users.noreply.github.com>
-lnyan <lkwq007@gmail.com>
-m.bell <m.bell@techsmith.com>
-mkiol <mkiol@users.noreply.github.com>
-novag <7754358+novag@users.noreply.github.com>
-pajowu <pajowu@pajowu.de>
-polarmoon <90010972+polarmoon@users.noreply.github.com>
-rlapray <lapray.romain@gmail.com>
-sandrohanea <40202887+sandrohanea@users.noreply.github.com>
-semiformal-net <84111142+semiformal-net@users.noreply.github.com>
-shibukazu <61775791+shibukazu@users.noreply.github.com>
-shikokuchuo <53399081+shikokuchuo@users.noreply.github.com>
-slaren <slarengh@gmail.com>
-slashlib <slashlib@users.noreply.github.com>
-snadampal <87143774+snadampal@users.noreply.github.com>
-st-gr <38470677+st-gr@users.noreply.github.com>
-texmex76 <40733439+texmex76@users.noreply.github.com>
-thefinaldegree <thefinaldegree@gmail.com>
-trixirt <trix@redhat.com>
-ulatekh <ulatekh@yahoo.com>
-undef <undefdev@gmail.com>
-venkr <venkateshrameshkumar+1@gmail.com>
-vicalloy <zbirder@gmail.com>
-xdrudis <xavierdrudis@yahoo.es>
-zhouwg <6889919+zhouwg@users.noreply.github.com>
-布客飞龙 <562826179@qq.com>
-Артём Земляк <azemlyak@smart-consulting.ru>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,6 @@
 cmake_minimum_required (VERSION 3.5)

-# Allow for the creation of solution folders.
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-
-project(whisper.cpp VERSION 1.5.5)
-set(SOVERSION 1)
+project(whisper.cpp VERSION 1.4.3)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -59,13 +55,10 @@ option(WHISPER_BUILD_EXAMPLES         "whisper: build examples" ${WHISPER_STANDA

 option(WHISPER_SDL2                   "whisper: support for libSDL2" OFF)

-option(WHISPER_NO_AVX                 "whisper: disable AVX"         OFF)
-option(WHISPER_NO_AVX2                "whisper: disable AVX2"        OFF)
-option(WHISPER_NO_AVX512              "whisper: disable AVX512"      ON)
-option(WHISPER_NO_AVX512_VBMI         "whisper: disable AVX512-VBMI" ON)
-option(WHISPER_NO_AVX512_VNNI         "whisper: disable AVX512-VNNI" ON)
-option(WHISPER_NO_FMA                 "whisper: disable FMA"         OFF)
-option(WHISPER_NO_F16C                "whisper: disable F16c"        OFF)
+option(WHISPER_NO_AVX                 "whisper: disable AVX"  OFF)
+option(WHISPER_NO_AVX2                "whisper: disable AVX2" OFF)
+option(WHISPER_NO_FMA                 "whisper: disable FMA"  OFF)
+option(WHISPER_NO_F16C                "whisper: disable F16c" OFF)

 option(WHISPER_OPENVINO               "whisper: support for OpenVINO" OFF)

@ -75,19 +68,13 @@ if (APPLE)
    option(WHISPER_METAL_NDEBUG          "whisper: disable Metal debugging"      OFF)
    option(WHISPER_COREML                "whisper: enable Core ML framework"     OFF)
    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback"    OFF)
-    option(WHISPER_METAL_EMBED_LIBRARY   "whisper: embed Metal library"          OFF)
 else()
-    option(WHISPER_BLAS                  "whisper: use BLAS libraries"                        OFF)
-    option(WHISPER_BLAS_VENDOR           "whisper: BLAS library vendor"                       Generic)
-    option(WHISPER_OPENBLAS              "whisper: prefer OpenBLAS"                           OFF)
-    option(WHISPER_OPENBLAS_INTERFACE64  "whisper: use OpenBLAS w/ 64-bit interface"          OFF)
-    option(WHISPER_CUDA                  "whisper: support for CUDA"                          OFF)
-    option(WHISPER_CUBLAS                "whisper: support for CUDA (deprecated)"             OFF)
-    option(WHISPER_HIPBLAS               "whisper: support for hipBLAS"                       OFF)
-    option(WHISPER_CLBLAST               "whisper: use CLBlast"                               OFF)
-    option(WHISPER_MKL                   "whisper: use Intel Math Kernel Library (MKL)"       OFF)
-    option(WHISPER_SYCL                  "whisper: use SYCL"                                  OFF)
-    option(WHISPER_SYCL_F16              "whisper: use 16 bit floats for sycl calculations"   OFF)
+    option(WHISPER_BLAS                  "whisper: use BLAS libraries"  OFF)
+    option(WHISPER_BLAS_VENDOR           "whisper: BLAS library vendor" Generic)
+    option(WHISPER_OPENBLAS              "whisper: prefer OpenBLAS"     OFF)
+    option(WHISPER_CUBLAS                "whisper: support for cuBLAS"  OFF)
+    option(WHISPER_HIPBLAS               "whisper: support for hipBLAS" OFF)
+    option(WHISPER_CLBLAST               "whisper: use CLBlast"         OFF)
 endif()

 option(WHISPER_PERF "whisper: enable perf timings" OFF)
@ -118,13 +105,6 @@ endif()

 find_package(Threads REQUIRED)

-#compile flag sycl
-if (WHISPER_SYCL)
-    set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
-endif()
-
 # on APPLE
 if (APPLE)
    # include Accelerate framework
@ -135,7 +115,7 @@ if (APPLE)
            message(STATUS "Accelerate framework found")

            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64)
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
        else()
            message(FATAL_ERROR "Accelerate framework not found")
        endif()
@ -165,42 +145,8 @@ if (APPLE)

        set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

-        # copy ggml-common.h and ggml-metal.metal to bin directory
-        configure_file(ggml-common.h    bin/ggml-common.h    COPYONLY)
+        # copy ggml-metal.metal to bin directory
        configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-        if (WHISPER_METAL_EMBED_LIBRARY)
-            enable_language(ASM)
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_EMBED_LIBRARY)
-
-            set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
-            set(COMMON_HEADER "${CMAKE_SOURCE_DIR}/ggml-common.h")
-
-            file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-            set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
-            set(EMBED_METALLIB_SOURCE "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-combined.metal")
-
-            add_custom_command(
-                OUTPUT ${EMBED_METALLIB_SOURCE}
-                COMMAND sed -e "/^#include \\\"ggml-common.h\\\"/r ${COMMON_HEADER}" -e "/^#include \\\"ggml-common.h\\\"/d" ${METALLIB_SOURCE} > ${EMBED_METALLIB_SOURCE}
-                DEPENDS ${METALLIB_SOURCE} ${COMMON_HEADER}
-                COMMENT "Generating combined Metal library for embedding"
-            )
-
-            add_custom_command(
-                OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".incbin \\\"${EMBED_METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-                DEPENDS ${EMBED_METALLIB_SOURCE}
-                COMMENT "Generate assembly for embedded Metal library"
-            )
-
-            set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-        endif()
    endif()

    if (WHISPER_COREML)
@ -224,82 +170,30 @@ endif()
 if (WHISPER_OPENBLAS)
    set(WHISPER_BLAS_VENDOR "OpenBLAS")
    set(WHISPER_BLAS ON)
-    # BLA_PKGCONFIG_BLAS is supported since CMake 3.25.
-    # FindBLAS.cmake pkg-config logic seems incomplete, because when
-    # BLA_SIZEOF_INTEGER is 8, then it should search for blas64 instead of blas.
-    # blas.pc/blas64.pc are not always provided, so let's be more specific
-    # and go with openblas.pc/openblas64.pc if WHISPER_OPENBLAS is on.
-    if (WHISPER_OPENBLAS_INTERFACE64)
-        set(WHISPER_BLAS_LIB "openblas64")
-    else ()
-        set(WHISPER_BLAS_LIB "openblas")
-    endif ()
-    set(BLA_PKGCONFIG_BLAS ${WHISPER_BLAS_LIB})
-    # OpenBLAS prebuilt libraries for Windows do not have "64" suffix in filename.
-    # (But .pc file has "64" suffix in filename for USE_64BITINT=1 Windows build.)
-    if (MSVC)
-        set(WHISPER_BLAS_LIB "openblas")
-    endif ()
 endif()

 if (WHISPER_BLAS)
-    if (NOT "$ENV{OPENBLAS_PATH}" STREQUAL "")
-        if (WHISPER_STATIC)
-            set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
-            set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+    if (WIN32)
+        if(DEFINED ENV{OPENBLAS_PATH})
+            set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
+            message(STATUS "Libraries ${BLAS_LIBRARIES}")
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
+            include_directories($ENV{OPENBLAS_PATH}/include)
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
        else ()
-            if (CMAKE_IMPORT_LIBRARY_SUFFIX)
-                set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_IMPORT_LIBRARY_PREFIX})
-                set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_IMPORT_LIBRARY_SUFFIX})
-            else ()
-                set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
-                set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
-            endif ()
+            message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
        endif ()
-        # OpenBLAS prebuilt libraries hardcode "lib" prefix in filename even on Windows
-        if (WHISPER_OPENBLAS)
-            set(WHISPER_BLAS_LIB_PREFIX "lib")
-        endif ()
-        message(STATUS "BLAS compatible library path provided")
-        set(BLAS_LIBRARIES "$ENV{OPENBLAS_PATH}/lib/${WHISPER_BLAS_LIB_PREFIX}${WHISPER_BLAS_LIB}${WHISPER_BLAS_LIB_SUFFIX}")
-        message(STATUS "Libraries ${BLAS_LIBRARIES}")
-        set(BLAS_INCLUDE_DIRS "$ENV{OPENBLAS_PATH}/include")
-        message(STATUS "Include dirs ${BLAS_INCLUDE_DIRS}")
-        if (NOT EXISTS "${BLAS_LIBRARIES}")
-            message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH misdefined.")
-        endif ()
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-        include_directories(${BLAS_INCLUDE_DIRS})
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
    else ()
-        if (WHISPER_STATIC)
-            # FindBLAS.cmake pkg-config logic seems incomplete, because when
-            # BLA_STATIC is on, then it should use pkg_check_modules_static
-            # instead of pkg_check_modules.
-            # Some manual variable overriding may be necessary if you don't
-            # achieve desired results.
-            set(BLA_STATIC 1)
-        endif ()
+        set(BLA_STATIC 1)
        set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
-        if (WHISPER_OPENBLAS_INTERFACE64)
-            set(BLA_SIZEOF_INTEGER 8)
-        else ()
-            set(BLA_SIZEOF_INTEGER 4)
-        endif()
+        set(BLA_SIZEOF_INTEGER 8)
        set(BLA_PREFER_PKGCONFIG 1)
        find_package(BLAS)

        if(BLAS_FOUND)
            message(STATUS "BLAS compatible library found")
            message(STATUS "Libraries ${BLAS_LIBRARIES}")
-            if (NOT DEFINED BLAS_INCLUDE_DIRS)
-                if (PKGC_BLAS_FOUND)
-                    set(BLAS_INCLUDE_DIRS "${PKGC_BLAS_INCLUDE_DIRS}")
-                else ()
-                    find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas)
-                endif()
-            endif()
-            message(STATUS "Include dirs ${BLAS_INCLUDE_DIRS}")
+            find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
            include_directories(${BLAS_INCLUDE_DIRS})
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
@ -309,19 +203,7 @@ if (WHISPER_BLAS)
    endif ()
 endif ()

-if (WHISPER_MKL)
-    find_package(MKL CONFIG REQUIRED PATHS $ENV{MKLROOT})
-    message(STATUS "Imported oneMKL targets: ${MKL_IMPORTED_TARGETS}")
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_BLAS_USE_MKL)
-endif()
-
 if (WHISPER_CUBLAS)
-    message(WARNING "WHISPER_CUBLAS is deprecated and will be removed in the future.\nUse WHISPER_CUDA instead")
-    set(WHISPER_CUDA ON)
-endif()
-
-if (WHISPER_CUDA)
    cmake_minimum_required(VERSION 3.17)

    find_package(CUDAToolkit)
@ -331,24 +213,16 @@ if (WHISPER_CUDA)

        enable_language(CUDA)

-        file(GLOB   GGML_SOURCES_CUDA "ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_CUDA  ggml-cuda.h)
-        list(APPEND GGML_SOURCES_CUDA  ggml-cuda.cu)
+        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

-        add_compile_definitions(GGML_USE_CUDA)
+        add_compile_definitions(GGML_USE_CUBLAS)

        if (WHISPER_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
        else()
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
    else()
        message(FATAL_ERROR "cuBLAS not found")
    endif()
@ -370,7 +244,7 @@ if (WHISPER_HIPBLAS)

    if (${hipblas_FOUND} AND ${hip_FOUND})
        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
        set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
@ -404,30 +278,6 @@ if( WHISPER_OPENVINO )
    find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 endif()

-if (WHISPER_SYCL)
-    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
-    endif()
-    #todo: AOT
-
-    find_package(IntelSYCL REQUIRED)
-    if (WHISPER_SYCL_F16)
-        add_compile_definitions(GGML_SYCL_F16)
-    endif()
-    add_compile_definitions(GGML_USE_SYCL)
-
-    add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})
-
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
-
-    set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
-
-    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-endif()
 # compiler flags

 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@ -459,8 +309,7 @@ if (WHISPER_ALL_WARNINGS)
 endif()

 if (NOT MSVC)
-    # TODO: temporary disabled until we figure out ggml-metal.m
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
 endif()

@ -476,35 +325,21 @@ else()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
-        if(NOT WHISPER_NO_AVX512)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX512")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512")
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (NOT WHISPER_NO_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (NOT WHISPER_NO_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-        elseif(NOT WHISPER_NO_AVX2)
+        if(NOT WHISPER_NO_AVX2)
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-        elseif(NOT WHISPER_NO_AVX)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
+        else()
+            if(NOT WHISPER_NO_AVX)
+                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
+                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
+            endif()
        endif()
    else()
        if (EMSCRIPTEN)
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
        else()
            if(NOT WHISPER_NO_AVX)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
@ -512,15 +347,6 @@ else()
            if(NOT WHISPER_NO_AVX2)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
            endif()
-            if(NOT WHISPER_NO_AVX512)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw")
-            endif()
-            if(NOT WHISPER_NO_AVX512_VBMI)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vbmi")
-            endif()
-            if(NOT WHISPER_NO_AVX512_VNNI)
-                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vnni")
-            endif()
            if(NOT WHISPER_NO_FMA)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
            endif()
@ -607,7 +433,6 @@ if (WHISPER_COREML)
    set_target_properties(${TARGET} PROPERTIES
        COMPILE_FLAGS "-fobjc-arc"
        )
-    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()

 if (WHISPER_OPENVINO)
@ -626,7 +451,6 @@ if (WHISPER_OPENVINO)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO)

    target_link_libraries(${TARGET} PRIVATE openvino::runtime)
-    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()

 #
@ -647,18 +471,10 @@ add_library(${TARGET}
    ${GGML_SOURCES_METAL}
    ${GGML_SOURCES_CUDA}
    ${GGML_SOURCES_OPENCL}
-    ${GGML_SOURCES_SYCL}
-    ${GGML_HEADERS_SYCL}
    whisper.h
    whisper.cpp
    )

-# Set the version numbers
-set_target_properties(whisper PROPERTIES
-    VERSION ${PROJECT_VERSION}
-    SOVERSION ${SOVERSION}
-)
-
 include(DefaultTargetOptions)

 target_include_directories(${TARGET} PUBLIC
@ -673,10 +489,6 @@ if (WHISPER_OPENVINO)
    target_link_libraries(${TARGET} PRIVATE whisper.openvino)
 endif()

-if (WHISPER_MKL)
-    target_link_libraries(${TARGET} PUBLIC MKL::MKL)
-endif()
-
 if (MSVC)
    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

@ -686,7 +498,6 @@ else()
 endif()

 if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_link_libraries(${TARGET} PUBLIC
        ${CMAKE_DL_LIBS}
        )
@ -710,13 +521,7 @@ endif()

 if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    # Only configure ggml CUDA architectures if not globally set
-    if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
-        # Not overridden by user, so set defaults
-        set(GGML_CUDA_ARCHITECTURES 52 61 70)
-    endif()
-    message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
-    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
 endif()

@ -728,8 +533,7 @@ target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )

-set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "ggml.h;whisper.h")
-set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
+set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")

 include(GNUInstallDirs)

--- a/2
+++ b/2
@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023-2024 The ggml authors
+Copyright (c) 2023 Georgi Gerganov

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/138
+++ b/138
@ -1,4 +1,4 @@
-default: main bench quantize server
+default: main bench quantize

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@ -42,12 +42,6 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

-ifdef MACOSX_DEPLOYMENT_TARGET
-	CFLAGS   += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
-	CXXFLAGS += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
-	LDFLAGS  += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
-endif
-
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
@ -105,16 +99,6 @@ ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$
 	CXXFLAGS += -pthread
 endif

-# detect Windows
-ifneq ($(findstring _NT,$(UNAME_S)),)
-	_WIN32 := 1
-endif
-
-# Windows Sockets 2 (Winsock) for network-capable apps
-ifeq ($(_WIN32),1)
-	LWINSOCK2 := -lws2_32
-endif
-
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
@ -123,7 +107,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 		CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
 	else ifeq ($(UNAME_S),Linux)
 		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifneq (,$(filter MINGW32_NT% MINGW64_NT% MSYS_NT%,$(UNAME_S)))
+	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
 		CPUINFO_CMD := cat /proc/cpuinfo
 	else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
 		CPUINFO_CMD := grep Features /var/run/dmesg.boot
@ -144,24 +128,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 			CXXFLAGS += -mavx2
 		endif

-		AVX512F_M := $(shell $(CPUINFO_CMD) | grep -iw 'AVX512F')
-		ifneq (,$(AVX512F_M))
-			CFLAGS   += -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw
-			CXXFLAGS += -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw
-		endif
-
-		AVX512VNNI_M := $(shell $(CPUINFO_CMD) | grep -iwE 'AVX512_VNNI|AVX512VNNI')
-		ifneq (,$(AVX512VNNI_M))
-			CFLAGS   += -mavx512vnni
-			CXXFLAGS += -mavx512vnni
-		endif
-
-		AVX512VBMI_M := $(shell $(CPUINFO_CMD) | grep -iw 'AVX512VBMI')
-		ifneq (,$(AVX512VBMI_M))
-			CFLAGS   += -mavx512vbmi
-			CXXFLAGS += -mavx512vbmi
-		endif
-
 		FMA_M := $(shell $(CPUINFO_CMD) | grep -iw 'FMA')
 		ifneq (,$(FMA_M))
 			CFLAGS   += -mfma
@ -203,8 +169,6 @@ ifndef WHISPER_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework
 	ifeq ($(UNAME_S),Darwin)
 		CFLAGS  += -DGGML_USE_ACCELERATE
-		CFLAGS  += -DACCELERATE_NEW_LAPACK
-		CFLAGS  += -DACCELERATE_LAPACK_ILP64
 		LDFLAGS += -framework Accelerate
 	endif
 endif
@ -228,54 +192,26 @@ ifndef WHISPER_NO_METAL
 	endif
 endif

-ifneq ($(filter-out 0,$(WHISPER_OPENBLAS)),) # OpenBLAS
-	WHISPER_OPENBLAS_INTERFACE64 ?= 0 # use 32-bit interface by default
-	ifneq ($(filter-out 0,$(WHISPER_OPENBLAS_INTERFACE64)),)
-		WHISPER_BLAS_LIB := openblas64
-	else
-		WHISPER_BLAS_LIB := openblas
-	endif
-	ifneq ($(OPENBLAS_PATH),)
-		WHISPER_BLAS_CFLAGS  := -I$(OPENBLAS_PATH)/include
-		WHISPER_BLAS_LDFLAGS := -L$(OPENBLAS_PATH)/lib -l$(WHISPER_BLAS_LIB)
-	else
-		WHISPER_BLAS_LIB_PC_EXISTS := $(shell pkg-config --exists $(WHISPER_BLAS_LIB) && echo 1)
-		ifneq ($(filter-out 0,$(WHISPER_BLAS_LIB_PC_EXISTS)),)
-			WHISPER_BLAS_CFLAGS  := $(shell pkg-config --cflags $(WHISPER_BLAS_LIB))
-			WHISPER_BLAS_LDFLAGS := $(shell pkg-config --libs   $(WHISPER_BLAS_LIB))
-		else
-			WHISPER_BLAS_CFLAGS  := -I/usr/include/openblas
-			WHISPER_BLAS_LDFLAGS := -l$(WHISPER_BLAS_LIB)
-		endif
-	endif
-	CFLAGS  += $(WHISPER_BLAS_CFLAGS) -DGGML_USE_OPENBLAS
-	LDFLAGS += $(WHISPER_BLAS_LDFLAGS)
+ifdef WHISPER_OPENBLAS
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
+	LDFLAGS += -lopenblas
 endif

 ifdef WHISPER_CUBLAS
-# WHISPER_CUBLAS is deprecated and will be removed in the future
-	WHISPER_CUDA := 1
-endif
-
-ifdef WHISPER_CUDA
 	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
-		CUDA_ARCH_FLAG ?= native
+		CUDA_ARCH_FLAG=native
 	else
-		CUDA_ARCH_FLAG ?= all
+		CUDA_ARCH_FLAG=all
 	endif

-	CFLAGS      += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	CXXFLAGS    += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
-	WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)

-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
-
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif

@ -283,18 +219,14 @@ ifdef WHISPER_HIPBLAS
 	ROCM_PATH   ?= /opt/rocm
 	HIPCC       ?= $(ROCM_PATH)/bin/hipcc
 	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
-	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
 	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	LDFLAGS     += -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	WHISPER_OBJ += ggml-cuda.o
-	WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))

-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif

@ -359,13 +291,6 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-ifdef WHISPER_CUBLAS
-$(info !!!!)
-$(info WHISPER_CUBLAS is deprecated and will be removed in the future. Use WHISPER_CUDA instead.)
-$(info !!!!)
-$(info )
-endif
-
 #
 # Build library
 #
@ -404,26 +329,6 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@

 WHISPER_OBJ += ggml-metal.o
-
-ifdef WHISPER_METAL_EMBED_LIBRARY
-CFLAGS += -DGGML_METAL_EMBED_LIBRARY
-
-ggml-metal-embed.o: ggml-metal.metal ggml-common.h
-	@echo "Embedding Metal library"
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	$(eval TEMP_METALLIB=$(shell mktemp))
-	@sed "/^#include \"ggml-common.h\"/{r ggml-common.h"$$'\n'"d;}" ggml-metal.metal > $(TEMP_METALLIB)
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$(TEMP_METALLIB)\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f $(TEMP_ASSEMBLY) $(TEMP_METALLIB)
-
-WHISPER_OBJ += ggml-metal-embed.o
-endif
 endif

 libwhisper.a: $(WHISPER_OBJ)
@ -433,7 +338,7 @@ libwhisper.so: $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so

 #
 # Examples
@ -441,7 +346,7 @@ clean:

 CC_SDL=`sdl2-config --cflags --libs`

-SRC_COMMON     = examples/common.cpp examples/common-ggml.cpp examples/grammar-parser.cpp
+SRC_COMMON     = examples/common.cpp examples/common-ggml.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

 main: examples/main/main.cpp $(SRC_COMMON) $(WHISPER_OBJ)
@ -454,9 +359,6 @@ bench: examples/bench/bench.cpp $(WHISPER_OBJ)
 quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)

-server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS) $(LWINSOCK2)
-
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

@ -469,8 +371,8 @@ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)

-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)

 #
 # Audio samples
@ -516,9 +418,9 @@ samples:
 .PHONY: medium
 .PHONY: large-v1
 .PHONY: large-v2
-.PHONY: large-v3
+.PHONY: large

-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
--- a/Package.swift
+++ b/Package.swift
@ -2,14 +2,33 @@

 import PackageDescription

+#if arch(arm) || arch(arm64)
+let platforms: [SupportedPlatform]? = [
+    .macOS(.v12),
+    .iOS(.v14),
+    .watchOS(.v4),
+    .tvOS(.v14)
+]
+let exclude: [String] = []
+let resources: [Resource] = [
+    .process("ggml-metal.metal")
+]
+let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSettings: [CSetting] = [
+    .unsafeFlags(["-fno-objc-arc"]),
+    .define("GGML_USE_METAL")
+]
+#else
+let platforms: [SupportedPlatform]? = nil
+let exclude: [String] = ["ggml-metal.metal"]
+let resources: [Resource] = []
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
    name: "whisper",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
+    platforms: platforms,
    products: [
        .library(name: "whisper", targets: ["whisper"]),
    ],
@ -17,7 +36,7 @@ let package = Package(
        .target(
            name: "whisper",
            path: ".",
-            exclude: [
+            exclude: exclude + [
               "bindings",
               "cmake",
               "coreml",
@ -36,22 +55,19 @@ let package = Package(
                "whisper.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m"
-            ],
-            resources: [.process("ggml-metal.metal")],
+                "ggml-quants.c"
+            ] + additionalSources,
+            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL")
+                .define("GGML_USE_ACCELERATE")
                 // NOTE: NEW_LAPACK requires iOS version 16.4+
                 // We should consider adding this in the future when we drop support for iOS 14
                 // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
+            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.5.5](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.5) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -16,10 +16,12 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
 - [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
+- Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Support for CPU-only inference
- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
+- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
+- [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
 - [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)

@ -33,10 +35,11 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

-The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
-The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
+The entire implementation of the model is contained in 2 source files:
+
+- Tensor operations: [ggml.h](ggml.h) / [ggml.c](ggml.c)
+- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)

 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
 As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
@ -61,22 +64,22 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder

-The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
+The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
+intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
+the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.

 ## Quick start

-First clone the repository:
+First clone the repository.

-```bash
-git clone https://github.com/ggerganov/whisper.cpp.git
-```
-
-Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
+Then, download one of the Whisper models converted in [ggml format](models). For example:

 ```bash
 bash ./models/download-ggml-model.sh base.en
 ```

+If you wish to convert the Whisper models to ggml format yourself, instructions are in [models/README.md](models/README.md).
+
 Now build the [main](examples/main) example and transcribe an audio file like this:

 ```bash
@ -91,7 +94,7 @@ make

 For a quick demo, simply run `make base.en`:

-```text
+```java
 $ make base.en

 cc  -I.              -O3 -std=c11   -pthread -DGGML_USE_ACCELERATE   -c ggml.c -o ggml.o
@ -111,8 +114,8 @@ options:
  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
  -ml N,     --max-len N         [0      ] maximum segment length in characters
  -sow,      --split-on-word     [false  ] split on word rather than on token
-  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -bs N,     --beam-size N       [5      ] beam size for beam search
+  -bo N,     --best-of N         [2      ] number of best candidates to keep
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
@ -129,7 +132,6 @@ options:
  -fp,       --font-path         [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -oj,       --output-json       [false  ] output result in a JSON file
-  -ojf,      --output-json-full  [false  ] include more information in the JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
@ -141,8 +143,7 @@ options:
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME        [       ] input WAV file path
  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
-  -ls,       --log-score         [false  ] log best decoder scores of tokens
-  -ng,       --no-gpu            [false  ] disable GPU
+  -ls,       --log-score         [false  ] log best decoder scores of token


 bash ./models/download-ggml-model.sh base.en
@ -207,7 +208,7 @@ For detailed usage instructions, run: `./main -h`
 Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
 For example, you can use `ffmpeg` like this:

-```bash
+```java
 ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
 ```

@ -234,18 +235,18 @@ make medium.en
 make medium
 make large-v1
 make large-v2
-make large-v3
+make large
 ```

 ## Memory usage

-| Model  | Disk    | Mem     |
-| ------ | ------- | ------- |
-| tiny   | 75 MiB  | ~273 MB |
-| base   | 142 MiB | ~388 MB |
-| small  | 466 MiB | ~852 MB |
-| medium | 1.5 GiB | ~2.1 GB |
-| large  | 2.9 GiB | ~3.9 GB |
+| Model  | Disk   | Mem     | SHA                                        |
+| ---    | ---    | ---     | ---                                        |
+| tiny   |  75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
+| base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
+| small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
+| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
+| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

 ## Quantization

@ -278,8 +279,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

  - To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
  - Python 3.10 is recommended.
-  - MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
-  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
+  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html)  for this step:
    - To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
    - To activate the environment, use: `conda activate py310-whisper`

@ -305,8 +305,8 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

 - Run the examples as usual. For example:

-  ```text
-  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
+  ```bash
+  ./main -m models/ggml-base.en.bin -f samples/jfk.wav

  ...

@ -334,23 +334,21 @@ This can result in significant speedup in encoder performance. Here are the inst
 - First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.

  Windows:
-
-  ```powershell
+  ```
  cd models
  python -m venv openvino_conv_env
  openvino_conv_env\Scripts\activate
  python -m pip install --upgrade pip
-  pip install -r requirements-openvino.txt
+  pip install -r openvino-conversion-requirements.txt
  ```

  Linux and macOS:
-
-  ```bash
+  ```
  cd models
  python3 -m venv openvino_conv_env
  source openvino_conv_env/bin/activate
  python -m pip install --upgrade pip
-  pip install -r requirements-openvino.txt
+  pip install -r openvino-conversion-requirements.txt
  ```

 - Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
@ -359,7 +357,7 @@ This can result in significant speedup in encoder performance. Here are the inst
  python convert-whisper-to-openvino.py --model base.en
  ```

-  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as `ggml` models, as that
+  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
  is the default location that the OpenVINO extension will search at runtime.

 - Build `whisper.cpp` with OpenVINO support:
@ -369,28 +367,24 @@ This can result in significant speedup in encoder performance. Here are the inst
  After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:

  Linux:
-
  ```bash
  source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
  ```

  Windows (cmd):
-
-  ```powershell
+  ```
  C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
  ```

  And then build the project using cmake:
-
  ```bash
  cmake -B build -DWHISPER_OPENVINO=1
  cmake --build build -j --config Release
  ```

 - Run the examples as usual. For example:
-
-  ```text
-  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
+  ```bash
+  ./main -m models/ggml-base.en.bin -f samples/jfk.wav

  ...

@ -406,19 +400,19 @@ This can result in significant speedup in encoder performance. Here are the inst

  The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
  cached for the next run.
-
+  
 For more information about the Core ML implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).

-## NVIDIA GPU support
+## NVIDIA GPU support via cuBLAS

-With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
+With NVIDIA cards the Encoder processing can to a large extent be offloaded to the GPU through cuBLAS.
 First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads

-Now build `whisper.cpp` with CUDA support:
+Now build `whisper.cpp` with cuBLAS support:

 ```
 make clean
-WHISPER_CUDA=1 make -j
+WHISPER_CUBLAS=1 make -j
 ```

 ## OpenCL GPU support via CLBlast
@ -441,6 +435,7 @@ cmake -B build -DWHISPER_CLBLAST=ON
 cmake --build build -j --config Release
 ```

+
 Run all the examples as usual.

 ## BLAS CPU support via OpenBLAS
@ -455,53 +450,6 @@ make clean
 WHISPER_OPENBLAS=1 make -j
 ```

-## BLAS CPU support via Intel MKL
-
-Encoder processing can be accelerated on the CPU via the BLAS compatible interface of Intel's Math Kernel Library.
-First, make sure you have installed Intel's MKL runtime and development packages: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html
-
-Now build `whisper.cpp` with Intel MKL BLAS support:
-
-```
-source /opt/intel/oneapi/setvars.sh
-mkdir build
-cd build
-cmake -DWHISPER_MKL=ON ..
-WHISPER_MKL=1 make -j
-```
-
-## Docker
-
-### Prerequisites
-
- Docker must be installed and running on your system.
- Create a folder to store big models & intermediate files (ex. /whisper/models)
-
-### Images
-
-We have two Docker images available for this project:
-
-1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
-
-### Usage
-
-```shell
-# download model and persist it in a local folder
-docker run -it --rm \
-  -v path/to/models:/models \
-  whisper.cpp:main "./models/download-ggml-model.sh base /models"
-# transcribe an audio file
-docker run -it --rm \
-  -v path/to/models:/models \
-  -v path/to/audios:/audios \
-  whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
-# transcribe an audio file in samples folder
-docker run -it --rm \
-  -v path/to/models:/models \
-  whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
-```
-
 ## Limitations

 - Inference only
@ -514,7 +462,7 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
 <details>
  <summary>Expand to see the result</summary>

-```text
+```java
 $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8

 whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
@ -586,7 +534,6 @@ whisper_print_timings:   encode time = 18665.10 ms /     9 runs ( 2073.90 ms per
 whisper_print_timings:   decode time = 13090.93 ms /   549 runs (   23.85 ms per run)
 whisper_print_timings:    total time = 32733.52 ms
 ```
-
 </details>

 ## Real-time audio input example
@ -595,7 +542,7 @@ This is a naive example of performing real-time inference on audio from your mic
 The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

-```bash
+```java
 make stream
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```
@ -607,7 +554,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
 to highlight words with high or low confidence:

-```bash
+```java
 ./main -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
 ```

@ -617,8 +564,8 @@ to highlight words with high or low confidence:

 For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:

-```text
-$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
+```java
+./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
@ -641,8 +588,8 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr

 The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:

-```text
-$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
+```java
+./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
@ -712,7 +659,7 @@ This requires to have `ffmpeg` installed.

 Here are a few *"typical"* examples:

-```bash
+```java
 ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
 source ./samples/jfk.wav.wts
 ffplay ./samples/jfk.wav.mp4
@ -722,7 +669,7 @@ https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b

 ---

-```bash
+```java
 ./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
 source ./samples/mm0.wav.wts
 ffplay ./samples/mm0.wav.mp4
@ -732,7 +679,7 @@ https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-9

 ---

-```bash
+```java
 ./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
 source ./samples/gb0.wav.wts
 ffplay ./samples/gb0.wav.mp4
@ -744,10 +691,10 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 ## Video comparison of different models

-Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
+Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:

-```bash
-./scripts/bench-wts.sh samples/jfk.wav
+```java
+./extra/bench-wts.sh samples/jfk.wav
 ffplay ./samples/jfk.wav.all.mp4
 ```

@ -768,14 +715,15 @@ Additionally a script to run whisper.cpp with different models and audio files i
 You can run it with the following command, by default it will run against any standard model in the models folder.

 ```bash
-python3 scripts/bench.py -f samples/jfk.wav -t 2,4,8 -p 1,2
+python3 extra/bench.py -f samples/jfk.wav -t 2,4,8 -p 1,2
 ```

 It is written in python with the intention of being easy to modify and extend for your benchmarking use case.

 It outputs a csv file with the results of the benchmarking.

-## `ggml` format
+
+## ggml format

 The original models are converted to a custom binary format. This allows to pack everything needed into a single file:

@ -790,50 +738,49 @@ or manually from here:
 - https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

-For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
+For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
+in [models](models).

 ## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)

- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
+- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
+- [X] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [x] Java:
+- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
+- [X] Java:
  - [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
+- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
+- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
  - [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
+- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
+- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
+- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
+- [X] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)

 ## Examples

 There are various examples of using the library for different projects in the [examples](examples) folder.
 Some of the examples are even ported to run in the browser using WebAssembly. Check them out!

-| Example                                             | Web                                   | Description                                                                                                                     |
-| --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
-| [main](examples/main)                               | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper                                                                       |
-| [bench](examples/bench)                             | [bench.wasm](examples/bench.wasm)     | Benchmark the performance of Whisper on your machine                                                                            |
-| [stream](examples/stream)                           | [stream.wasm](examples/stream.wasm)   | Real-time transcription of raw microphone capture                                                                               |
-| [command](examples/command)                         | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic                                                         |
-| [wchess](examples/wchess)                           | [wchess.wasm](examples/wchess)        | Voice-controlled chess                                                                                                          |
-| [talk](examples/talk)                               | [talk.wasm](examples/talk.wasm)       | Talk with a GPT-2 bot                                                                                                           |
-| [talk-llama](examples/talk-llama)                   |                                       | Talk with a LLaMA bot                                                                                                           |
-| [whisper.objc](examples/whisper.objc)               |                                       | iOS mobile application using whisper.cpp                                                                                        |
-| [whisper.swiftui](examples/whisper.swiftui)         |                                       | SwiftUI iOS / macOS application using whisper.cpp                                                                               |
-| [whisper.android](examples/whisper.android)         |                                       | Android mobile application using whisper.cpp                                                                                    |
-| [whisper.nvim](examples/whisper.nvim)               |                                       | Speech-to-text plugin for Neovim                                                                                                |
-| [generate-karaoke.sh](examples/generate-karaoke.sh) |                                       | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture                           |
-| [livestream.sh](examples/livestream.sh)             |                                       | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185)                                           |
-| [yt-wsp.sh](examples/yt-wsp.sh)                     |                                       | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
-| [server](examples/server)                           |                                       | HTTP transcription server with OAI-like API                                                                                     |
+| Example | Web | Description |
+| ---     | --- | ---         |
+| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
+| [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
+| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
+| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
+| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
+| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
+| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
+| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
+| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
+| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
+| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
+| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
+| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |

 ## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)

--- a/README_sycl.md
+++ b/README_sycl.md
@ -1,249 +0,0 @@
-# whisper.cpp for SYCL
-
-[Background](#background)
-
-[OS](#os)
-
-[Intel GPU](#intel-gpu)
-
-[Linux](#linux)
-
-[Environment Variable](#environment-variable)
-
-[Known Issue](#known-issue)
-
-[Todo](#todo)
-
-## Background
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
-
-oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
-
-Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
-
-To avoid re-inventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use an open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate to SYCL.
-
-The whisper.cpp for SYCL is used to support Intel GPUs.
-
-For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build).
-
-## OS
-
-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04|
-|Windows|Ongoing| |
-
-
-## Intel GPU
-
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
-
-
-## Linux
-
-### Setup Environment
-
-1. Install Intel GPU driver.
-
-a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
-
-Note: for iGPU, please install the client GPU driver.
-
-b. Add user to group: video, render.
-
-```
-sudo usermod -aG render username
-sudo usermod -aG video username
-```
-
-Note: re-login to enable it.
-
-c. Check
-
-```
-sudo apt install clinfo
-sudo clinfo -l
-```
-
-Output (example):
-
-```
-Platform #0: Intel(R) OpenCL Graphics
- `-- Device #0: Intel(R) Arc(TM) A770 Graphics
-
-
-Platform #0: Intel(R) OpenCL HD Graphics
- `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
-```
-
-2. Install Intel® oneAPI Base toolkit.
-
-
-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
-
-Recommend to install to default folder: **/opt/intel/oneapi**.
-
-Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
-
-b. Check
-
-```
-source /opt/intel/oneapi/setvars.sh
-
-sycl-ls
-```
-
-There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
-
-Output (example):
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-
-```
-
-2. Build locally:
-
-```
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-
-#for FP16
-#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON 
-
-#for FP32
-cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-#build example/main only
-#cmake --build . --config Release --target main
-
-#build all binary
-cmake --build . --config Release -v
-
-```
-
-or
-
-```
-./examples/sycl/build.sh
-```
-
-Note:
-
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
-
-### Run
-
-1. Put model file to folder **models**
-
-2. Enable oneAPI running environment
-
-```
-source /opt/intel/oneapi/setvars.sh
-```
-
-3. List device ID
-
-Run without parameter:
-
-```
-./build/bin/ls-sycl-device
-
-or
-
-./build/bin/main
-```
-
-Check the ID in startup log, like:
-
-```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-
-```
-
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-
-4. Set device ID and execute whisper.cpp
-
-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
-
-```
-GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
-```
-or run by script:
-
-```
-./examples/sycl/run_whisper.sh
-```
-
-
-
-5. Check the device ID in output
-
-Like:
-```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
-```
-
-
-## Environment Variable
-
-#### Build
-
-|Name|Value|Function|
-|-|-|-|
-|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
-|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
-|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
-|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
-
-#### Running
-
-
-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-
-## Known Issue
-
- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
-
-  Miss to enable oneAPI running environment.
-
-  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
-
-
- Hang during startup
-
-  llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
-
-  Solution: add **--no-mmap**.
-
-## Todo
-
- Support to build in Windows.
-
- Support multiple cards.
--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -1,26 +1,9 @@
-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
-GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR := build
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
 INCLUDE_PATH := $(abspath ../..)
 LIBRARY_PATH := $(abspath ../..)

-ifeq ($(UNAME_S),Darwin)
-	EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
-endif
-
 all: clean whisper examples

 whisper: mkdir
@ -28,13 +11,8 @@ whisper: mkdir
 	@${MAKE} -C ../.. libwhisper.a

 test: model-small whisper modtidy
-ifeq ($(UNAME_S),Darwin)
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v .
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v ./pkg/whisper/...
-else
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...
-endif

 examples: $(EXAMPLES_DIR)

@ -43,11 +21,7 @@ model-small: mkdir examples/go-model-download

 $(EXAMPLES_DIR): mkdir whisper modtidy
 	@echo Build example $(notdir $@)
-ifeq ($(UNAME_S),Darwin)
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go build ${BUILD_FLAGS} -ldflags "-extldflags '$(EXT_LDFLAGS)'" -o ${BUILD_DIR}/$(notdir $@) ./$@
-else
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@
-endif

 mkdir:
 	@echo Mkdir ${BUILD_DIR}
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -24,7 +24,7 @@ const (

 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
 )

 var (
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -123,11 +123,6 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }

-// Set initial prompt
-func (p *Params) SetInitialPrompt(prompt string) {
-	p.initial_prompt = C.CString(prompt)
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS

@ -152,7 +147,6 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
-	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
 	if p.translate {
 		str += " translate"
 	}
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -130,11 +130,6 @@ func (context *context) SetAudioCtx(n uint) {
 	context.params.SetAudioCtx(int(n))
 }

-// Set initial prompt
-func (context *context) SetInitialPrompt(prompt string) {
-	context.params.SetInitialPrompt(prompt)
-}
-
 // ResetTimings resets the mode timings. Should be called before processing
 func (context *context) ResetTimings() {
 	context.model.ctx.Whisper_reset_timings()
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -38,18 +38,17 @@ type Context interface {
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language

-	SetOffset(time.Duration)        // Set offset
-	SetDuration(time.Duration)      // Set duration
-	SetThreads(uint)                // Set number of threads to use
-	SetSpeedup(bool)                // Set speedup flag
-	SetSplitOnWord(bool)            // Set split on word flag
-	SetTokenThreshold(float32)      // Set timestamp token probability threshold
-	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
-	SetMaxSegmentLength(uint)       // Set max segment length in characters
-	SetTokenTimestamps(bool)        // Set token timestamps flag
-	SetMaxTokensPerSegment(uint)    // Set max tokens per segment (0 = no limit)
-	SetAudioCtx(uint)               // Set audio encoder context
-	SetInitialPrompt(prompt string) // Set initial prompt
+	SetOffset(time.Duration)      // Set offset
+	SetDuration(time.Duration)    // Set duration
+	SetThreads(uint)              // Set number of threads to use
+	SetSpeedup(bool)              // Set speedup flag
+	SetSplitOnWord(bool)          // Set split on word flag
+	SetTokenThreshold(float32)    // Set timestamp token probability threshold
+	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
+	SetMaxSegmentLength(uint)     // Set max segment length in characters
+	SetTokenTimestamps(bool)      // Set token timestamps flag
+	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)
+	SetAudioCtx(uint)             // Set audio encoder context

 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -10,7 +10,7 @@ import (

 /*
 #cgo LDFLAGS: -lwhisper -lm -lstdc++
-#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
+#cgo darwin LDFLAGS: -framework Accelerate
 #include <whisper.h>
 #include <stdlib.h>

--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/java/build.gradle
+++ b/bindings/java/build.gradle
@ -9,7 +9,6 @@ archivesBaseName = 'whispercpp'
 group = 'io.github.ggerganov'
 version = '1.4.0'

-
 sourceCompatibility = 1.8
 targetCompatibility = 1.8

--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java
@ -2,7 +2,6 @@ package io.github.ggerganov.whispercpp;

 import com.sun.jna.Native;
 import com.sun.jna.Pointer;
-import io.github.ggerganov.whispercpp.bean.WhisperSegment;
 import io.github.ggerganov.whispercpp.params.WhisperContextParams;
 import io.github.ggerganov.whispercpp.params.WhisperFullParams;
 import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
@ -10,8 +9,6 @@ import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;

 /**
 * Before calling most methods, you must call `initContext(modelPath)` to initialise the `ctx` Pointer.
@ -163,28 +160,6 @@ public class WhisperCpp implements AutoCloseable {

        return str.toString().trim();
    }
-    public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
-        if (ctx == null) {
-            throw new IllegalStateException("Model not initialised");
-        }
-
-        if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
-            throw new IOException("Failed to process audio");
-        }
-
-        int nSegments = lib.whisper_full_n_segments(ctx);
-        List<WhisperSegment> segments= new ArrayList<>(nSegments);
-
-
-        for (int i = 0; i < nSegments; i++) {
-            long t0 = lib.whisper_full_get_segment_t0(ctx, i);
-            String text = lib.whisper_full_get_segment_text(ctx, i);
-            long t1 = lib.whisper_full_get_segment_t1(ctx, i);
-            segments.add(new WhisperSegment(t0,t1,text));
-        }
-
-        return segments;
-    }

 //    public int getTextSegmentCount(Pointer ctx) {
 //        return lib.whisper_full_n_segments(ctx);
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/bean/WhisperSegment.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/bean/WhisperSegment.java
@ -1,47 +0,0 @@
-package io.github.ggerganov.whispercpp.bean;
-
-/**
- * Created by litonglinux@qq.com on 10/21/2023_7:48 AM
- */
-public class WhisperSegment {
-  private long start, end;
-  private String sentence;
-
-  public WhisperSegment() {
-  }
-
-  public WhisperSegment(long start, long end, String sentence) {
-    this.start = start;
-    this.end = end;
-    this.sentence = sentence;
-  }
-
-  public long getStart() {
-    return start;
-  }
-
-  public long getEnd() {
-    return end;
-  }
-
-  public String getSentence() {
-    return sentence;
-  }
-
-  public void setStart(long start) {
-    this.start = start;
-  }
-
-  public void setEnd(long end) {
-    this.end = end;
-  }
-
-  public void setSentence(String sentence) {
-    this.sentence = sentence;
-  }
-
-  @Override
-  public String toString() {
-    return "[" + start + " --> " + end + "]:" + sentence;
-  }
-}
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@ -58,9 +58,6 @@ public class WhisperFullParams extends Structure {
        no_context = enable ? CBool.FALSE : CBool.TRUE;
    }

-    /** Generate timestamps or not? */
-    public CBool no_timestamps;
-
    /** Flag to force single segment output (useful for streaming). (default = false) */
    public CBool single_segment;

@ -148,9 +145,6 @@ public class WhisperFullParams extends Structure {
        tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
    }

-    /** Regular expression matching tokens to suppress. */
-    public String suppress_regex;
-
    /** Tokens to provide to the whisper decoder as an initial prompt.
     * These are prepended to any existing text context from a previous call. */
    public String initial_prompt;
@ -310,25 +304,18 @@ public class WhisperFullParams extends Structure {
        logits_filter_callback = CallbackReference.getFunctionPointer(callback);
    }

-    /** Grammar stuff */
-    public Pointer grammar_rules;
-    public long n_grammar_rules;
-    public long i_start_rule;
-    public float grammar_penalty;
-
    @Override
    protected List<String> getFieldOrder() {
        return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
-                "no_context", "single_segment", "no_timestamps",
+                "no_context", "single_segment",
                "print_special", "print_progress", "print_realtime", "print_timestamps",  "token_timestamps",
                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
-                "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
+                "tdrz_enable", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
                "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
                "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
                "new_segment_callback", "new_segment_callback_user_data",
                "progress_callback", "progress_callback_user_data",
                "encoder_begin_callback", "encoder_begin_callback_user_data",
-                "logits_filter_callback", "logits_filter_callback_user_data",
-                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
+                "logits_filter_callback", "logits_filter_callback_user_data");
    }
 }
--- a/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
+++ b/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
@ -2,7 +2,6 @@ package io.github.ggerganov.whispercpp;

 import static org.junit.jupiter.api.Assertions.*;

-import io.github.ggerganov.whispercpp.bean.WhisperSegment;
 import io.github.ggerganov.whispercpp.params.CBool;
 import io.github.ggerganov.whispercpp.params.WhisperFullParams;
 import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
@ -12,7 +11,6 @@ import javax.sound.sampled.AudioInputStream;
 import javax.sound.sampled.AudioSystem;
 import java.io.File;
 import java.io.FileNotFoundException;
-import java.util.List;

 class WhisperCppTest {
    private static WhisperCpp whisper = new WhisperCpp();
@ -22,12 +20,11 @@ class WhisperCppTest {
    static void init() throws FileNotFoundException {
        // By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
        // or you can provide the absolute path to the model file.
-        //String modelName = "../../models/ggml-tiny.bin";
        String modelName = "../../models/ggml-tiny.en.bin";
        try {
            whisper.initContext(modelName);
-            //whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
-            //whisper.getJavaDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
+//            whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
+//            whisper.getJavaDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
            modelInitialised = true;
        } catch (FileNotFoundException ex) {
            System.out.println("Model " + modelName + " not found");
@ -45,7 +42,7 @@ class WhisperCppTest {
        assertEquals(16384, params.n_max_text_ctx);
        assertFalse(params.translate);
        assertEquals(0.01f, params.thold_pt);
-        assertEquals(5, params.beam_search.beam_size);
+        assertEquals(2, params.beam_search.beam_size);
        assertEquals(-1.0f, params.beam_search.patience);
    }

@ -58,7 +55,7 @@ class WhisperCppTest {
        assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
        assertNotEquals(0, params.n_threads);
        assertEquals(16384, params.n_max_text_ctx);
-        assertEquals(5, params.greedy.best_of);
+        assertEquals(2, params.greedy.best_of);
    }

    @Test
@ -75,11 +72,11 @@ class WhisperCppTest {
        byte[] b = new byte[audioInputStream.available()];
        float[] floats = new float[b.length / 2];

-        //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
+//        WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
        WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
        params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
        params.print_progress = CBool.FALSE;
-        //params.initial_prompt = "and so my fellow Americans um, like";
+//        params.initial_prompt = "and so my fellow Americans um, like";


        try {
@ -102,43 +99,4 @@ class WhisperCppTest {
            audioInputStream.close();
        }
    }
-
-    @Test
-    void testFullTranscribeWithTime() throws Exception {
-        if (!modelInitialised) {
-            System.out.println("Model not initialised, skipping test");
-            return;
-        }
-
-        // Given
-        File file = new File(System.getProperty("user.dir"), "../../samples/jfk.wav");
-        AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(file);
-
-        byte[] b = new byte[audioInputStream.available()];
-        float[] floats = new float[b.length / 2];
-
-        //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
-        WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
-        params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
-        params.print_progress = CBool.FALSE;
-        //params.initial_prompt = "and so my fellow Americans um, like";
-
-        try {
-            audioInputStream.read(b);
-
-            for (int i = 0, j = 0; i < b.length; i += 2, j++) {
-                int intSample = (int) (b[i + 1]) << 8 | (int) (b[i]) & 0xFF;
-                floats[j] = intSample / 32767.0f;
-            }
-
-            List<WhisperSegment> segments = whisper.fullTranscribeWithTime(params, floats);
-            assertTrue(segments.size() > 0, "The size of segments should be greater than 0");
-            for (WhisperSegment segment : segments) {
-                System.out.println(segment);
-            }
-        } finally {
-            audioInputStream.close();
-        }
-    }
-
 }
--- a/bindings/javascript/README.md
+++ b/bindings/javascript/README.md
@ -41,7 +41,7 @@ make publish-npm

 ## Sample run

-```text
+```java
 $ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js

 whisper_model_load: loading model from 'whisper.bin'
@ -63,7 +63,7 @@ whisper_model_load: ggml ctx size =  140.60 MB
 whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB

-system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
+system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 | 

 operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...

--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.5.5",
+  "version": "1.4.3",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -9,7 +9,6 @@ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} ."
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
--- a/bindings/ruby/ext/ggml-backend-impl.h
+++ b/bindings/ruby/ext/ggml-backend-impl.h
@ -70,7 +70,7 @@ extern "C" {
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph without a plan
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
--- a/bindings/ruby/ext/ggml-backend.c
+++ b/bindings/ruby/ext/ggml-backend.c
@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
    backend->iface.graph_plan_compute(backend, plan);
 }

-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    backend->iface.graph_compute(backend, cgraph);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/bindings/ruby/ext/ggml-backend.h
+++ b/bindings/ruby/ext/ggml-backend.h
@ -52,7 +52,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -24,9 +24,9 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {

    // select which device to run the Core ML model on
    MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-    // config.computeUnits = MLComputeUnitsCPUAndGPU;
+    config.computeUnits = MLComputeUnitsCPUAndGPU;
    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
-    config.computeUnits = MLComputeUnitsAll;
+    //config.computeUnits = MLComputeUnitsAll;

    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,10 +14,6 @@ if (WHISPER_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()

-if (WHISPER_CLBLAST)
-    find_package(CLBlast REQUIRED)
-endif()
-
 # common

 set(TARGET common)
@ -27,8 +23,6 @@ add_library(${TARGET} STATIC
    common.cpp
    common-ggml.h
    common-ggml.cpp
-    grammar-parser.h
-    grammar-parser.cpp
    )

 include(DefaultTargetOptions)
@ -36,7 +30,6 @@ include(DefaultTargetOptions)
 target_link_libraries(${TARGET} PRIVATE whisper)

 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(${TARGET} PROPERTIES FOLDER "libs")

 if (WHISPER_SDL2)
    # common-sdl
@ -54,63 +47,27 @@ if (WHISPER_SDL2)
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})

    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()

-# add json lib
-add_library(json_cpp INTERFACE json.hpp)
-set_target_properties(json_cpp PROPERTIES FOLDER "libs")
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
    add_subdirectory(whisper.wasm)
-    set_target_properties(libmain PROPERTIES FOLDER "libs")
    add_subdirectory(stream.wasm)
-    set_target_properties(libstream PROPERTIES FOLDER "libs")
    add_subdirectory(command.wasm)
-    set_target_properties(libcommand PROPERTIES FOLDER "libs")
    add_subdirectory(talk.wasm)
-    set_target_properties(libtalk PROPERTIES FOLDER "libs")
    add_subdirectory(bench.wasm)
-    set_target_properties(libbench PROPERTIES FOLDER "libs")
 elseif(CMAKE_JS_VERSION)
    add_subdirectory(addon.node)
-    set_target_properties(addon.node PROPERTIES FOLDER "examples")
 else()
    add_subdirectory(main)
-    set_target_properties(main PROPERTIES FOLDER "examples")
-if (WHISPER_SDL2)
    add_subdirectory(stream)
-    set_target_properties(stream PROPERTIES FOLDER "examples")
-endif (WHISPER_SDL2)
-    add_subdirectory(server)
-    set_target_properties(server PROPERTIES FOLDER "examples")
-if (WHISPER_SDL2)
    add_subdirectory(command)
-    set_target_properties(command PROPERTIES FOLDER "examples")
-endif (WHISPER_SDL2)
    add_subdirectory(bench)
-    set_target_properties(bench PROPERTIES FOLDER "examples")
    add_subdirectory(quantize)
-    set_target_properties(quantize PROPERTIES FOLDER "examples")
-if (WHISPER_SDL2)
    add_subdirectory(talk)
-    set_target_properties(talk PROPERTIES FOLDER "examples")
    add_subdirectory(talk-llama)
-    set_target_properties(talk-llama PROPERTIES FOLDER "examples")
    add_subdirectory(lsp)
-    set_target_properties(lsp PROPERTIES FOLDER "examples")
-    if (LLAMA_SYCL)
-        add_subdirectory(sycl)
-        set_target_properties(sycl PROPERTIES FOLDER "examples")
-    endif()
-endif (WHISPER_SDL2)
 endif()
-
-if (WHISPER_SDL2)
-    add_subdirectory(wchess)
-    set_target_properties(wchess PROPERTIES FOLDER "examples")
-endif (WHISPER_SDL2)
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -52,6 +52,27 @@ struct whisper_print_user_data {
    const std::vector<std::vector<float>> * pcmf32s;
 };

+// Format t as "HH:MM:SS.mmm" (',' replaces '.' when comma is true); t is multiplied by 10 to obtain milliseconds.
+// Examples: 500 -> 00:00:05.000, 6000 -> 00:01:00.000
+std::string to_timestamp(int64_t t, bool comma = false) {
+    int64_t msec = t * 10;
+    int64_t hr = msec / (1000 * 60 * 60);
+    msec = msec - hr * (1000 * 60 * 60);
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
+
+    return std::string(buf);
+}
+
+int timestamp_to_sample(int64_t t, int n_samples) { // scales t by WHISPER_SAMPLE_RATE/100 and clamps the result to [0, n_samples - 1]
+    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
+}
+
 void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@ -83,8 +104,8 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
        if (params.diarize && pcmf32s.size() == 2) {
            const int64_t n_samples = pcmf32s[0].size();

-            const int64_t is0 = timestamp_to_sample(t0, n_samples, WHISPER_SAMPLE_RATE);
-            const int64_t is1 = timestamp_to_sample(t1, n_samples, WHISPER_SAMPLE_RATE);
+            const int64_t is0 = timestamp_to_sample(t0, n_samples);
+            const int64_t is1 = timestamp_to_sample(t1, n_samples);

            double energy0 = 0.0f;
            double energy1 = 0.0f;
@ -133,7 +154,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

@ -211,8 +232,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

            wparams.initial_prompt   = params.prompt.c_str();

-            wparams.no_timestamps    = params.no_timestamps;
-
            whisper_print_user_data user_data = { &params, &pcmf32s };

            // this callback is called on each new segment
@ -300,13 +319,11 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
  std::string model = whisper_params.Get("model").As<Napi::String>();
  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
  bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
-  bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();

  params.language = language;
  params.model = model;
  params.fname_inp.emplace_back(input);
  params.use_gpu = use_gpu;
-  params.no_timestamps = no_timestamps;

  Napi::Function callback = info[1].As<Napi::Function>();
  Worker* worker = new Worker(callback, params);
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -8,7 +8,7 @@
 // command-line parameters
 struct whisper_params {
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
+    int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat

    std::string model = "models/ggml-base.en.bin";

@ -58,7 +58,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
 int whisper_bench_full(const whisper_params & params) {
    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -81,7 +81,7 @@ int whisper_bench_full(const whisper_params & params) {
    }
    // heat encoder
    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode: %d\n", ret);
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

@ -90,13 +90,13 @@ int whisper_bench_full(const whisper_params & params) {

    // prompt heat
    if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to decode: %d\n", ret);
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

    // text-generation heat
    if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to decode: %d\n", ret);
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

@ -104,30 +104,20 @@ int whisper_bench_full(const whisper_params & params) {

    // actual run
    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode: %d\n", ret);
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

-    // text-generation
-    for (int i = 0; i < 256; i++) {
-        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to decode: %d\n", ret);
-            return 4;
-        }
-    }
-
-    // batched decoding
-    for (int i = 0; i < 64; i++) {
-        if (int ret = whisper_decode(ctx, tokens, 5, 0, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to decode: %d\n", ret);
-            return 4;
-        }
-    }
-
-    // prompt processing
    for (int i = 0; i < 16; i++) {
        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            fprintf(stderr, "error: failed to encode model: %d\n", ret);
+            return 4;
+        }
+    }
+
+    for (int i = 0; i < 256; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to encode model: %d\n", ret);
            return 4;
        }
    }
--- a/examples/command/README.md
+++ b/examples/command/README.md
@ -37,13 +37,9 @@ https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9
 The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:

 ```bash
-# Install SDL2
-# On Debian based linux distributions:
+# Install SDL2 on Linux
 sudo apt-get install libsdl2-dev

-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
 # Install SDL2 on Mac OS
 brew install sdl2

--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -9,7 +9,6 @@
 #include "common-sdl.h"
 #include "common.h"
 #include "whisper.h"
-#include "grammar-parser.h"

 #include <sstream>
 #include <cassert>
@ -31,12 +30,8 @@ struct whisper_params {
    int32_t max_tokens = 32;
    int32_t audio_ctx  = 0;

-    float vad_thold  = 0.6f;
-    float freq_thold = 100.0f;
-
-    float grammar_penalty = 100.0f;
-
-    grammar_parser::parse_state grammar_parsed;
+    float vad_thold    = 0.6f;
+    float freq_thold   = 100.0f;

    bool speed_up      = false;
    bool translate     = false;
@ -50,11 +45,6 @@ struct whisper_params {
    std::string fname_out;
    std::string commands;
    std::string prompt;
-    std::string context;
-    std::string grammar;
-
-    // A regular expression that matches tokens to suppress
-    std::string suppress_regex;
 };

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -85,10 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else if (arg == "-cmd" || arg == "--commands")      { params.commands      = argv[++i]; }
        else if (arg == "-p"   || arg == "--prompt")        { params.prompt        = argv[++i]; }
-        else if (arg == "-ctx" || arg == "--context")       { params.context       = argv[++i]; }
-        else if (                 arg == "--grammar")       { params.grammar       = argv[++i]; }
-        else if (                 arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
-        else if (                 arg == "--suppress-regex") { params.suppress_regex = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
@ -123,31 +109,16 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -f FNAME,   --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "  -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n",             params.commands.c_str());
    fprintf(stderr, "  -p,         --prompt         [%-7s] the required activation prompt\n",              params.prompt.c_str());
-    fprintf(stderr, "  -ctx,       --context        [%-7s] sample text to help the transcription\n",       params.context.c_str());
-    fprintf(stderr, "  --grammar GRAMMAR            [%-7s] GBNF grammar to guide decoding\n",              params.grammar.c_str());
-    fprintf(stderr, "  --grammar-penalty N          [%-7.1f] scales down logits of nongrammar tokens\n",   params.grammar_penalty);
-    fprintf(stderr, "  --suppress-regex REGEX       [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
    fprintf(stderr, "\n");
 }

-std::string transcribe(
-                 whisper_context * ctx,
-            const whisper_params & params,
-        const std::vector<float> & pcmf32,
-               const std::string & grammar_rule,
-                           float & logprob_min,
-                           float & logprob_sum,
-                             int & n_tokens,
-                         int64_t & t_ms) {
+std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

-    logprob_min = 0.0f;
-    logprob_sum = 0.0f;
-    n_tokens    = 0;
+    prob = 0.0f;
    t_ms = 0;

-    //whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
+    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
@ -155,43 +126,19 @@ std::string transcribe(
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
-    wparams.no_timestamps    = params.no_timestamps;
    wparams.single_segment   = true;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language.c_str();
    wparams.n_threads        = params.n_threads;

-    wparams.audio_ctx = params.audio_ctx;
-    wparams.speed_up  = params.speed_up;
-
-    wparams.temperature     = 0.4f;
-    wparams.temperature_inc = 1.0f;
-    wparams.greedy.best_of  = 5;
-
-    wparams.beam_search.beam_size = 5;
-
-    wparams.initial_prompt = params.context.data();
-
-    wparams.suppress_regex = params.suppress_regex.c_str();
-
-    const auto & grammar_parsed = params.grammar_parsed;
-    auto grammar_rules = grammar_parsed.c_rules();
-
-    if (!params.grammar_parsed.rules.empty() && !grammar_rule.empty()) {
-        if (grammar_parsed.symbol_ids.find(grammar_rule) == grammar_parsed.symbol_ids.end()) {
-            fprintf(stderr, "%s: warning: grammar rule '%s' not found - skipping grammar sampling\n", __func__, grammar_rule.c_str());
-        } else {
-            wparams.grammar_rules   = grammar_rules.data();
-            wparams.n_grammar_rules = grammar_rules.size();
-            wparams.i_start_rule    = grammar_parsed.symbol_ids.at(grammar_rule);
-            wparams.grammar_penalty = params.grammar_penalty;
-        }
-    }
+    wparams.audio_ctx        = params.audio_ctx;
+    wparams.speed_up         = params.speed_up;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }

+    int prob_n = 0;
    std::string result;

    const int n_segments = whisper_full_n_segments(ctx);
@ -200,17 +147,19 @@ std::string transcribe(

        result += text;

-        const int n = whisper_full_n_tokens(ctx, i);
-        for (int j = 0; j < n; ++j) {
+        const int n_tokens = whisper_full_n_tokens(ctx, i);
+        for (int j = 0; j < n_tokens; ++j) {
            const auto token = whisper_full_get_token_data(ctx, i, j);

-            if(token.plog > 0.0f) exit(0);
-            logprob_min = std::min(logprob_min, token.plog);
-            logprob_sum += token.plog;
-            ++n_tokens;
+            prob += token.p;
+            ++prob_n;
        }
    }

+    if (prob_n > 0) {
+        prob /= prob_n;
+    }
+
    const auto t_end = std::chrono::high_resolution_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

@ -301,7 +250,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
        fprintf(stderr, " ]\n");
    }

-    std::string k_prompt = "select one from the available words: ";
+    std::string  k_prompt = "select one from the available words: ";
    for (int i = 0; i < (int) allowed_commands.size(); ++i) {
        if (i > 0) {
            k_prompt += ", ";
@ -469,9 +418,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
    bool is_running = true;
    bool ask_prompt = true;

-    float logprob_min = 0.0f;
-    float logprob_sum = 0.0f;
-    int   n_tokens    = 0;
+    float prob = 0.0f;

    std::vector<float> pcmf32_cur;

@ -509,7 +456,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
                // detect the commands
                audio.get(params.command_ms, pcmf32_cur);

-                const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "", logprob_min, logprob_sum, n_tokens, t_ms));
+                const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));

                const auto words = get_words(txt);

@ -545,27 +492,18 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi

 // general-purpose mode
 // freely transcribe the voice into text
-int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
+int process_general_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
    bool is_running  = true;
    bool have_prompt = false;
    bool ask_prompt  = true;

-    float logprob_min0 = 0.0f;
-    float logprob_min  = 0.0f;
-
-    float logprob_sum0 = 0.0f;
-    float logprob_sum  = 0.0f;
-
-    int n_tokens0 = 0;
-    int n_tokens  = 0;
+    float prob0 = 0.0f;
+    float prob  = 0.0f;

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

-    std::string k_prompt = "Ok Whisper, start listening for commands.";
-    if (!params.prompt.empty()) {
-        k_prompt = params.prompt;
-    }
+    const std::string k_prompt = "Ok Whisper, start listening for commands.";

    fprintf(stderr, "\n");
    fprintf(stderr, "%s: general-purpose mode\n", __func__);
@ -598,11 +536,9 @@ int process_general_transcription(struct whisper_context * ctx, audio_async & au
                    // wait for activation phrase
                    audio.get(params.prompt_ms, pcmf32_cur);

-                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "prompt", logprob_min0, logprob_sum0, n_tokens0, t_ms));
+                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));

-                    const float p = 100.0f * std::exp(logprob_min0);
-
-                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms, p = %.2f%%)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms, p);
+                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);

                    const float sim = similarity(txt, k_prompt);

@ -623,30 +559,19 @@ int process_general_transcription(struct whisper_context * ctx, audio_async & au
                    // we have heard the activation phrase, now detect the commands
                    audio.get(params.command_ms, pcmf32_cur);

-                    //printf("len prompt:  %.4f\n", pcmf32_prompt.size() / (float) WHISPER_SAMPLE_RATE);
-                    //printf("len command: %.4f\n", pcmf32_cur.size() / (float) WHISPER_SAMPLE_RATE);
-
-                    // prepend 3 second of silence
-                    pcmf32_cur.insert(pcmf32_cur.begin(), 3.0f*WHISPER_SAMPLE_RATE, 0.0f);
-
                    // prepend the prompt audio
                    pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());

-                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "root", logprob_min, logprob_sum, n_tokens, t_ms));
+                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));

-                    //const float p = 100.0f * std::exp((logprob - logprob0) / (n_tokens - n_tokens0));
-                    const float p = 100.0f * std::exp(logprob_min);
+                    prob = 100.0f*(prob - prob0);

                    //fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());

                    // find the prompt in the text
                    float best_sim = 0.0f;
                    size_t best_len = 0;
-                    for (size_t n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
-                        if (n >= txt.size()) {
-                            break;
-                        }
-
+                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
                        const auto prompt = txt.substr(0, n);

                        const float sim = similarity(prompt, k_prompt);
@ -659,16 +584,9 @@ int process_general_transcription(struct whisper_context * ctx, audio_async & au
                        }
                    }

-                    fprintf(stdout, "%s:   DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
-                    if (best_len == 0) {
-                        fprintf(stdout, "%s: WARNING: command not recognized, try again\n", __func__);
-                    } else {
-                        // cut the prompt from the decoded text
-                        const std::string command = ::trim(txt.substr(best_len));
-
-                        fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
-                    }
+                    const std::string command = ::trim(txt.substr(best_len));

+                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
                    fprintf(stdout, "\n");
                }

@ -695,7 +613,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -736,36 +654,12 @@ int main(int argc, char ** argv) {

    int  ret_val = 0;

-    if (!params.grammar.empty()) {
-        auto & grammar = params.grammar_parsed;
-        if (is_file_exist(params.grammar.c_str())) {
-            // read grammar from file
-            std::ifstream ifs(params.grammar.c_str());
-            const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
-            grammar = grammar_parser::parse(txt.c_str());
-        } else {
-            // read grammar from string
-            grammar = grammar_parser::parse(params.grammar.c_str());
-        }
-
-        // will be empty (default) if there are parse errors
-        if (grammar.rules.empty()) {
-            ret_val = 1;
-        } else {
-            fprintf(stderr, "%s: grammar:\n", __func__);
-            grammar_parser::print_grammar(stderr, grammar);
-            fprintf(stderr, "\n");
-        }
-    }
-
-    if (ret_val == 0) {
-        if (!params.commands.empty()) {
-            ret_val = process_command_list(ctx, audio, params);
-        } else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
-            ret_val = always_prompt_transcription(ctx, audio, params);
-        } else {
-            ret_val = process_general_transcription(ctx, audio, params);
-        }
+    if (!params.commands.empty()) {
+        ret_val = process_command_list(ctx, audio, params);
+    } else if (!params.prompt.empty()) {
+        ret_val = always_prompt_transcription(ctx, audio, params);
+    } else {
+        ret_val = process_general_transcription(ctx, audio, params);
    }

    audio.pause();
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -9,11 +9,6 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
-    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
-    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
-    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
-    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
-    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
 };

 void ggml_print_ftypes(FILE * fp) {
@ -53,24 +48,15 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
-        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
-        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
-        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
-        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
-        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        case GGML_FTYPE_MOSTLY_IQ2_XXS:
-        case GGML_FTYPE_MOSTLY_IQ2_XS:
-        case GGML_FTYPE_MOSTLY_IQ2_S:
-        case GGML_FTYPE_MOSTLY_IQ3_XXS:
-        case GGML_FTYPE_MOSTLY_IQ3_S:
-        case GGML_FTYPE_MOSTLY_IQ1_S:
-        case GGML_FTYPE_MOSTLY_IQ4_NL:
-        case GGML_FTYPE_MOSTLY_IQ4_XS:
-        case GGML_FTYPE_MOSTLY_IQ1_M:
+        case GGML_FTYPE_MOSTLY_Q2_K:
+        case GGML_FTYPE_MOSTLY_Q3_K:
+        case GGML_FTYPE_MOSTLY_Q4_K:
+        case GGML_FTYPE_MOSTLY_Q5_K:
+        case GGML_FTYPE_MOSTLY_Q6_K:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
@ -91,6 +77,8 @@ bool ggml_common_quantize_0(
    std::vector<ggml_fp16_t> data_f16;
    std::vector<float>       data_f32;

+    std::vector<int64_t> hist_all(1 << 4, 0);
+
    while (true) {
        int32_t n_dims;
        int32_t length;
@ -175,38 +163,41 @@ bool ggml_common_quantize_0(
            work.resize(nelements); // for quantization

            size_t cur_size = 0;
+            std::vector<int64_t> hist_cur(1 << 4, 0);
+
            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
                    {
-                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q4_1:
+                    {
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_0:
+                    {
+                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_1:
+                    {
+                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q8_0:
+                    {
+                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_I8:
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
-                case GGML_TYPE_I64:
-                case GGML_TYPE_F64:
                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                case GGML_TYPE_Q8_K:
-                case GGML_TYPE_IQ2_XXS:
-                case GGML_TYPE_IQ2_XS:
-                case GGML_TYPE_IQ2_S:
-                case GGML_TYPE_IQ3_XXS:
-                case GGML_TYPE_IQ3_S:
-                case GGML_TYPE_IQ1_S:
-                case GGML_TYPE_IQ4_NL:
-                case GGML_TYPE_IQ4_XS:
-                case GGML_TYPE_IQ1_M:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
@ -217,7 +208,15 @@ bool ggml_common_quantize_0(
            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
            total_size_new += cur_size;

-            printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+            for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                hist_all[i] += hist_cur[i];
+            }
+
+            for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                printf("%5.3f ", hist_cur[i] / (float)nelements);
+            }
+            printf("\n");
        } else {
            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@ -230,5 +229,18 @@ bool ggml_common_quantize_0(
    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

+    {
+        int64_t sum_all = 0;
+        for (int i = 0; i < (int) hist_all.size(); ++i) {
+            sum_all += hist_all[i];
+        }
+
+        printf("%s: hist: ", __func__);
+        for (int i = 0; i < (int) hist_all.size(); ++i) {
+            printf("%5.3f ", hist_all[i] / (float)sum_all);
+        }
+        printf("\n");
+    }
+
    return true;
 }
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -139,13 +139,10 @@ void audio_async::callback(uint8_t * stream, int len) {
        return;
    }

-    size_t n_samples = len / sizeof(float);
+    const size_t n_samples = len / sizeof(float);

-    if (n_samples > m_audio.size()) {
-        n_samples = m_audio.size();
-
-        stream += (len - (n_samples * sizeof(float)));
-    }
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));

    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);

@ -156,7 +153,7 @@ void audio_async::callback(uint8_t * stream, int len) {
            const size_t n0 = m_audio.size() - m_audio_pos;

            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));

            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = m_audio.size();
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -41,6 +41,7 @@ private:
    std::mutex       m_mutex;

    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
    size_t             m_audio_pos = 0;
    size_t             m_audio_len = 0;
 };
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -19,11 +19,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#ifdef _WIN32
-#include <fcntl.h>
-#include <io.h>
-#endif
-
 // Function to check if the next argument exists
 std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
    if (i + 1 < argc && argv[i + 1][0] != '-') {
@ -620,31 +615,12 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(

 }

-bool is_wav_buffer(const std::string buf) {
-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
-    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
-        return false;
-    }
-
-    uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
-    if (chunk_size + 8 != buf.size()) {
-        return false;
-    }
-
-    return true;
-}
-
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin

    if (fname == "-") {
        {
-            #ifdef _WIN32
-            _setmode(_fileno(stdin), _O_BINARY);
-            #endif
-
            uint8_t buf[1024];
            while (true)
            {
@ -663,12 +639,6 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
-    else if (is_wav_buffer(fname)) {
-        if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
-            return false;
-        }
-    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
@ -676,25 +646,21 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
-        drwav_uninit(&wav);
        return false;
    }

    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
-        drwav_uninit(&wav);
        return false;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
-        drwav_uninit(&wav);
        return false;
    }

    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
-        drwav_uninit(&wav);
        return false;
    }

@ -849,48 +815,3 @@ void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
    fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
 }
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*whisper_sample_rate)/100)));
-}
-
-bool is_file_exist(const char *fileName)
-{
-    std::ifstream infile(fileName);
-    return infile.good();
-}
-
-bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
-{
-    std::ofstream speak_file(path.c_str());
-    if (speak_file.fail()) {
-        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
-        return false;
-    } else {
-        speak_file.write(text.c_str(), text.size());
-        speak_file.close();
-        int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str());
-        if (ret != 0) {
-            fprintf(stderr, "%s: failed to speak\n", __func__);
-            return false;
-        }
-    }
-    return true;
-}
--- a/examples/common.h
+++ b/examples/common.h
@ -135,11 +135,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 // Audio utils
 //

-// Check if a buffer is a WAV audio file
-bool is_wav_buffer(const std::string buf);
-
 // Read WAV audio file and store the PCM data into pcmf32
-// fname can be a buffer of WAV data instead of a filename
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
 bool read_wav(
@ -281,31 +277,3 @@ struct sam_params {
 bool sam_params_parse(int argc, char ** argv, sam_params & params);

 void sam_print_usage(int argc, char ** argv, const sam_params & params);
-
-//
-// Terminal utils
-//
-
-
-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
-const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
-};
-
-//
-// Other utils
-//
-
-// convert timestamp to string, 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false);
-
-// given a timestamp get the sample
-int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
-
-// check if file exists using ifstream
-bool is_file_exist(const char *fileName);
-
-// write text to file, and call system("command voice_id file")
-bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
--- a/examples/grammar-parser.cpp
+++ b/examples/grammar-parser.cpp
@ -1,423 +0,0 @@
-#include "grammar-parser.h"
-#include <cstdint>
-#include <cwchar>
-#include <string>
-#include <utility>
-#include <stdexcept>
-#include <exception>
-
-namespace grammar_parser {
-    // Decodes the UTF-8 sequence starting at src; returns (code point, pointer just past the bytes consumed).
-    // NOTE: assumes valid utf8 (but checks for overrun by stopping at a NUL byte); copied from whisper.cpp
-    std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; // sequence length, indexed by the top 4 bits of the first byte
-        uint8_t  first_byte = static_cast<uint8_t>(*src);
-        uint8_t  highbits   = first_byte >> 4;
-        int      len        = lookup[highbits];
-        uint8_t  mask       = (1 << (8 - len)) - 1; // keeps the low (8 - len) bits of the first byte
-        uint32_t value      = first_byte & mask;
-        const char * end    = src + len; // may overrun!
-        const char * pos    = src + 1;
-        for ( ; pos < end && *pos; pos++) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F); // append the low 6 bits of each following byte
-        }
-        return std::make_pair(value, pos);
-    }
-
-    uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { // returns the id of the name src[0, len), assigning the next free id if the name is new
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); // insert leaves an existing entry untouched
-        return result.first->second; // the existing id, or next_id if the name was just inserted
-    }
-
-    uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { // registers a synthesized symbol and returns its id
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; // name has the form <base_name>_<id>
-        return next_id;
-    }
-
-    void add_rule( // stores rule at index rule_id of state.rules
-            parse_state & state,
-            uint32_t      rule_id,
-            const std::vector<whisper_grammar_element> & rule) {
-        if (state.rules.size() <= rule_id) {
-            state.rules.resize(rule_id + 1); // grow so that rule_id is a valid index
-        }
-        state.rules[rule_id] = rule; // replaces whatever was stored under this id
-    }
-
-    bool is_word_char(char c) { // true for ASCII letters, ASCII digits and '-'
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
-    }
-
-    std::pair<uint32_t, const char *> parse_hex(const char * src, int size) { // parses exactly size hex digits at src; returns (value, pointer past them)
-        const char * pos   = src;
-        const char * end   = src + size;
-        uint32_t     value = 0;
-        for ( ; pos < end && *pos; pos++) {
-            value <<= 4;
-            char c = *pos;
-            if ('a' <= c && c <= 'f') {
-                value += c - 'a' + 10;
-            } else if ('A' <= c && c <= 'F') {
-                value += c - 'A' + 10;
-            } else if ('0' <= c && c <= '9') {
-                value += c - '0';
-            } else {
-                break; // not a hex digit
-            }
-        }
-        if (pos != end) { // stopped early: fewer than size hex digits were available
-            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    const char * parse_space(const char * src, bool newline_ok) { // skips spaces, tabs and '#' comments (and CR/LF when newline_ok); returns the first position not skipped
-        const char * pos = src;
-        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
-                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
-            if (*pos == '#') { // comment: advance to the line break or end of input, without consuming the line break
-                while (*pos && *pos != '\r' && *pos != '\n') {
-                    pos++;
-                }
-            } else {
-                pos++;
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_name(const char * src) { // returns the end of the run of word characters starting at src
-        const char * pos = src;
-        while (is_word_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) { // no word character at src
-            throw std::runtime_error(std::string("expecting name at ") + src);
-        }
-        return pos;
-    }
-
-    std::pair<uint32_t, const char *> parse_char(const char * src) { // parses one character, escaped or not; returns (code point, pointer past it)
-        if (*src == '\\') {
-            switch (src[1]) {
-                case 'x': return parse_hex(src + 2, 2); // 2 hex digits
-                case 'u': return parse_hex(src + 2, 4); // 4 hex digits
-                case 'U': return parse_hex(src + 2, 8); // 8 hex digits
-                case 't': return std::make_pair('\t', src + 2);
-                case 'r': return std::make_pair('\r', src + 2);
-                case 'n': return std::make_pair('\n', src + 2);
-                case '\\':
-                case '"':
-                case '[':
-                case ']':
-                    return std::make_pair(src[1], src + 2); // escaped character stands for itself
-                default:
-                    throw std::runtime_error(std::string("unknown escape at ") + src);
-            }
-        } else if (*src) {
-            return decode_utf8(src);
-        }
-        throw std::runtime_error("unexpected end of input"); // reached only when *src is the terminating NUL
-    }
-
-    const char * parse_alternates( // forward declaration: parse_sequence calls this for groupings, the definition follows parse_sequence
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested);
-
-    const char * parse_sequence( // parses consecutive items (literals, char ranges, rule refs, groups, repetitions) into out_elements; returns where parsing stopped
-            parse_state                        & state,
-            const char                         * src,
-            const std::string                  & rule_name,
-            std::vector<whisper_grammar_element> & out_elements,
-            bool                                 is_nested) {
-        size_t last_sym_start = out_elements.size(); // index in out_elements where the most recent item begins (used by */+/?)
-        const char * pos = src;
-        while (*pos) {
-            if (*pos == '"') { // literal string
-                pos++;
-                last_sym_start = out_elements.size();
-                while (*pos != '"') { // parse_char throws at end of input, so an unterminated literal cannot loop forever
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    out_elements.push_back({WHISPER_GRETYPE_CHAR, char_pair.first});
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '[') { // char range(s)
-                pos++;
-                enum whisper_gretype start_type = WHISPER_GRETYPE_CHAR;
-                if (*pos == '^') { // negated character class
-                    pos++;
-                    start_type = WHISPER_GRETYPE_CHAR_NOT;
-                }
-                last_sym_start = out_elements.size();
-                while (*pos != ']') {
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    enum whisper_gretype type = last_sym_start < out_elements.size() // first element of the class gets start_type, later ones CHAR_ALT
-                        ? WHISPER_GRETYPE_CHAR_ALT
-                        : start_type;
-
-                    out_elements.push_back({type, char_pair.first});
-                    if (pos[0] == '-' && pos[1] != ']') { // a '-' directly before ']' is parsed as an ordinary character by the next iteration
-                        auto endchar_pair = parse_char(pos + 1);
-                             pos          = endchar_pair.second;
-                        out_elements.push_back({WHISPER_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-                    }
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (is_word_char(*pos)) { // rule reference
-                const char * name_end    = parse_name(pos);
-                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-                pos = parse_space(name_end, is_nested);
-                last_sym_start = out_elements.size();
-                out_elements.push_back({WHISPER_GRETYPE_RULE_REF, ref_rule_id});
-            } else if (*pos == '(') { // grouping
-                // parse nested alternates into synthesized rule
-                pos = parse_space(pos + 1, true);
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-                last_sym_start = out_elements.size();
-                // output reference to synthesized rule
-                out_elements.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
-                if (*pos != ')') {
-                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) { // nothing precedes the operator in this sequence
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<whisper_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({WHISPER_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({WHISPER_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
-
-                pos = parse_space(pos + 1, is_nested);
-            } else {
-                break; // character does not start an item: stop here and let the caller inspect it
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_alternates( // parses sequences separated by '|' and stores the result as rule rule_id; returns where parsing stopped
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested) {
-        std::vector<whisper_grammar_element> rule;
-        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
-        while (*pos == '|') {
-            rule.push_back({WHISPER_GRETYPE_ALT, 0}); // separator between alternatives
-            pos = parse_space(pos + 1, true);
-            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
-        }
-        rule.push_back({WHISPER_GRETYPE_END, 0}); // every stored rule ends with an END element
-        add_rule(state, rule_id, rule);
-        return pos;
-    }
-
-    const char * parse_rule(parse_state & state, const char * src) { // parses one "name ::= alternates" definition; returns the position after it and any following whitespace/comments
-        const char * name_end = parse_name(src);
-        const char * pos      = parse_space(name_end, false);
-        size_t       name_len = name_end - src;
-        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
-        const std::string name(src, name_len);
-
-        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
-            throw std::runtime_error(std::string("expecting ::= at ") + pos);
-        }
-        pos = parse_space(pos + 3, true);
-
-        pos = parse_alternates(state, pos, name, rule_id, false);
-
-        if (*pos == '\r') { // a rule must end with CR, CRLF, LF or end of input
-            pos += pos[1] == '\n' ? 2 : 1;
-        } else if (*pos == '\n') {
-            pos++;
-        } else if (*pos) {
-            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
-        }
-        return parse_space(pos, true);
-    }
-
-    parse_state parse(const char * src) { // parses a whole grammar text, one rule after another
-        try {
-            parse_state state;
-            const char * pos = parse_space(src, true);
-            while (*pos) {
-                pos = parse_rule(state, pos);
-            }
-            return state;
-        } catch (const std::exception & err) {
-            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
-            return parse_state(); // on error the caller receives an empty state, not the exception
-        }
-    }
-
-    void print_grammar_char(FILE * file, uint32_t c) { // writes c to file, either as a single character or as <U+XXXX>
-        if (0x20 <= c && c <= 0x7f) { // NOTE(review): range includes 0x7f (DEL), which is written as a raw byte
-            fprintf(file, "%c", static_cast<char>(c));
-        } else {
-            // cop out of encoding UTF-8
-            fprintf(file, "<U+%04X>", c);
-        }
-    }
-
-    bool is_char_element(whisper_grammar_element elem) { // true for the four CHAR* element types, false for everything else
-        switch (elem.type) {
-            case WHISPER_GRETYPE_CHAR:           return true;
-            case WHISPER_GRETYPE_CHAR_NOT:       return true;
-            case WHISPER_GRETYPE_CHAR_ALT:       return true;
-            case WHISPER_GRETYPE_CHAR_RNG_UPPER: return true;
-            default:                           return false;
-        }
-    }
-
-    void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) { // writes every element of rule as TYPE(value) on a single line
-        for (auto elem : rule) {
-            switch (elem.type) { // first switch: the element type name
-                case WHISPER_GRETYPE_END:            fprintf(file, "END");            break;
-                case WHISPER_GRETYPE_ALT:            fprintf(file, "ALT");            break;
-                case WHISPER_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
-                case WHISPER_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
-                case WHISPER_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
-                case WHISPER_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
-                case WHISPER_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-            }
-            switch (elem.type) { // second switch: the value, numeric for END/ALT/RULE_REF and as a character otherwise
-                case WHISPER_GRETYPE_END:
-                case WHISPER_GRETYPE_ALT:
-                case WHISPER_GRETYPE_RULE_REF:
-                    fprintf(file, "(%u) ", elem.value);
-                    break;
-                case WHISPER_GRETYPE_CHAR:
-                case WHISPER_GRETYPE_CHAR_NOT:
-                case WHISPER_GRETYPE_CHAR_RNG_UPPER:
-                case WHISPER_GRETYPE_CHAR_ALT:
-                    fprintf(file, "(\"");
-                    print_grammar_char(file, elem.value);
-                    fprintf(file, "\") ");
-                    break;
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    void print_rule( // writes one rule as "name ::= ..." text; throws std::runtime_error if the rule is malformed
-            FILE     * file,
-            uint32_t   rule_id,
-            const std::vector<whisper_grammar_element> & rule,
-            const std::map<uint32_t, std::string>    & symbol_id_names) {
-        if (rule.empty() || rule.back().type != WHISPER_GRETYPE_END) {
-            throw std::runtime_error(
-                "malformed rule, does not end with WHISPER_GRETYPE_END: " + std::to_string(rule_id));
-        }
-        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); // map::at throws std::out_of_range for an unknown id
-        for (size_t i = 0, end = rule.size() - 1; i < end; i++) { // the final END element is not visited
-            whisper_grammar_element elem = rule[i];
-            switch (elem.type) {
-                case WHISPER_GRETYPE_END: // an END anywhere but the last position is an error
-                    throw std::runtime_error(
-                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
-                        std::to_string(i));
-                case WHISPER_GRETYPE_ALT:
-                    fprintf(file, "| ");
-                    break;
-                case WHISPER_GRETYPE_RULE_REF:
-                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-                    break;
-                case WHISPER_GRETYPE_CHAR:
-                    fprintf(file, "[");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case WHISPER_GRETYPE_CHAR_NOT:
-                    fprintf(file, "[^");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case WHISPER_GRETYPE_CHAR_RNG_UPPER:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "WHISPER_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    fprintf(file, "-");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case WHISPER_GRETYPE_CHAR_ALT:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "WHISPER_GRETYPE_CHAR_ALT without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    print_grammar_char(file, elem.value);
-                    break;
-            }
-            if (is_char_element(elem)) { // close the bracket unless the next element continues the same character class
-                switch (rule[i + 1].type) { // i + 1 is a valid index because the loop stops before the final END
-                    case WHISPER_GRETYPE_CHAR_ALT:
-                    case WHISPER_GRETYPE_CHAR_RNG_UPPER:
-                        break;
-                    default:
-                        fprintf(file, "] ");
-                }
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    void print_grammar(FILE * file, const parse_state & state) { // writes every rule of state to file; exceptions are reported on stderr and not propagated
-        try {
-            std::map<uint32_t, std::string> symbol_id_names;
-            for (auto kv : state.symbol_ids) {
-                symbol_id_names[kv.second] = kv.first; // invert name -> id into id -> name
-            }
-            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
-                // fprintf(file, "%zu: ", i);
-                // print_rule_binary(file, state.rules[i]);
-                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
-                // fprintf(file, "\n");
-            }
-        } catch (const std::exception & err) {
-            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
-        }
-    }
-
-    std::vector<const whisper_grammar_element *> parse_state::c_rules() const{ // returns one pointer per rule, pointing at that rule's first element
-        std::vector<const whisper_grammar_element *> ret;
-        for (const auto & rule : rules) {
-            ret.push_back(rule.data()); // points into the rules member, so it is only usable while rules is alive and unchanged
-        }
-        return ret;
-    }
-}
--- a/examples/grammar-parser.h
+++ b/examples/grammar-parser.h
@ -1,29 +0,0 @@
-// Implements a parser for an extended Backus-Naur form (BNF), producing the
-// binary context-free grammar format specified by whisper.h. Supports character
-// ranges, grouping, and repetition operators. As an example, a grammar for
-// arithmetic might look like:
-//
-// root  ::= expr
-// expr  ::= term ([-+*/] term)*
-// term  ::= num | "(" space expr ")" space
-// num   ::= [0-9]+ space
-// space ::= [ \t\n]*
-
-#pragma once
-#include "whisper.h"
-#include <vector>
-#include <map>
-#include <cstdint>
-#include <string>
-
-namespace grammar_parser {
-    struct parse_state { // result of parsing a grammar
-        std::map<std::string, uint32_t>                   symbol_ids; // symbol name -> id
-        std::vector<std::vector<whisper_grammar_element>> rules; // one element list per rule
-
-        std::vector<const whisper_grammar_element *>      c_rules() const; // the rules as a vector of element pointers
-    };
-
-    parse_state parse(const char * src); // parses grammar text into a parse_state
-    void print_grammar(FILE * file, const parse_state & state); // writes the parsed grammar to file
-}
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -22,7 +22,6 @@ var printTextarea = (function() {
 async function clearCache() { // after the user confirms, deletes the IndexedDB database named by dbName
    if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
        indexedDB.deleteDatabase(dbName);
-        location.reload();
    }
 }

--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

 # list available models
 function list_models {
--- a/examples/lsp/CMakeLists.txt
+++ b/examples/lsp/CMakeLists.txt
@ -5,5 +5,5 @@ if (WHISPER_SDL2)

    include(DefaultTargetOptions)

-    target_link_libraries(${TARGET} PRIVATE common json_cpp common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/lsp/json.hpp
+++ b/examples/lsp/json.hpp
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -435,7 +435,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
    // init audio
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -17,37 +17,28 @@ options:
  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -sow,      --split-on-word     [false  ] split on word rather than on token
  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -bs N,     --beam-size N       [5      ] beam size for beam search
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
+  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
  -tr,       --translate         [false  ] translate from source language to english
  -di,       --diarize           [false  ] stereo audio diarization
-  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
  -otxt,     --output-txt        [false  ] output result in a text file
  -ovtt,     --output-vtt        [false  ] output result in a vtt file
  -osrt,     --output-srt        [false  ] output result in a srt file
-  -olrc,     --output-lrc        [false  ] output result in a lrc file
  -owts,     --output-words      [false  ] output script for generating karaoke video
-  -fp,       --font-path         [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -oj,       --output-json       [false  ] output result in a JSON file
-  -ojf,      --output-json-full  [false  ] include more information in the JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
  -pp,       --print-progress    [false  ] print progress
-  -nt,       --no-timestamps     [false  ] do not print timestamps
+  -nt,       --no-timestamps     [true   ] do not print timestamps
  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
-  -dl,       --detect-language   [false  ] exit after automatically detecting language
             --prompt PROMPT     [       ] initial prompt
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME        [       ] input WAV file path
-  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
-  -ls,       --log-score         [false  ] log best decoder scores of tokens
-  -ng,       --no-gpu            [false  ] disable GPU
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -1,12 +1,10 @@
 #include "common.h"

 #include "whisper.h"
-#include "grammar-parser.h"

 #include <cmath>
 #include <fstream>
 #include <cstdio>
-#include <regex>
 #include <string>
 #include <thread>
 #include <vector>
@ -16,6 +14,34 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+// Terminal color map: 10 ANSI 256-color foreground escape sequences, one per range [0.0, 0.1, ..., 0.9].
+// Lowest is red, middle is yellow, highest is green.
+const std::vector<std::string> k_colors = {
+    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
+    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
+};
+
+//  500 -> 00:00:05.000  (t is in units of 10 ms; output is hh:mm:ss.mmm)
+// 6000 -> 00:01:00.000  (with comma == true the separator before the milliseconds is ',')
+std::string to_timestamp(int64_t t, bool comma = false) {
+    int64_t msec = t * 10;
+    int64_t hr = msec / (1000 * 60 * 60);
+    msec = msec - hr * (1000 * 60 * 60);
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
+
+    return std::string(buf);
+}
+
+int timestamp_to_sample(int64_t t, int n_samples) { // converts t (1/100 s units) to a sample index at WHISPER_SAMPLE_RATE
+    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100))); // capped at n_samples - 1, then floored at 0
+}
+
 // helper function to replace substrings
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; ; pos += replace.length()) {
@ -28,22 +54,20 @@ void replace_all(std::string & s, const std::string & search, const std::string

 // command-line parameters
 struct whisper_params {
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors  = 1;
-    int32_t offset_t_ms   = 0;
-    int32_t offset_n      = 0;
-    int32_t duration_ms   = 0;
-    int32_t progress_step = 5;
-    int32_t max_context   = -1;
-    int32_t max_len       = 0;
-    int32_t best_of       = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
-    int32_t beam_size     = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
-    int32_t audio_ctx     = 0;
+    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_processors =  1;
+    int32_t offset_t_ms  =  0;
+    int32_t offset_n     =  0;
+    int32_t duration_ms  =  0;
+    int32_t progress_step =  5;
+    int32_t max_context  = -1;
+    int32_t max_len      =  0;
+    int32_t best_of      =  2;
+    int32_t beam_size    = -1;

-    float word_thold      =  0.01f;
-    float entropy_thold   =  2.40f;
-    float logprob_thold   = -1.00f;
-    float grammar_penalty = 100.0f;
+    float word_thold    =  0.01f;
+    float entropy_thold =  2.40f;
+    float logprob_thold = -1.00f;

    bool speed_up        = false;
    bool debug_mode      = false;
@ -61,7 +85,6 @@ struct whisper_params {
    bool output_jsn      = false;
    bool output_jsn_full = false;
    bool output_lrc      = false;
-    bool no_prints       = false;
    bool print_special   = false;
    bool print_colors    = false;
    bool print_progress  = false;
@ -73,35 +96,18 @@ struct whisper_params {
    std::string prompt;
    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
    std::string model     = "models/ggml-base.en.bin";
-    std::string grammar;
-    std::string grammar_rule;

    // [TDRZ] speaker turn string
    std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line

-    // A regular expression that matches tokens to suppress
-    std::string suppress_regex;
-
    std::string openvino_encode_device = "CPU";

-    std::string dtw = "";
-
    std::vector<std::string> fname_inp = {};
    std::vector<std::string> fname_out = {};
-
-    grammar_parser::parse_state grammar_parsed;
 };

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

-char* whisper_param_turn_lowercase(char* in){ // lowercases the NUL-terminated string in place and returns the same pointer
-    int string_len = strlen(in);
-    for(int i = 0; i < string_len; i++){
-        *(in+i) = tolower((unsigned char)*(in+i)); // cast first: tolower requires a value representable as unsigned char (or EOF)
-    }
-    return in;
-}
-
 bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
@ -129,7 +135,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
-        else if (arg == "-ac"   || arg == "--audio-ctx")       { params.audio_ctx       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
@ -150,24 +155,18 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
        else if (arg == "-ojf"  || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
        else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(argv[++i]); }
-        else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints       = true; }
        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
-        else if (arg == "-l"    || arg == "--language")        { params.language        = whisper_param_turn_lowercase(argv[++i]); }
+        else if (arg == "-l"    || arg == "--language")        { params.language        = argv[++i]; }
        else if (arg == "-dl"   || arg == "--detect-language") { params.detect_language = true; }
        else if (                  arg == "--prompt")          { params.prompt          = argv[++i]; }
        else if (arg == "-m"    || arg == "--model")           { params.model           = argv[++i]; }
        else if (arg == "-f"    || arg == "--file")            { params.fname_inp.emplace_back(argv[++i]); }
        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = argv[++i]; }
-        else if (arg == "-dtw"  || arg == "--dtw")             { params.dtw             = argv[++i]; }
-        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score       = true; }
-        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu         = false; }
-        else if (                  arg == "--suppress-regex")  { params.suppress_regex = argv[++i]; }
-        else if (                  arg == "--grammar")         { params.grammar         = argv[++i]; }
-        else if (                  arg == "--grammar-rule")    { params.grammar_rule    = argv[++i]; }
-        else if (                  arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
+        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score = true; }
+        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu = false; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
@ -194,7 +193,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -ac N,     --audio-ctx N       [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
@ -214,24 +212,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -ojf,      --output-json-full  [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt (max n_text_ctx/2 tokens)\n",       params.prompt.c_str());
+    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
    fprintf(stderr, "  -oved D,   --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
-    fprintf(stderr, "  -dtw MODEL --dtw MODEL         [%-7s] compute token-level timestamps\n",                 params.dtw.c_str());
    fprintf(stderr, "  -ls,       --log-score         [%-7s] log best decoder scores of tokens\n",              params.log_score?"true":"false");
    fprintf(stderr, "  -ng,       --no-gpu            [%-7s] disable GPU\n",                                    params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  --suppress-regex REGEX         [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
-    fprintf(stderr, "  --grammar GRAMMAR              [%-7s] GBNF grammar to guide decoding\n",                 params.grammar.c_str());
-    fprintf(stderr, "  --grammar-rule RULE            [%-7s] top-level GBNF grammar rule name\n",               params.grammar_rule.c_str());
-    fprintf(stderr, "  --grammar-penalty N            [%-7.1f] scales down logits of nongrammar tokens\n",      params.grammar_penalty);
    fprintf(stderr, "\n");
 }

@ -246,8 +238,8 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
    std::string speaker = "";
    const int64_t n_samples = pcmf32s[0].size();

-    const int64_t is0 = timestamp_to_sample(t0, n_samples, WHISPER_SAMPLE_RATE);
-    const int64_t is1 = timestamp_to_sample(t1, n_samples, WHISPER_SAMPLE_RATE);
+    const int64_t is0 = timestamp_to_sample(t0, n_samples);
+    const int64_t is1 = timestamp_to_sample(t1, n_samples);

    double energy0 = 0.0f;
    double energy1 = 0.0f;
@ -671,8 +663,7 @@ bool output_json(
                                    times_o(token.t0, token.t1, false);
                                }
                                value_i("id", token.id, false);
-                                value_f("p", token.p, false);
-                                value_f("t_dtw", token.t_dtw, true);
+                                value_f("p", token.p, true);
                            end_obj(j == (n - 1));
                        }
                        end_arr(!params.diarize && !params.tinydiarize);
@ -861,59 +852,14 @@ bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-
-void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
-
 int main(int argc, char ** argv) {
    whisper_params params;

-    // If the only argument starts with "@", read arguments line-by-line
-    // from the given file.
-    std::vector<std::string> vec_args;
-    if (argc == 2 && argv != nullptr && argv[1] != nullptr && argv[1][0] == '@') {
-        // Save the name of the executable.
-        vec_args.push_back(argv[0]);
-
-        // Open the response file.
-        char const * rspfile = argv[1] + sizeof(char);
-        std::ifstream fin(rspfile);
-        if (fin.is_open() == false) {
-            fprintf(stderr, "error: response file '%s' not found\n", rspfile);
-            return 1;
-        }
-
-        // Read the entire response file.
-        std::string line;
-        while (std::getline(fin, line)) {
-            vec_args.push_back(line);
-        }
-
-        // Use the contents of the response file as the command-line arguments.
-        argc = static_cast<int>(vec_args.size());
-        argv = static_cast<char **>(alloca(argc * sizeof (char *)));
-        for (int i = 0; i < argc; ++i) {
-            argv[i] = const_cast<char *>(vec_args[i].c_str());
-        }
-    }
-
    if (whisper_params_parse(argc, argv, params) == false) {
        whisper_print_usage(argc, argv, params);
        return 1;
    }

-    // remove non-existent files
-    for (auto it = params.fname_inp.begin(); it != params.fname_inp.end();) {
-        const auto fname_inp = it->c_str();
-
-        if (*it != "-" && !is_file_exist(fname_inp)) {
-            fprintf(stderr, "error: input file not found '%s'\n", fname_inp);
-            it = params.fname_inp.erase(it);
-            continue;
-        }
-
-        it++;
-    }
-
    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        whisper_print_usage(argc, argv, params);
@ -932,37 +878,11 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    if (params.no_prints) {
-        whisper_log_set(cb_log_disable, NULL);
-    }
-
    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

-    if (!params.dtw.empty()) {
-        cparams.dtw_token_timestamps = true;
-        cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
-
-        if (params.dtw == "tiny")      cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
-        if (params.dtw == "tiny.en")   cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
-        if (params.dtw == "base")      cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
-        if (params.dtw == "base.en")   cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
-        if (params.dtw == "small")     cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
-        if (params.dtw == "small.en")  cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
-        if (params.dtw == "medium")    cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
-        if (params.dtw == "medium.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
-        if (params.dtw == "large.v1")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
-        if (params.dtw == "large.v2")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
-        if (params.dtw == "large.v3")  cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
-
-        if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
-            fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
-            return 3;
-        }
-    }
-
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

    if (ctx == nullptr) {
@ -973,29 +893,6 @@ int main(int argc, char ** argv) {
    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);

-    if (!params.grammar.empty()) {
-        auto & grammar = params.grammar_parsed;
-        if (is_file_exist(params.grammar.c_str())) {
-            // read grammar from file
-            std::ifstream ifs(params.grammar.c_str());
-            const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
-            grammar = grammar_parser::parse(txt.c_str());
-        } else {
-            // read grammar from string
-            grammar = grammar_parser::parse(params.grammar.c_str());
-        }
-
-        // will be empty (default) if there are parse errors
-        if (grammar.rules.empty()) {
-            fprintf(stderr, "error: failed to parse grammar \"%s\"\n", params.grammar.c_str());
-            return 4;
-        } else {
-            fprintf(stderr, "%s: grammar:\n", __func__);
-            grammar_parser::print_grammar(stderr, grammar);
-            fprintf(stderr, "\n");
-        }
-    }
-
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
 		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -1008,28 +905,29 @@ int main(int argc, char ** argv) {
            continue;
        }

-        if (!whisper_is_multilingual(ctx)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        if (params.detect_language) {
-            params.language = "auto";
-        }
-
-        if (!params.no_prints) {
-            // print system information
+        // print system information
+        {
            fprintf(stderr, "\n");
            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+        }

-            // print some info about the processing
+        // print some info about the processing
+        {
            fprintf(stderr, "\n");
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
+            if (!whisper_is_multilingual(ctx)) {
+                if (params.language != "en" || params.translate) {
+                    params.language = "en";
+                    params.translate = false;
+                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                }
+            }
+            if (params.detect_language) {
+                params.language = "auto";
+            }
+            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors, params.beam_size, params.best_of,
+                    params.n_threads, params.n_processors,
                    params.language.c_str(),
                    params.translate ? "translate" : "transcribe",
                    params.tinydiarize ? "tdrz = 1, " : "",
@ -1042,8 +940,7 @@ int main(int argc, char ** argv) {
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

-            const bool use_grammar = (!params.grammar_parsed.rules.empty() && !params.grammar_rule.empty());
-            wparams.strategy = (params.beam_size > 1 || use_grammar) ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;

            wparams.print_realtime   = false;
            wparams.print_progress   = params.print_progress;
@ -1061,15 +958,12 @@ int main(int argc, char ** argv) {
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
-            wparams.audio_ctx        = params.audio_ctx;

            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;

            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]

-            wparams.suppress_regex   = params.suppress_regex.c_str();
-
            wparams.initial_prompt   = params.prompt.c_str();

            wparams.greedy.best_of        = params.best_of;
@ -1079,24 +973,8 @@ int main(int argc, char ** argv) {
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;

-            wparams.no_timestamps    = params.no_timestamps;
-
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

-            const auto & grammar_parsed = params.grammar_parsed;
-            auto grammar_rules = grammar_parsed.c_rules();
-
-            if (use_grammar) {
-                if (grammar_parsed.symbol_ids.find(params.grammar_rule) == grammar_parsed.symbol_ids.end()) {
-                    fprintf(stderr, "%s: warning: grammar rule '%s' not found - skipping grammar sampling\n", __func__, params.grammar_rule.c_str());
-                } else {
-                    wparams.grammar_rules = grammar_rules.data();
-                    wparams.n_grammar_rules = grammar_rules.size();
-                    wparams.i_start_rule = grammar_parsed.symbol_ids.at(params.grammar_rule);
-                    wparams.grammar_penalty = params.grammar_penalty;
-                }
-            }
-
            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
--- a/examples/python/test_whisper_processor.py
+++ b/examples/python/test_whisper_processor.py
@ -1,7 +0,0 @@
-import whisper_processor
-
-try:
-    result = whisper_processor.process_audio("./audio/wake_word_detected16k.wav", "base.en")
-    print(result)
-except Exception as e:
-    print(f"Error: {e}")
--- a/examples/python/whisper_processor.py
+++ b/examples/python/whisper_processor.py
@ -1,54 +0,0 @@
-import subprocess
-import sys
-import os
-
-def process_audio(wav_file, model_name="base.en"):
-    """
-    Processes an audio file using a specified model and returns the processed string.
-
-    :param wav_file: Path to the WAV file
-    :param model_name: Name of the model to use
-    :return: Processed string output from the audio processing
-    :raises: Exception if an error occurs during processing
-    """
-
-    model = f"./models/ggml-{model_name}.bin"
-
-    # Check if the file exists
-    if not os.path.exists(model):
-        raise FileNotFoundError(f"Model file not found: {model} \n\nDownload a model with this command:\n\n> bash ./models/download-ggml-model.sh {model_name}\n\n")
-
-    if not os.path.exists(wav_file):
-        raise FileNotFoundError(f"WAV file not found: {wav_file}")
-
-    full_command = f"./main -m {model} -f {wav_file} -np -nt"
-
-    # Execute the command
-    process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-    # Get the output and error (if any)
-    output, error = process.communicate()
-
-    if error:
-        raise Exception(f"Error processing audio: {error.decode('utf-8')}")
-
-    # Process and return the output string
-    decoded_str = output.decode('utf-8').strip()
-    processed_str = decoded_str.replace('[BLANK_AUDIO]', '').strip()
-
-    return processed_str
-
-def main():
-    if len(sys.argv) >= 2:
-        wav_file = sys.argv[1]
-        model_name = sys.argv[2] if len(sys.argv) == 3 else "base.en"
-        try:
-            result = process_audio(wav_file, model_name)
-            print(result)
-        except Exception as e:
-            print(f"Error: {e}")
-    else:
-        print("Usage: python whisper_processor.py <wav_file> [<model_name>]")
-
-if __name__ == "__main__":
-    main()
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -1,10 +0,0 @@
-set(TARGET server)
-add_executable(${TARGET} server.cpp httplib.h)
-
-include(DefaultTargetOptions)
-
-target_link_libraries(${TARGET} PRIVATE common json_cpp whisper ${CMAKE_THREAD_LIBS_INIT})
-
-if (WIN32)
-    target_link_libraries(${TARGET} PRIVATE ws2_32)
-endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -1,69 +0,0 @@
-# whisper.cpp http server
-
-Simple http server. WAV Files are passed to the inference model via http requests.
-
-https://github.com/ggerganov/whisper.cpp/assets/1991296/e983ee53-8741-4eb5-9048-afe5e4594b8f
-
-## Usage
-
-```
-./server -h
-
-usage: ./bin/server [options]
-
-options:
-  -h,        --help              [default] show this help message and exit
-  -t N,      --threads N         [4      ] number of threads to use during computation
-  -p N,      --processors N      [1      ] number of processors to use during computation
-  -ot N,     --offset-t N        [0      ] time offset in milliseconds
-  -on N,     --offset-n N        [0      ] segment index offset
-  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
-  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
-  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -sow,      --split-on-word     [false  ] split on word rather than on token
-  -bo N,     --best-of N         [2      ] number of best candidates to keep
-  -bs N,     --beam-size N       [-1     ] beam size for beam search
-  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
-  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
-  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
-  -tr,       --translate         [false  ] translate from source language to english
-  -di,       --diarize           [false  ] stereo audio diarization
-  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
-  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
-  -ps,       --print-special     [false  ] print special tokens
-  -pc,       --print-colors      [false  ] print colors
-  -pr,       --print-realtime    [false  ] print output in realtime
-  -pp,       --print-progress    [false  ] print progress
-  -nt,       --no-timestamps     [false  ] do not print timestamps
-  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
-  -dl,       --detect-language   [false  ] exit after automatically detecting language
-             --prompt PROMPT     [       ] initial prompt
-  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
-  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
-  --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
-  --port PORT,                   [8080   ] Port number for the server
-  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
-```
-
-> [!WARNING]
-> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
-
-## request examples
-
-**/inference**
-```
-curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@<file-path>" \
-F temperature="0.0" \
-F temperature_inc="0.2" \
-F response_format="json"
-```
-
-**/load**
-```
-curl 127.0.0.1:8080/load \
-H "Content-Type: multipart/form-data" \
-F model="<path-to-model-file>"
-```
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/stream.wasm/emscripten.cpp
+++ b/examples/stream.wasm/emscripten.cpp
@ -103,11 +103,11 @@ void stream_main(size_t index) {

            {
                const int n_segments = whisper_full_n_segments(ctx);
-                if (n_segments > 0) {
-                    const char * text = whisper_full_get_segment_text(ctx, n_segments - 1);
+                for (int i = n_segments - 1; i < n_segments; ++i) {
+                    const char * text = whisper_full_get_segment_text(ctx, i);

-                    const int64_t t0 = whisper_full_get_segment_t0(ctx, n_segments - 1);
-                    const int64_t t1 = whisper_full_get_segment_t1(ctx, n_segments - 1);
+                    const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

                    printf("transcribed: %s\n", text);

--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@ -4,7 +4,7 @@ This is a naive example of performing real-time inference on audio from your mic
 The `stream` tool samples the audio every half a second and runs the transcription continuously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

-```bash
+```java
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@ -14,7 +14,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a

 Setting the `--step` argument to `0` enables the sliding window mode:

-```bash
+```java
 ./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
 ```

@ -30,21 +30,17 @@ a transcription block that is suitable for parsing.
 The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:

 ```bash
-# Install SDL2
-# On Debian based linux distributions:
+# Install SDL2 on Linux
 sudo apt-get install libsdl2-dev

-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
 # Install SDL2 on Mac OS
 brew install sdl2

 make stream
 ```

-Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
-as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
+Ensure you are at the root of the repo when running `make stream`.  Not within the `examples/stream` dir
+as the libraries needed like `common-sdl.h` are located within `examples`.  Attempting to compile within
 `examples/stream` means your compiler cannot find them, and it reports an error that it cannot find the file.

 ```bash
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -14,6 +14,20 @@
 #include <fstream>


+// format t (in units of 10 ms) as MM:SS.mmm, e.g. 500 -> 00:05.000, 550 -> 00:05.500,
+// 6000 -> 01:00.000 (minutes are not wrapped into hours)
+std::string to_timestamp(int64_t t) {
+    int64_t sec = t/100;
+    int64_t msec = (t - sec*100)*10; // remainder is in centiseconds - scale to milliseconds for %03d
+    int64_t min = sec/60;
+    sec = sec - min*60;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
+
+    return std::string(buf);
+}
+
 // command-line parameters
 struct whisper_params {
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
@ -152,7 +166,7 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -358,7 +372,7 @@ int main(int argc, char ** argv) {
                        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

-                        std::string output = "[" + to_timestamp(t0, false) + " --> " + to_timestamp(t1, false) + "]  " + text;
+                        std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "]  " + text;

                        if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
                            output += " [SPEAKER_TURN]";
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@ -1,9 +0,0 @@
-#  MIT license
-#  Copyright (C) 2024 Intel Corporation
-#  SPDX-License-Identifier: MIT
-
-set(TARGET ls-sycl-device)
-add_executable(${TARGET} ls-sycl-device.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@ -1,47 +0,0 @@
-# llama.cpp/example/sycl
-
-This example program provide the tools for llama.cpp for SYCL on Intel GPU.
-
-## Tool
-
-|Tool Name| Function|Status|
-|-|-|-|
-|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
-
-### ls-sycl-device
-
-List all SYCL devices with ID, compute capability, max work group size, ect.
-
-1. Build the llama.cpp for SYCL for all targets.
-
-2. Enable oneAPI running environment
-
-```
-source /opt/intel/oneapi/setvars.sh
-```
-
-3. Execute
-
-```
-./build/bin/ls-sycl-device
-```
-
-Check the ID in startup log, like:
-
-```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-
-```
-
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@ -1,19 +0,0 @@
-#  MIT license
-#  Copyright (C) 2024 Intel Corporation
-#  SPDX-License-Identifier: MIT
-
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-
-#for FP16
-#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
-
-#for FP32
-cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-#build example/main only
-#cmake --build . --config Release --target main
-
-#build all binary
-cmake --build . --config Release -v
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@ -1,11 +0,0 @@
-/*MIT license
-  Copyright (C) 2024 Intel Corporation
-  SPDX-License-Identifier: MIT
-*/
-
-#include "ggml-sycl.h"
-
-int main(int argc, char ** argv) {
-    ggml_backend_sycl_print_sycl_devices();
-    return 0;
-}
--- a/examples/sycl/run-whisper.sh
+++ b/examples/sycl/run-whisper.sh
@ -1,17 +0,0 @@
-#!/bin/bash
-
-#  MIT license
-#  Copyright (C) 2024 Intel Corporation
-#  SPDX-License-Identifier: MIT
-
-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
-source /opt/intel/oneapi/setvars.sh
-
-if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
-else
-    export GGML_SYCL_DEVICE=0
-fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
-#export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
--- a/examples/talk-llama/.gitignore
+++ b/examples/talk-llama/.gitignore
@ -1,2 +1 @@
 audio.mp3
-to_speak.txt
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,18 +1,25 @@
 if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

-    if (WHISPER_CLBLAST)
-        set(CLBLAST_LIBNAME clblast)
-    endif ()
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CLBLAST_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
+    # TODO: this is temporary
+    #       need to export ggml symbols for MSVC, but too lazy ..
+    add_executable(${TARGET}
+        talk-llama.cpp
+        llama.cpp
+        ../common.cpp
+        ../common-sdl.cpp
+        ../../ggml.c
+        ../../ggml-alloc.c
+        ../../ggml-backend.c
+        ../../ggml-quants.c
+        ../../whisper.cpp)

-    if(WIN32)
-        # It requires Windows 8.1 or later for PrefetchVirtualMemory
-        target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
-    endif()
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
+    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

    include(DefaultTargetOptions)
 endif ()
--- a/examples/talk-llama/README.md
+++ b/examples/talk-llama/README.md
@ -15,13 +15,9 @@ https://github.com/ggerganov/whisper.cpp/assets/1991296/d97a3788-bf2a-4756-9a43-
 The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:

 ```bash
-# Install SDL2
-# On Debian based linux distributions:
+# Install SDL2 on Linux
 sudo apt-get install libsdl2-dev

-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
 # Install SDL2 on Mac OS
 brew install sdl2

--- a/examples/talk-llama/eleven-labs.py
+++ b/examples/talk-llama/eleven-labs.py
@ -1,80 +1,20 @@
 import sys
-import argparse
-import textwrap
+import importlib.util

-parser = argparse.ArgumentParser(add_help=False,
-    formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument("-q", "--quick", action="store_true",
-    help="skip checking the required library")
-
-modes = parser.add_argument_group("action")
-modes.add_argument("inputfile", metavar="TEXTFILE",
-    nargs='?', type=argparse.FileType(), default=sys.stdin,
-    help="read the text file (default: stdin)")
-modes.add_argument("-l", "--list", action="store_true",
-    help="show the list of voices and exit")
-modes.add_argument("-h", "--help", action="help",
-    help="show this help and exit")
-
-selopts = parser.add_argument_group("voice selection")
-selmodes = selopts.add_mutually_exclusive_group()
-selmodes.add_argument("-n", "--name",
-    default="Arnold",
-    help="get a voice object by name (default: Arnold)")
-selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
-    help="get a voice object by number (see --list)")
-selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
-    default=["use case=narration"],
-    help=textwrap.dedent('''\
-        filter voices by labels (default: "use case=narration")
-        this option can be used multiple times
-        filtering will be disabled if the first -f has no "=" (e.g. -f "any")
-        '''))
-
-outmodes = parser.add_argument_group("output")
-outgroup = outmodes.add_mutually_exclusive_group()
-outgroup.add_argument("-s", "--save", metavar="FILE",
-    default="audio.mp3",
-    help="save the TTS to a file (default: audio.mp3)")
-outgroup.add_argument("-p", "--play", action="store_true",
-    help="play the TTS with ffplay")
-
-args = parser.parse_args()
-
-if not args.quick:
-    import importlib.util
-    if importlib.util.find_spec("elevenlabs") is None:
-        print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
-        sys.exit()
-
-from elevenlabs import voices, generate, play, save
-
-if args.filter and "=" in args.filter[0]:
-    voicelist = voices()
-    for f in args.filter:
-        label, value = f.split("=")
-        voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
-    voicelist = list(voicelist)
-else:
-    voicelist = list(voices())
-
-if args.list:
-    for i, v in enumerate(voicelist):
-        print(str(i) + ": " + v.name + " " + str(v.labels))
+if importlib.util.find_spec("elevenlabs") is None:
+    print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
    sys.exit()

-if args.voice:
-    voice = voicelist[args.voice % len(voicelist)]
-else:
-    voice = args.name
-    # if -n should consult -f, use the following
-    #voice = next(x for x in voicelist if x.name == args.name)
+from elevenlabs import generate, play, save

+# Get a Voice object, by name or UUID
+voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
+
+# Generate the TTS from the words given on the command line (argv[2:], joined with spaces)
 audio = generate(
-    text=str(args.inputfile.read()),
-    voice=voice
+  text=" ".join(sys.argv[2:]),
+  voice=voice
 )
-if args.play:
-    play(audio)
-else:
-    save(audio, args.save) 
+
+# Save the TTS to a file
+save(audio, "audio.mp3") 
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -2,8 +2,12 @@
 #define LLAMA_H

 #include "ggml.h"
-#include "ggml-backend.h"
-
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@ -35,11 +39,15 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 5
+#define LLAMA_SESSION_VERSION 2
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif

 #ifdef __cplusplus
 extern "C" {
@ -59,19 +67,8 @@ extern "C" {
    typedef int32_t llama_seq_id;

    enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-    };
-
-    // note: these values should be synchronized with ggml_rope
-    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
-    enum llama_rope_type {
-        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX =  2,
-        LLAMA_ROPE_TYPE_GLM  =  4,
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
    };

    enum llama_token_type {
@ -105,42 +102,16 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_XS        = 22, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_NL        = 25, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_S         = 26, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_M         = 27, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };

    enum llama_rope_scaling_type {
-        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
-        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
-        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
-        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
-    };
-
-    enum llama_pooling_type {
-        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
-        LLAMA_POOLING_TYPE_NONE = 0,
-        LLAMA_POOLING_TYPE_MEAN = 1,
-        LLAMA_POOLING_TYPE_CLS  = 2,
-    };
-
-    enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW     = 2, // split rows across GPUs
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
    };

    typedef struct llama_token_data {
@ -155,7 +126,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef void (*llama_progress_callback)(float progress, void *ctx);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@ -165,7 +136,7 @@ extern "C" {
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
    // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+    // - logits : if zero, the logits for the respective token will not be output
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@ -175,7 +146,7 @@ extern "C" {
        llama_pos    *  pos;
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
-        int8_t       *  logits; // TODO: rename this to "output"
+        int8_t       *  logits;

        // NOTE: helpers for smooth API transition - can be deprecated in the future
        //       for future-proof code, use the above fields instead and ignore everything below
@ -187,46 +158,16 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

-    enum llama_model_kv_override_type {
-        LLAMA_KV_OVERRIDE_TYPE_INT,
-        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
-        LLAMA_KV_OVERRIDE_TYPE_BOOL,
-    };
-
-    struct llama_model_kv_override {
-        char key[128];
-        enum llama_model_kv_override_type tag;
-        union {
-            int64_t int_value;
-            double float_value;
-            bool bool_value;
-        };
-    };
-
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
-        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
-        const float * tensor_split;
-
-        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-        // If the provided progress_callback returns true, model loading continues.
-        // If it returns false, model loading is immediately aborted.
+        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
-
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

-        // override key-value pairs of the model meta data
-        const struct llama_model_kv_override * kv_overrides;
-
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@ -236,56 +177,35 @@ extern "C" {
    struct llama_context_params {
        uint32_t seed;              // RNG seed, -1 for random
        uint32_t n_ctx;             // text context, 0 = from model
-        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
-        uint32_t n_ubatch;          // physical maximum batch size
-        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
+        uint32_t n_batch;           // prompt processing maximum batch size
        uint32_t n_threads;         // number of threads to use for generation
        uint32_t n_threads_batch;   // number of threads to use for batch processing
-
-        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
-        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
        float    yarn_attn_factor; // YaRN magnitude scaling factor
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
-
-        ggml_backend_sched_eval_callback cb_eval;
-        void * cb_eval_user_data;
-
-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache

        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-
-        // Abort callback
-        // if it returns true, execution of llama_decode() will be aborted
-        // currently works only with CPU execution
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
    };

    // model quantization parameters
    typedef struct llama_model_quantize_params {
-        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;              // quantize to this llama_ftype
-        enum ggml_type output_tensor_type;   // output tensor type
-        enum ggml_type token_embedding_type; // itoken embeddings tensor type
-        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                           // quantize all tensors to the default type
-        void * imatrix;                      // pointer to importance matrix data
-        void * kv_overrides;                 // pointer to vector containing overrides
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
    } llama_model_quantize_params;

    // grammar types
@ -336,12 +256,6 @@ extern "C" {
        int32_t n_eval;
    };

-    // used in chat template
-    typedef struct llama_chat_message {
-        const char * role;
-        const char * content;
-    } llama_chat_message;
-
    // Helpers for getting default parameters
    LLAMA_API struct llama_model_params llama_model_default_params(void);
    LLAMA_API struct llama_context_params llama_context_default_params(void);
@ -350,10 +264,7 @@ extern "C" {
    // Initialize the llama + ggml backend
    // If numa is true, use NUMA optimizations
    // Call once at the start of the program
-    LLAMA_API void llama_backend_init(void);
-
-    //optional:
-    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+    LLAMA_API void llama_backend_init(bool numa);

    // Call once at the end of the program - currently only used for MPI
    LLAMA_API void llama_backend_free(void);
@ -373,49 +284,25 @@ extern "C" {

    LLAMA_API int64_t llama_time_us(void);

-    LLAMA_API size_t llama_max_devices(void);
-
-    LLAMA_API bool llama_supports_mmap       (void);
-    LLAMA_API bool llama_supports_mlock      (void);
-    LLAMA_API bool llama_supports_gpu_offload(void);
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);

    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

-    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
-    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
-    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);

-    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

-    // Functions to access the model's GGUF metadata scalar values
-    // - The functions return the length of the string on success, or -1 on failure
-    // - The output string is always null-terminated and cleared on failure
-    // - GGUF array values are not supported by these functions
-
-    // Get metadata value as a string by key name
-    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
-
-    // Get the number of metadata key/value pairs
-    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
-
-    // Get metadata key name by index
-    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
-
-    // Get metadata value as a string by index
-    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
-
    // Get a string describing the model type
-    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@ -427,7 +314,7 @@ extern "C" {
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

    // Returns 0 on success
-    LLAMA_API uint32_t llama_model_quantize(
+    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            const llama_model_quantize_params * params);
@ -438,85 +325,28 @@ extern "C" {
    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
    // will be applied on top of the previous one
    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-                          const char * path_lora,
-                               float   scale,
-                          const char * path_base_model,
-                             int32_t   n_threads);
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                           float   scale,
+                      const char * path_base_model,
+                             int   n_threads),
+            "use llama_model_apply_lora_from_file instead");

-    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
-    // the currently loaded vector.
-    // n_embd should be the size of a single layer's control, and data should point
-    // to an n_embd x n_layers buffer starting from layer 1.
-    // il_start and il_end are the layer range the vector should apply to (both inclusive)
-    // See llama_control_vector_load in common to load a control vector.
-    LLAMA_API int32_t llama_control_vector_apply(
-            struct llama_context * lctx,
-                     const float * data,
-                          size_t   len,
-                         int32_t   n_embd,
-                         int32_t   il_start,
-                         int32_t   il_end);
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                      const char * path_lora,
+                           float   scale,
+                      const char * path_base_model,
+                             int   n_threads);

    //
    // KV cache
    //

-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_seq_max;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_seq_max items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    // Returns the number of tokens in the KV cache
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");

    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
@ -526,7 +356,7 @@ extern "C" {
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API void llama_kv_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -549,45 +379,16 @@ extern "C" {
                    llama_seq_id   seq_id);

    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    // If the KV cache is RoPEd, the KV data is updated accordingly
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_cache_seq_shift(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                       llama_pos   delta);

-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d);
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
-
    //
    // State / sessions
    //
@ -607,7 +408,7 @@ extern "C" {
    // Returns the number of bytes read
    LLAMA_API size_t llama_set_state_data(
            struct llama_context * ctx,
-                   const uint8_t * src);
+                         uint8_t * src);

    // Save/load session file
    LLAMA_API bool llama_load_session_file(
@ -627,6 +428,27 @@ extern "C" {
    // Decoding
    //

+    // Run the llama inference to obtain the logits and probabilities for the next token(s).
+    // tokens + n_tokens is the provided batch of new tokens to process
+    // n_past is the number of tokens to use from previous eval calls
+    // Returns 0 on success
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval(
+            struct llama_context * ctx,
+                     llama_token * tokens,
+                         int32_t   n_tokens,
+                             int   n_past),
+            "use llama_decode() instead");
+
+    // Same as llama_eval, but use float matrix input directly.
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval_embd(
+            struct llama_context * ctx,
+                           float * embd,
+                         int32_t   n_tokens,
+                             int   n_past),
+            "use llama_decode() instead");
+
    // Return batch for single sequence of tokens starting at pos_0
    //
    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@ -656,7 +478,7 @@ extern "C" {
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
    // < 0 - error
-    LLAMA_API int32_t llama_decode(
+    LLAMA_API int llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);

@ -665,49 +487,21 @@ extern "C" {
    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

-    // Set whether to use causal attention or not
-    // If set to true, the model will only attend to the past tokens
-    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
-
-    // Set abort callback
-    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    // Wait until all computations are finished
-    // This is automatically done when using one of the functions below to obtain the computation results
-    // and is not necessary to call it explicitly in most cases
-    LLAMA_API void llama_synchronize(struct llama_context * ctx);
-
-    // Token logits obtained from the last call to llama_decode()
-    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
-    // in the order they have appeared in the batch.
-    // Rows: number of tokens for which llama_batch.logits[i] != 0
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Logits for which llama_batch.logits[i] == 0 are undefined
+    // Rows: n_tokens provided with llama_batch
    // Cols: n_vocab
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

    // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
-    // returns NULL for invalid ids.
+    // llama_get_logits(ctx) + i*n_vocab
    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

-    // Get all output token embeddings.
-    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
-    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
-    // in the order they have appeared in the batch.
-    // shape: [n_outputs*n_embd]
-    // Otherwise, returns NULL.
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-    // Get the embeddings for the ith token. Equivalent to:
-    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
-    // shape: [n_embd] (1-dimensional)
-    // returns NULL for invalid ids.
-    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the embeddings for a sequence id
-    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
-
    //
    // Vocab
    //
@ -723,12 +517,6 @@ extern "C" {
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
-
    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ -741,16 +529,16 @@ extern "C" {

    /// @details Convert the provided text into tokens.
    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_tokens_max
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
    ///                Does not insert a leading space.
-    LLAMA_API int32_t llama_tokenize(
+    LLAMA_API int llama_tokenize(
        const struct llama_model * model,
                      const char * text,
-                         int32_t   text_len,
+                             int   text_len,
                     llama_token * tokens,
-                         int32_t   n_tokens_max,
+                             int   n_max_tokens,
                            bool   add_bos,
                            bool   special);

@ -758,30 +546,11 @@ extern "C" {
    // Uses the vocabulary in the provided context.
    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int32_t llama_token_to_piece(
+    LLAMA_API int llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
-                               int32_t   length);
-
-    /// Apply chat template. Inspired by hf apply_chat_template() on python.
-    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
-    /// @param chat Pointer to a list of multiple llama_chat_message
-    /// @param n_msg Number of llama_chat_message in this chat
-    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
-    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
-    /// @param length The size of the allocated buffer
-    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
-    LLAMA_API int32_t llama_chat_apply_template(
-              const struct llama_model * model,
-                            const char * tmpl,
-       const struct llama_chat_message * chat,
-                                size_t   n_msg,
-                                  bool   add_ass,
-                                  char * buf,
-                               int32_t   length);
+                                  int    length);

    //
    // Grammar
@ -815,13 +584,13 @@ extern "C" {
                           float   penalty_present);

    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param logits Logits extracted from the original generation context.
-    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_apply_guidance(
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @param guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
              struct llama_context * ctx,
-                             float * logits,
-                             float * logits_guidance,
+            llama_token_data_array * candidates,
+              struct llama_context * guidance_ctx,
                             float   scale);

    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@ -833,7 +602,7 @@ extern "C" {
    LLAMA_API void llama_sample_top_k(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
-                         int32_t   k,
+                             int   k,
                          size_t   min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@ -864,19 +633,17 @@ extern "C" {
                           float   p,
                          size_t   min_keep);

-    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API void llama_sample_entropy(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates_p,
-                           float   min_temp,
-                           float   max_temp,
-                           float   exponent_val);
-
    LLAMA_API void llama_sample_temp(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
                           float   temp);

+    LLAMA_API DEPRECATED(void llama_sample_temperature(
+                struct llama_context * ctx,
+              llama_token_data_array * candidates,
+                               float   temp),
+            "use llama_sample_temp instead");
+
    /// @details Apply constraints from grammar
    LLAMA_API void llama_sample_grammar(
            struct llama_context * ctx,
@ -894,7 +661,7 @@ extern "C" {
          llama_token_data_array * candidates,
                           float   tau,
                           float   eta,
-                         int32_t   m,
+                             int   m,
                           float * mu);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@ -967,18 +734,8 @@ extern "C" {
        llama_beam_search_callback_fn_t   callback,
                                   void * callback_data,
                                 size_t   n_beams,
-                                int32_t   n_past,
-                                int32_t   n_predict);
-
-    /// @details Build a split GGUF final path for this chunk.
-    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
-    //  Returns the split_path length.
-    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
-
-    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
-    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
-    //  Returns the split_prefix length.
-    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+                                    int   n_past,
+                                    int   n_predict);

    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
@ -1007,38 +764,10 @@ extern "C" {

 struct ggml_tensor;

-struct llama_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>>   rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8                                      partial_utf8;
-};
-
-struct llama_grammar_candidate {
-    size_t               index;
-    const uint32_t     * code_points;
-    llama_partial_utf8   partial_utf8;
-};
-
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
 );

-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t                                                  chr);
-
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const std::string & src,
-        llama_partial_utf8   partial_start);
-
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
--- a/examples/talk-llama/speak
+++ b/examples/talk-llama/speak
@ -1,40 +1,24 @@
 #!/bin/bash

 # Usage:
-#  speak <voice_id> <textfile>
+#  speak <voice_id> <text-to-speak>

-function installed() { command -v $1 >/dev/null 2>&1; }
-
-if installed espeak; then
-  espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
-
-elif installed piper && installed aplay; then
-  cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
+# espeak
+# Mac OS: brew install espeak
+# Linux: apt-get install espeak
+#
+#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"

 # for Mac
-elif installed say; then
-  say -f $2
+say "$2"

 # Eleven Labs
-elif installed python3 && \
-  python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
-  installed ffplay; then
-    # It's possible to use the API for free with limited number of characters.
-    # To increase this limit register to https://beta.elevenlabs.io to get an api key
-    # and paste it after 'ELEVEN_API_KEY='
-    # Keep the line commented to use the free version without api key
-    #export ELEVEN_API_KEY=your_api_key
-    wd=$(dirname $0)
-    script=$wd/eleven-labs.py
-    python3 $script -q -p -v $1 $2 >/dev/null 2>&1
-
-    # Uncomment to keep the audio file
-    #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
-    #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
-
-else
-  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
-  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
-  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
-  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
-fi
+# To use it, install the elevenlabs module from pip (pip install elevenlabs)
+# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
+# Keep the line commented to use the free version without an api key
+#
+#export ELEVEN_API_KEY=your_api_key
+#wd=$(dirname $0)
+#script=$wd/eleven-labs.py
+#python3 $script $1 "$2" >/dev/null 2>&1
+#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
--- a/examples/talk-llama/speak.bat
+++ b/examples/talk-llama/speak.bat
@ -1 +1 @@
-@powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2
+@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2
--- a/examples/talk-llama/speak.ps1
+++ b/examples/talk-llama/speak.ps1
@ -1,14 +1,12 @@
 # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
 param(
-  [Parameter(Mandatory=$true)][int]$voicenum,
-  [Parameter(Mandatory=$true)][string]$textfile
+  # voice options are David or Zira
+  [Parameter(Mandatory=$true)][string]$voice,
+  [Parameter(Mandatory=$true)][string]$text
 )

 Add-Type -AssemblyName System.Speech;
 $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
-$voiceoptions = $speak.GetInstalledVoices("en-US");
-$voice = $voiceoptions[$voicenum % $voiceoptions.count];
-$speak.SelectVoice($voice.VoiceInfo.Name);
+$speak.SelectVoice("Microsoft $voice Desktop");
 $speak.Rate="0";
-$text = Get-Content -Path $textfile;
 $speak.Speak($text);
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -14,7 +14,6 @@
 #include <thread>
 #include <vector>
 #include <regex>
-#include <sstream>

 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    auto * model = llama_get_model(ctx);
@ -54,7 +53,6 @@ struct whisper_params {
    int32_t capture_id = -1;
    int32_t max_tokens = 32;
    int32_t audio_ctx  = 0;
-    int32_t n_gpu_layers = 999;

    float vad_thold  = 0.6f;
    float freq_thold = 100.0f;
@ -68,14 +66,10 @@ struct whisper_params {
    bool use_gpu        = true;

    std::string person      = "Georgi";
-    std::string bot_name    = "LLaMA";
-    std::string wake_cmd    = "";
-    std::string heard_ok    = "";
    std::string language    = "en";
    std::string model_wsp   = "models/ggml-base.en.bin";
    std::string model_llama = "models/ggml-llama-7B.bin";
    std::string speak       = "./examples/talk-llama/speak";
-    std::string speak_file  = "./examples/talk-llama/to_speak.txt";
    std::string prompt      = "";
    std::string fname_out;
    std::string path_session = "";       // path to file for saving/loading model eval state
@ -96,7 +90,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-c"   || arg == "--capture")        { params.capture_id     = std::stoi(argv[++i]); }
        else if (arg == "-mt"  || arg == "--max-tokens")     { params.max_tokens     = std::stoi(argv[++i]); }
        else if (arg == "-ac"  || arg == "--audio-ctx")      { params.audio_ctx      = std::stoi(argv[++i]); }
-        else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
        else if (arg == "-su"  || arg == "--speed-up")       { params.speed_up       = true; }
@ -106,15 +99,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
        else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
        else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
-        else if (arg == "-bn"   || arg == "--bot-name")      { params.bot_name       = argv[++i]; }
-        else if (arg == "--session")                         { params.path_session   = argv[++i]; }
-        else if (arg == "-w"   || arg == "--wake-command")   { params.wake_cmd       = argv[++i]; }
-        else if (arg == "-ho"  || arg == "--heard-ok")       { params.heard_ok       = argv[++i]; }
+        else if (arg == "--session")                         { params.path_session   = argv[++i];}
        else if (arg == "-l"   || arg == "--language")       { params.language       = argv[++i]; }
        else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = argv[++i]; }
        else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = argv[++i]; }
        else if (arg == "-s"   || arg == "--speak")          { params.speak          = argv[++i]; }
-        else if (arg == "-sf"  || arg == "--speak-file")     { params.speak_file     = argv[++i]; }
        else if (arg == "--prompt-file")                     {
            std::ifstream file(argv[++i]);
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
@ -145,7 +134,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -c ID,    --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
    fprintf(stderr, "  -mt N,    --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
    fprintf(stderr, "  -ac N,    --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
    fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
@ -155,14 +143,10 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-    fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",                       params.bot_name.c_str());
-    fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",               params.wake_cmd.c_str());
-    fprintf(stderr, "  -ho TEXT, --heard-ok TEXT  [%-7s] said by TTS before generating reply\n",         params.heard_ok.c_str());
    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
    fprintf(stderr, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
    fprintf(stderr, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
    fprintf(stderr, "  -s FILE,  --speak TEXT     [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  -sf FILE, --speak-file     [%-7s] file to pass to TTS\n",                         params.speak_file.c_str());
    fprintf(stderr, "  --prompt-file FNAME        [%-7s] file with custom prompt to start dialog\n",     "");
    fprintf(stderr, "  --session FNAME                   file to cache model state in (may be large!) (default: none)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
@ -237,18 +221,6 @@ std::string transcribe(
    return result;
 }

-std::vector<std::string> get_words(const std::string &txt) {
-    std::vector<std::string> words;
-
-    std::istringstream iss(txt);
-    std::string word;
-    while (iss >> word) {
-        words.push_back(word);
-    }
-
-    return words;
-}
-
 const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";

 const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
@ -284,20 +256,18 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);

    // llama init

-    llama_backend_init();
+    llama_backend_init(true);

    auto lmparams = llama_model_default_params();
    if (!params.use_gpu) {
        lmparams.n_gpu_layers = 0;
-    } else {
-        lmparams.n_gpu_layers = params.n_gpu_layers;
    }

    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
@ -307,6 +277,7 @@ int main(int argc, char ** argv) {
    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
+    lcparams.f16_kv     = true;
    lcparams.n_threads  = params.n_threads;

    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
@ -348,11 +319,12 @@ int main(int argc, char ** argv) {
    float prob0 = 0.0f;

    const std::string chat_symb = ":";
+    const std::string bot_name  = "LLaMA";

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

-    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", params.bot_name);
+    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);

    // construct the initial prompt for LLaMA inference
    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
@ -361,7 +333,7 @@ int main(int argc, char ** argv) {
    prompt_llama.insert(0, 1, ' ');

    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
-    prompt_llama = ::replace(prompt_llama, "{1}", params.bot_name);
+    prompt_llama = ::replace(prompt_llama, "{1}", bot_name);

    {
        // get time string
@ -391,8 +363,6 @@ int main(int argc, char ** argv) {

    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);

-    llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
-
    // init session
    std::string path_session = params.path_session;
    std::vector<llama_token> session_tokens;
@ -428,21 +398,8 @@ int main(int argc, char ** argv) {
    printf("\n");
    printf("%s : initializing - please wait ...\n", __func__);

-    // prepare batch
-    {
-        batch.n_tokens = embd_inp.size();
-
-        for (int i = 0; i < batch.n_tokens; i++) {
-            batch.token[i]     = embd_inp[i];
-            batch.pos[i]       = i;
-            batch.n_seq_id[i]  = 1;
-            batch.seq_id[i][0] = 0;
-            batch.logits[i]    = i == batch.n_tokens - 1;
-        }
-    }
-
-    if (llama_decode(ctx_llama, batch)) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0)) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
        return 1;
    }

@ -478,16 +435,6 @@ int main(int argc, char ** argv) {
    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);

    printf("%s : done! start speaking in the microphone\n", __func__);
-
-    // show wake command if enabled
-    const std::string wake_cmd = params.wake_cmd;
-    const int wake_cmd_length = get_words(wake_cmd).size();
-    const bool use_wake_cmd = wake_cmd_length > 0;
-
-    if (use_wake_cmd) {
-        printf("%s : the wake-up command is: '%s%s%s'\n", __func__, "\033[1m", wake_cmd.c_str(), "\033[0m");
-    }
-
    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
    fflush(stdout);
@ -533,38 +480,10 @@ int main(int argc, char ** argv) {

                audio.get(params.voice_ms, pcmf32_cur);

-                std::string all_heard;
-
-                if (!force_speak) {
-                    all_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
-                }
-
-                const auto words = get_words(all_heard);
-
-                std::string wake_cmd_heard;
                std::string text_heard;

-                for (int i = 0; i < (int) words.size(); ++i) {
-                    if (i < wake_cmd_length) {
-                        wake_cmd_heard += words[i] + " ";
-                    } else {
-                        text_heard += words[i] + " ";
-                    }
-                }
-
-                // check if audio starts with the wake-up command if enabled
-                if (use_wake_cmd) {
-                    const float sim = similarity(wake_cmd_heard, wake_cmd);
-
-                    if ((sim < 0.7f) || (text_heard.empty())) {
-                        audio.clear();
-                        continue;
-                    }
-                }
-
-                // optionally give audio feedback that the current text is being processed
-                if (!params.heard_ok.empty()) {
-                    speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
+                if (!force_speak) {
+                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
                }

                // remove text between brackets using regex
@ -601,7 +520,7 @@ int main(int argc, char ** argv) {
                force_speak = false;

                text_heard.insert(0, 1, ' ');
-                text_heard += "\n" + params.bot_name + chat_symb;
+                text_heard += "\n" + bot_name + chat_symb;
                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
                fflush(stdout);

@ -662,21 +581,8 @@ int main(int argc, char ** argv) {
                            n_session_consumed = session_tokens.size();
                        }

-                        // prepare batch
-                        {
-                            batch.n_tokens = embd.size();
-
-                            for (int i = 0; i < batch.n_tokens; i++) {
-                                batch.token[i]     = embd[i];
-                                batch.pos[i]       = n_past + i;
-                                batch.n_seq_id[i]  = 1;
-                                batch.seq_id[i][0] = 0;
-                                batch.logits[i]    = i == batch.n_tokens - 1;
-                            }
-                        }
-
-                        if (llama_decode(ctx_llama, batch)) {
-                            fprintf(stderr, "%s : failed to decode\n", __func__);
+                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past)) {
+                            fprintf(stderr, "%s : failed to eval\n", __func__);
                            return 1;
                        }
                    }
@ -747,7 +653,6 @@ int main(int argc, char ** argv) {
                            text_to_speak += llama_token_to_piece(ctx_llama, id);

                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
-                            fflush(stdout);
                        }
                    }

@ -776,7 +681,11 @@ int main(int argc, char ** argv) {
                    }
                }

-                speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
+                text_to_speak = ::replace(text_to_speak, "\"", "");
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                if (ret != 0) {
+                    fprintf(stderr, "%s: failed to speak\n", __func__);
+                }

                audio.clear();
            }
--- a/examples/talk-llama/unicode-data.cpp
+++ b/examples/talk-llama/unicode-data.cpp
--- a/examples/talk-llama/unicode-data.h
+++ b/examples/talk-llama/unicode-data.h
@ -1,16 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <map>
-#include <utility>
-#include <vector>
-
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
-extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
-extern const std::map<char32_t, char32_t> unicode_map_lowercase;
--- a/examples/talk-llama/unicode.cpp
+++ b/examples/talk-llama/unicode.cpp
@ -1,277 +0,0 @@
-#include "unicode.h"
-#include "unicode-data.h"
-
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <map>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
-    std::string result;
-    for (size_t i = 0; i < cps.size(); ++i) {
-        result.append(unicode_cpt_to_utf8(cps[i]));
-    }
-    return result;
-}
-
-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
-    assert(offset < utf8.size());
-    if (!(utf8[offset + 0] & 0x80)) {
-        auto result = utf8[offset + 0];
-        offset += 1;
-        return result;
-    }
-    if (!(utf8[offset + 0] & 0x40)) {
-        throw std::invalid_argument("invalid character");
-    }
-    if (!(utf8[offset + 0] & 0x20)) {
-        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
-            throw std::invalid_argument("invalid character");
-        }
-        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
-        offset += 2;
-        return result;
-    }
-    if (!(utf8[offset + 0] & 0x10)) {
-        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
-            throw std::invalid_argument("invalid character");
-        }
-        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
-        offset += 3;
-        return result;
-    }
-    if (!(utf8[offset + 0] & 0x08)) {
-        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
-            throw std::invalid_argument("invalid character");
-        }
-        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
-        offset += 4;
-        return result;
-    }
-    throw std::invalid_argument("invalid string");
-}
-
-static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
-    std::vector<uint16_t> result;
-    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-        result.emplace_back(cp);
-    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
-    }
-    else {
-        throw std::invalid_argument("invalid cpt");
-    }
-    return result;
-}
-
-//static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
-//    std::vector<uint16_t> result;
-//    for (size_t i = 0; i < cps.size(); ++i) {
-//        auto temp = unicode_cpt_to_utf16(cps[i]);
-//        result.insert(result.end(), temp.begin(), temp.end());
-//    }
-//    return result;
-//}
-
-static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
-    assert(offset < utf16.size());
-    if (((utf16[0] >> 10) << 10) != 0xd800) {
-        auto result = utf16[offset + 0];
-        offset += 1;
-        return result;
-    }
-
-    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
-        throw std::invalid_argument("invalid character");
-    }
-
-    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-    offset += 2;
-    return result;
-}
-
-//static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
-//    std::vector<uint32_t> result;
-//    size_t offset = 0;
-//    while (offset < utf16.size()) {
-//        result.push_back(cpt_from_utf16(utf16, offset));
-//    }
-//    return result;
-//}
-
-static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
-    std::unordered_map<uint32_t, int> cpt_types;
-    for (auto p : unicode_ranges_digit) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_DIGIT;
-        }
-    }
-    for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_LETTER;
-        }
-    }
-    for (auto p : unicode_ranges_whitespace) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
-        }
-    }
-    for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
-        }
-    }
-    for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
-        }
-    }
-    for  (auto p : unicode_ranges_symbol) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_SYMBOL;
-        }
-    }
-    for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_CONTROL;
-        }
-    }
-    return cpt_types;
-}
-
-static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
-    std::unordered_map<uint8_t, std::string> map;
-    for (int ch = u'!'; ch <= u'~'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[ch] = unicode_cpt_to_utf8(ch);
-    }
-    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[ch] = unicode_cpt_to_utf8(ch);
-    }
-    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[ch] = unicode_cpt_to_utf8(ch);
-    }
-    auto n = 0;
-    for (int ch = 0; ch < 256; ++ch) {
-        if (map.find(ch) == map.end()) {
-            map[ch] = unicode_cpt_to_utf8(256 + n);
-            ++n;
-        }
-    }
-    return map;
-}
-
-static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
-    std::unordered_map<std::string, uint8_t> map;
-    for (int ch = u'!'; ch <= u'~'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[unicode_cpt_to_utf8(ch)] = ch;
-    }
-    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[unicode_cpt_to_utf8(ch)] = ch;
-    }
-    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[unicode_cpt_to_utf8(ch)] = ch;
-    }
-    auto n = 0;
-    for (int ch = 0; ch < 256; ++ch) {
-        if (map.find(unicode_cpt_to_utf8(ch)) == map.end()) {
-            map[unicode_cpt_to_utf8(256 + n)] = ch;
-            ++n;
-        }
-    }
-    return map;
-}
-
-//
-// interface
-//
-
-std::string unicode_cpt_to_utf8(uint32_t cp) {
-    std::string result;
-    if (/* 0x00 <= cp && */ cp <= 0x7f) {
-        result.push_back(cp);
-    }
-    else if (0x80 <= cp && cp <= 0x7ff) {
-        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
-        result.push_back(0x80 | (cp & 0x3f));
-    }
-    else if (0x800 <= cp && cp <= 0xffff) {
-        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
-    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.push_back(0xf0 | ((cp >> 18) & 0x07));
-        result.push_back(0x80 | ((cp >> 12) & 0x3f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
-    }
-    else {
-        throw std::invalid_argument("invalid codepoint");
-    }
-    return result;
-}
-
-std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
-    std::vector<uint32_t> result;
-    result.reserve(cpts.size());
-    for (size_t i = 0; i < cpts.size(); ++i) {
-        auto it = unicode_map_nfd.find(cpts[i]);
-        if (it == unicode_map_nfd.end()) {
-            result.push_back(cpts[i]);
-        } else {
-            result.push_back(it->second);
-        }
-    }
-    return result;
-}
-
-std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
-    std::vector<uint32_t> result;
-    size_t offset = 0;
-    while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
-    }
-    return result;
-}
-
-int unicode_cpt_type(uint32_t cp) {
-    static std::unordered_map<uint32_t, int> cpt_types = unicode_cpt_type_map();
-    const auto it = cpt_types.find(cp);
-    return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
-}
-
-int unicode_cpt_type(const std::string & utf8) {
-    if (utf8.length() == 0) {
-        return CODEPOINT_TYPE_UNIDENTIFIED;
-    }
-    size_t offset = 0;
-    return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
-}
-
-std::string unicode_byte_to_utf8(uint8_t byte) {
-    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
-    return map.at(byte);
-}
-
-uint8_t unicode_utf8_to_byte(const std::string & utf8) {
-    static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
-    return map.at(utf8);
-}
-
-char32_t unicode_tolower(char32_t cp) {
-    auto it = unicode_map_lowercase.find(cp);
-    return it == unicode_map_lowercase.end() ? cp : it->second;
-}
--- a/examples/talk-llama/unicode.h
+++ b/examples/talk-llama/unicode.h
@ -1,28 +1,462 @@
-#pragma once
+#pragma once

-#include <cstdint>
-#include <string>
+#include <cassert>
+#include <stdexcept>
 #include <vector>
+#include <unordered_map>
+
+static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
+{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
+{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
+{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
+{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
+{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
+{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
+{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
+{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
+};
+
+static const std::vector<std::pair<uint32_t, uint32_t>> letter_ranges = {
+{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
+{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
+{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
+{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
+{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
+{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
+{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
+{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
+{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
+{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
+{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
+{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
+{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
+{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
+{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
+{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
+{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
+{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
+{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
+{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
+{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
+{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
+{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
+{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
+{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
+{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
+{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
+{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
+{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
+{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
+{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
+{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
+{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
+{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
+{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
+{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
+{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
+{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
+{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
+{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
+{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
+{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
+{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
+{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
+{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
+{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
+{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
+{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
+{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
+{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
+{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
+{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
+{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
+{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
+{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
+{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
+{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
+{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
+};
+
+static const std::vector<std::pair<uint32_t, uint32_t>> whitespace_ranges = {
+{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
+};
+
+static const std::vector<std::pair<uint32_t, uint32_t>> accent_mark_ranges = {
+{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
+{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
+{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
+{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
+{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
+{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
+{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
+{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
+{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
+{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
+{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
+{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
+{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
+{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
+{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
+{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
+{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
+{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
+{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
+{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
+{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
+{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
+{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
+{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
+{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
+{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
+{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
+};
+
+static const std::vector<std::pair<uint32_t, uint32_t>> punctuation_ranges = {
+{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
+{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
+{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
+{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
+{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
+{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
+{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
+{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
+{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
+{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
+{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
+{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
+{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
+{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
+{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
+{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
+{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
+};
+
+static const std::vector<std::pair<uint32_t, uint32_t>> symbol_ranges = {
+{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
+{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
+{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
+{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
+{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
+{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
+{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
+{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
+{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
+{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
+{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
+{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
+{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
+{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
+{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
+{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
+{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
+{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
+{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
+{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
+};
+
+static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
+{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
+{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
+{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
+{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
+{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
+{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
+{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
+{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
+{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
+{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
+{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
+{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
+{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
+{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
+{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
+{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
+{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
+{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
+{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
+{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
+{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
+{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
+{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
+{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
+{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
+{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
+{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
+{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
+{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
+{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
+{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
+{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
+{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
+{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
+{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
+{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
+{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
+{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
+{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
+{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
+{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
+{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
+{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
+{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
+{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
+{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
+{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
+{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
+{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
+{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
+{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
+{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
+{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
+{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
+{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
+{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
+{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
+{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
+{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
+{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
+{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
+{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
+{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
+{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
+};
+
+// Encodes one Unicode code point as a UTF-8 byte sequence of 1 to 4 bytes.
+// Throws std::invalid_argument("invalid codepoint") for values above 0x10ffff.
+static std::string codepoint_to_utf8(uint32_t cp) {
+    std::string utf8;
+    if (cp < 0x80) {
+        // 0xxxxxxx
+        utf8.push_back(cp);
+        return utf8;
+    }
+    if (cp < 0x800) {
+        // 110xxxxx 10xxxxxx
+        utf8.push_back(0xc0 | ((cp >> 6) & 0x1f));
+        utf8.push_back(0x80 | (cp & 0x3f));
+        return utf8;
+    }
+    if (cp < 0x10000) {
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        utf8.push_back(0xe0 | ((cp >> 12) & 0x0f));
+        utf8.push_back(0x80 | ((cp >> 6) & 0x3f));
+        utf8.push_back(0x80 | (cp & 0x3f));
+        return utf8;
+    }
+    if (cp < 0x110000) {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        utf8.push_back(0xf0 | ((cp >> 18) & 0x07));
+        utf8.push_back(0x80 | ((cp >> 12) & 0x3f));
+        utf8.push_back(0x80 | ((cp >> 6) & 0x3f));
+        utf8.push_back(0x80 | (cp & 0x3f));
+        return utf8;
+    }
+    throw std::invalid_argument("invalid codepoint");
+}
+
+// Concatenates the UTF-8 encoding of every code point in cps, in order.
+// Propagates the exception thrown by codepoint_to_utf8 for an invalid code point.
+static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
+    std::string utf8;
+    for (const uint32_t cp : cps) {
+        utf8 += codepoint_to_utf8(cp);
+    }
+    return utf8;
+}
+
+// Decodes the UTF-8 sequence starting at utf8[offset] and advances offset past it.
+// Throws std::invalid_argument("invalid character") for a stray continuation byte,
+// a truncated sequence or a malformed continuation byte, and
+// std::invalid_argument("invalid string") for a lead byte of the form 11111xxx.
+// offset is left unchanged when an exception is thrown.
+static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
+    assert(offset < utf8.size());
+    const uint8_t lead = static_cast<uint8_t>(utf8[offset]);
+    if (lead < 0x80) {
+        // single-byte sequence
+        offset += 1;
+        return lead;
+    }
+    if (lead < 0xc0) {
+        // 10xxxxxx is a continuation byte and cannot start a sequence
+        throw std::invalid_argument("invalid character");
+    }
+    if (lead >= 0xf8) {
+        throw std::invalid_argument("invalid string");
+    }
+    // number of continuation bytes announced by the lead byte
+    const size_t n_trail = lead < 0xe0 ? 1 : (lead < 0xf0 ? 2 : 3);
+    if (offset + n_trail >= utf8.size()) {
+        throw std::invalid_argument("invalid character");
+    }
+    // payload bits of the lead byte: mask is 0x1f, 0x0f or 0x07
+    uint32_t cp = lead & (0x3f >> n_trail);
+    for (size_t i = 1; i <= n_trail; ++i) {
+        const uint8_t trail = static_cast<uint8_t>(utf8[offset + i]);
+        if ((trail & 0xc0) != 0x80) {
+            throw std::invalid_argument("invalid character");
+        }
+        cp = (cp << 6) | (trail & 0x3f);
+    }
+    offset += n_trail + 1;
+    return cp;
+}
+
+// Decodes a whole UTF-8 string into its code points, in order.
+// Propagates the exception thrown by codepoint_from_utf8 on malformed input.
+static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
+    std::vector<uint32_t> cps;
+    // codepoint_from_utf8 advances pos itself, so the loop has no increment step
+    for (size_t pos = 0; pos < utf8.size(); ) {
+        cps.push_back(codepoint_from_utf8(utf8, pos));
+    }
+    return cps;
+}
+
+// Encodes one Unicode code point as UTF-16: a single unit up to 0xffff,
+// otherwise a high/low surrogate pair.
+// Throws std::invalid_argument("invalid codepoint") for values above 0x10ffff.
+static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
+    if (cp > 0x10ffff) {
+        throw std::invalid_argument("invalid codepoint");
+    }
+    std::vector<uint16_t> units;
+    if (cp <= 0xffff) {
+        units.push_back(static_cast<uint16_t>(cp));
+    } else {
+        // 20-bit value split into two 10-bit halves
+        const uint32_t v = cp - 0x10000;
+        units.push_back(static_cast<uint16_t>(0xd800 | (v >> 10)));
+        units.push_back(static_cast<uint16_t>(0xdc00 | (v & 0x03ff)));
+    }
+    return units;
+}
+
+// Concatenates the UTF-16 encoding of every code point in cps, in order.
+// Propagates the exception thrown by codepoint_to_utf16 for an invalid code point.
+static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
+    std::vector<uint16_t> units;
+    for (const uint32_t cp : cps) {
+        const std::vector<uint16_t> encoded = codepoint_to_utf16(cp);
+        units.insert(units.end(), encoded.begin(), encoded.end());
+    }
+    return units;
+}
+
+// Decodes the UTF-16 sequence starting at utf16[offset] and advances offset past it.
+// A unit that is not a high surrogate (0xd800..0xdbff) is returned as-is.
+// A high surrogate must be followed by a low surrogate (0xdc00..0xdfff);
+// otherwise std::invalid_argument("invalid character") is thrown and offset is unchanged.
+static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
+    assert(offset < utf16.size());
+    // read relative to offset, not from the start of the buffer
+    const uint16_t lead = utf16[offset];
+    if ((lead & 0xfc00) != 0xd800) {
+        offset += 1;
+        return lead;
+    }
+    // the top six bits must be exactly 110111 for a low surrogate
+    if (offset + 1 >= utf16.size() || (utf16[offset + 1] & 0xfc00) != 0xdc00) {
+        throw std::invalid_argument("invalid character");
+    }
+    const uint32_t hi = lead & 0x03ff;
+    const uint32_t lo = utf16[offset + 1] & 0x03ff;
+    offset += 2;
+    return 0x10000 + ((hi << 10) | lo);
+}
+
+// Decodes a whole UTF-16 buffer into its code points, in order.
+// Propagates the exception thrown by codepoint_from_utf16 on malformed input.
+static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
+    std::vector<uint32_t> cps;
+    // codepoint_from_utf16 advances pos itself, so the loop has no increment step
+    for (size_t pos = 0; pos < utf16.size(); ) {
+        cps.push_back(codepoint_from_utf16(utf16, pos));
+    }
+    return cps;
+}

 #define CODEPOINT_TYPE_UNIDENTIFIED 0
-#define CODEPOINT_TYPE_DIGIT        1
-#define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_WHITESPACE   3
-#define CODEPOINT_TYPE_ACCENT_MARK  4
-#define CODEPOINT_TYPE_PUNCTUATION  5
-#define CODEPOINT_TYPE_SYMBOL       6
-#define CODEPOINT_TYPE_CONTROL      7
+#define CODEPOINT_TYPE_DIGIT 1
+#define CODEPOINT_TYPE_LETTER 2
+#define CODEPOINT_TYPE_WHITESPACE 3
+#define CODEPOINT_TYPE_ACCENT_MARK 4
+#define CODEPOINT_TYPE_PUNCTUATION 5
+#define CODEPOINT_TYPE_SYMBOL 6
+#define CODEPOINT_TYPE_CONTROL 7

-std::string unicode_cpt_to_utf8(uint32_t cp);
-std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
+// Tags every code point inside each inclusive [first, second] range with `type`,
+// overwriting any tag stored earlier for the same code point.
+template <typename Ranges>
+static void codepoint_type_map_fill(std::unordered_map<uint32_t, int> & types, const Ranges & ranges, int type) {
+    for (const auto & range : ranges) {
+        for (auto cp = range.first; cp <= range.second; ++cp) {
+            types[cp] = type;
+        }
+    }
+}
+
+// Expands the range tables into a code point -> CODEPOINT_TYPE_* lookup table.
+// Tables are applied in the order below, so a later table wins where ranges overlap.
+static std::unordered_map<uint32_t, int> codepoint_type_map() {
+    std::unordered_map<uint32_t, int> types;
+    codepoint_type_map_fill(types, digit_ranges,       CODEPOINT_TYPE_DIGIT);
+    codepoint_type_map_fill(types, letter_ranges,      CODEPOINT_TYPE_LETTER);
+    codepoint_type_map_fill(types, whitespace_ranges,  CODEPOINT_TYPE_WHITESPACE);
+    codepoint_type_map_fill(types, accent_mark_ranges, CODEPOINT_TYPE_ACCENT_MARK);
+    codepoint_type_map_fill(types, punctuation_ranges, CODEPOINT_TYPE_PUNCTUATION);
+    codepoint_type_map_fill(types, symbol_ranges,      CODEPOINT_TYPE_SYMBOL);
+    codepoint_type_map_fill(types, control_ranges,     CODEPOINT_TYPE_CONTROL);
+    return types;
+}

-std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
+// Returns the CODEPOINT_TYPE_* category of a code point, or
+// CODEPOINT_TYPE_UNIDENTIFIED when no range table covers it.
+static int codepoint_type(uint32_t cp) {
+    // The table is built on the first call and never modified afterwards.
+    // find() keeps the lookup read-only: operator[] would insert a zero entry
+    // for every unknown code point that is queried.
+    static const std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
+    const auto it = codepoint_types.find(cp);
+    return it == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
+}

-int unicode_cpt_type(uint32_t cp);
-int unicode_cpt_type(const std::string & utf8);
+// Returns the CODEPOINT_TYPE_* category of the first code point encoded in utf8,
+// or CODEPOINT_TYPE_UNIDENTIFIED for an empty string.
+// Propagates the exception thrown by codepoint_from_utf8 on malformed input.
+static int codepoint_type(const std::string & utf8) {
+    if (utf8.empty()) {
+        return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
+    size_t pos = 0;
+    const uint32_t first_cp = codepoint_from_utf8(utf8, pos);
+    return codepoint_type(first_cp);
+}

-std::string unicode_byte_to_utf8(uint8_t byte);
-uint8_t unicode_utf8_to_byte(const std::string & utf8);
+// Builds the byte -> UTF-8 string table used for byte-level BPE.
+// Bytes in '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the UTF-8 encoding of the code
+// point with the same value; every remaining byte, in ascending order, maps to
+// the next code point counting up from 256.
+static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
+    std::unordered_map<uint8_t, std::string> byte_to_str;
+    const auto add_identity = [&byte_to_str](int first, int last) {
+        for (int b = first; b <= last; ++b) {
+            assert(0 <= b && b < 256);
+            byte_to_str[b] = codepoint_to_utf8(b);
+        }
+    };
+    add_identity(u'!', u'~');
+    add_identity(u'¡', u'¬');
+    add_identity(u'®', u'ÿ');
+    int n_remapped = 0;
+    for (int b = 0; b < 256; ++b) {
+        if (byte_to_str.count(b) != 0) {
+            continue;
+        }
+        byte_to_str[b] = codepoint_to_utf8(256 + n_remapped);
+        ++n_remapped;
+    }
+    return byte_to_str;
+}
+
+// Returns the UTF-8 string assigned to a byte by bytes_to_unicode_map_bpe().
+// The table is built on the first call and reused afterwards.
+static std::string bytes_to_unicode_bpe(uint8_t byte) {
+    static const std::unordered_map<uint8_t, std::string> table = bytes_to_unicode_map_bpe();
+    return table.at(byte);
+}
+
+// Builds the UTF-8 string -> byte table used for byte-level BPE.
+// The UTF-8 encodings of '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the byte with the
+// same value; for every remaining byte, in ascending order, the encoding of the
+// next code point counting up from 256 maps to that byte.
+static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() {
+    std::unordered_map<std::string, uint8_t> str_to_byte;
+    const auto add_identity = [&str_to_byte](int first, int last) {
+        for (int b = first; b <= last; ++b) {
+            assert(0 <= b && b < 256);
+            str_to_byte[codepoint_to_utf8(b)] = b;
+        }
+    };
+    add_identity(u'!', u'~');
+    add_identity(u'¡', u'¬');
+    add_identity(u'®', u'ÿ');
+    int n_remapped = 0;
+    for (int b = 0; b < 256; ++b) {
+        if (str_to_byte.count(codepoint_to_utf8(b)) != 0) {
+            continue;
+        }
+        str_to_byte[codepoint_to_utf8(256 + n_remapped)] = b;
+        ++n_remapped;
+    }
+    return str_to_byte;
+}
+
+// Returns the byte assigned to a UTF-8 string by unicode_to_bytes_map_bpe().
+// Throws std::out_of_range (from unordered_map::at) when utf8 is not in the table.
+// The table is built on the first call and reused afterwards.
+static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
+    static const std::unordered_map<std::string, uint8_t> table = unicode_to_bytes_map_bpe();
+    return table.at(utf8);
+}

-// simple tolower that only implements one-to-one mapping, not one-to-many
-char32_t unicode_tolower(char32_t cp);
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@ -29,6 +29,18 @@ std::string g_status_forced = "";

 std::vector<float> g_pcmf32;

+// Formats t as "MM:SS.mmm". t counts hundredths of a second (100 units per
+// second, as the division below shows).
+std::string to_timestamp(int64_t t) {
+    int64_t sec = t/100;
+    // the remainder is in 1/100 s; scale it so the 3-digit field holds real milliseconds
+    const int64_t msec = (t - sec*100)*10;
+    const int64_t min = sec/60;
+    sec = sec - min*60;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
+
+    return std::string(buf);
+}
+
 void talk_set_status(const std::string & status) {
    std::lock_guard<std::mutex> lock(g_mutex);
    g_status = status;
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -524,7 +524,8 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+                        );

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/.gitignore
+++ b/examples/talk/.gitignore
@ -1,2 +1 @@
 audio.mp3
-to_speak.txt
--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -11,13 +11,9 @@ Web version: [examples/talk.wasm](/examples/talk.wasm)
 The `talk` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:

 ```bash
-# Install SDL2
-# On Debian based linux distributions:
+# Install SDL2 on Linux
 sudo apt-get install libsdl2-dev

-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
 # Install SDL2 on Mac OS
 brew install sdl2

--- a/examples/talk/eleven-labs.py
+++ b/examples/talk/eleven-labs.py
@ -1,80 +1,20 @@
 import sys
-import argparse
-import textwrap
+import importlib.util

-parser = argparse.ArgumentParser(add_help=False,
-    formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument("-q", "--quick", action="store_true",
-    help="skip checking the required library")
-
-modes = parser.add_argument_group("action")
-modes.add_argument("inputfile", metavar="TEXTFILE",
-    nargs='?', type=argparse.FileType(), default=sys.stdin,
-    help="read the text file (default: stdin)")
-modes.add_argument("-l", "--list", action="store_true",
-    help="show the list of voices and exit")
-modes.add_argument("-h", "--help", action="help",
-    help="show this help and exit")
-
-selopts = parser.add_argument_group("voice selection")
-selmodes = selopts.add_mutually_exclusive_group()
-selmodes.add_argument("-n", "--name",
-    default="Arnold",
-    help="get a voice object by name (default: Arnold)")
-selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
-    help="get a voice object by number (see --list)")
-selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
-    default=["use case=narration"],
-    help=textwrap.dedent('''\
-        filter voices by labels (default: "use case=narration")
-        this option can be used multiple times
-        filtering will be disabled if the first -f has no "=" (e.g. -f "any")
-        '''))
-
-outmodes = parser.add_argument_group("output")
-outgroup = outmodes.add_mutually_exclusive_group()
-outgroup.add_argument("-s", "--save", metavar="FILE",
-    default="audio.mp3",
-    help="save the TTS to a file (default: audio.mp3)")
-outgroup.add_argument("-p", "--play", action="store_true",
-    help="play the TTS with ffplay")
-
-args = parser.parse_args()
-
-if not args.quick:
-    import importlib.util
-    if importlib.util.find_spec("elevenlabs") is None:
-        print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
-        sys.exit()
-
-from elevenlabs import voices, generate, play, save
-
-if args.filter and "=" in args.filter[0]:
-    voicelist = voices()
-    for f in args.filter:
-        label, value = f.split("=")
-        voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
-    voicelist = list(voicelist)
-else:
-    voicelist = list(voices())
-
-if args.list:
-    for i, v in enumerate(voicelist):
-        print(str(i) + ": " + v.name + " " + str(v.labels))
+if importlib.util.find_spec("elevenlabs") is None:
+    print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
    sys.exit()

-if args.voice:
-    voice = voicelist[args.voice % len(voicelist)]
-else:
-    voice = args.name
-    # if -n should consult -f, use the following
-    #voice = next(x for x in voicelist if x.name == args.name)
+from elevenlabs import generate, play, save

+# Get a Voice object, by name or UUID
+voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
+
+# Generate the TTS
 audio = generate(
-    text=str(args.inputfile.read()),
-    voice=voice
+  # Join the command-line words into one string; str() on the list would
+  # send its repr (brackets and quotes included) as the text to speak.
+  text=" ".join(sys.argv[2:]),
+  voice=voice
 )
-if args.play:
-    play(audio)
-else:
-    save(audio, args.save) 
+
+# Save the TTS to a file
+save(audio, "audio.mp3") 
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -525,7 +525,8 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+                        );

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/speak
+++ b/examples/talk/speak
@ -1,40 +1,24 @@
 #!/bin/bash

 # Usage:
-#  speak <voice_id> <textfile>
+#  speak.sh <voice_id> <text-to-speak>

-function installed() { command -v $1 >/dev/null 2>&1; }
+# espeak
+# Mac OS: brew install espeak
+# Linux: apt-get install espeak
+#
+#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"

-if installed espeak; then
-  espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
-
-elif installed piper && installed aplay; then
-  cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
-
-# for Mac
-elif installed say; then
-  say -f $2
+# Mac OS "say" command
+say "$2"

 # Eleven Labs
-elif installed python3 && \
-  python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
-  installed ffplay; then
-    # It's possible to use the API for free with limited number of characters.
-    # To increase this limit register to https://beta.elevenlabs.io to get an api key
-    # and paste it after 'ELEVEN_API_KEY='
-    # Keep the line commented to use the free version without api key
-    #export ELEVEN_API_KEY=your_api_key
-    wd=$(dirname $0)
-    script=$wd/eleven-labs.py
-    python3 $script -q -p -v $1 $2 >/dev/null 2>&1
-
-    # Uncomment to keep the audio file
-    #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
-    #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
-
-else
-  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
-  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
-  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
-  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
-fi
+# To use it, install the elevenlabs module from pip (pip install elevenlabs)
+# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
+#Keep the line commented to use the free version without api key
+#
+#export ELEVEN_API_KEY=your_api_key
+#wd=$(dirname $0)
+#script=$wd/eleven-labs.py
+#python3 $script $1 "$2"
+#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3
--- a/examples/talk/speak.ps1
+++ b/examples/talk/speak.ps1
@ -1,14 +1,12 @@
 # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
 param(
-  [Parameter(Mandatory=$true)][int]$voicenum,
-  [Parameter(Mandatory=$true)][string]$textfile
+  # voice options are David or Zira
+  [Parameter(Mandatory=$true)][string]$voice,
+  [Parameter(Mandatory=$true)][string]$text
 )

 Add-Type -AssemblyName System.Speech;
 $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
-$voiceoptions = $speak.GetInstalledVoices("en-US");
-$voice = $voiceoptions[$voicenum % $voiceoptions.count];
-$speak.SelectVoice($voice.VoiceInfo.Name);
+$speak.SelectVoice("Microsoft $voice Desktop");
 $speak.Rate="0";
-$text = Get-Content -Path $textfile;
 $speak.Speak($text);
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -38,7 +38,6 @@ struct whisper_params {
    std::string model_wsp = "models/ggml-base.en.bin";
    std::string model_gpt = "models/ggml-gpt-2-117M.bin";
    std::string speak     = "./examples/talk/speak";
-    std::string speak_file= "./examples/talk/to_speak.txt";
    std::string fname_out;
 };

@ -69,7 +68,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
        else if (arg == "-mg"  || arg == "--model-gpt")     { params.model_gpt     = argv[++i]; }
        else if (arg == "-s"   || arg == "--speak")         { params.speak         = argv[++i]; }
-        else if (arg == "-sf"  || arg == "--speak_file")    { params.speak_file    = argv[++i]; }
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -104,7 +102,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
    fprintf(stderr, "  -mg FILE, --model-gpt     [%-7s] gpt model file\n",                              params.model_gpt.c_str());
    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  -sf FILE, --speak_file    [%-7s] file to pass to TTS\n",                         params.speak_file.c_str());
    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "\n");
 }
@ -187,7 +184,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
@ -319,7 +316,7 @@ int main(int argc, char ** argv) {
                    std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);

                    text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
-                    //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
+                    text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
                    text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));

                    // remove first 2 lines of base prompt
@ -357,7 +354,10 @@ int main(int argc, char ** argv) {
                gpt2_set_prompt(ctx_gpt, prompt_base.c_str());

                text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-                speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                if (ret != 0) {
+                    fprintf(stderr, "%s: system() failed!\n", __func__);
+                }

                audio.clear();

--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@ -21,7 +21,7 @@ help()
    echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
    echo "options:"
    echo "-s       Step in seconds (default is $step)."
-    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
+    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
    echo "-t       Number of threads to use."
    echo "-h       Print this help page."
    echo
--- a/examples/wchess/CMakeLists.txt
+++ b/examples/wchess/CMakeLists.txt
@ -1,10 +0,0 @@
-add_subdirectory(libwchess)
-set_target_properties(wchess-core PROPERTIES FOLDER "libs")
-
-if (EMSCRIPTEN)
-    add_subdirectory(wchess.wasm)
-    set_target_properties(wchess.wasm PROPERTIES FOLDER "libs")
-else()
-    add_subdirectory(wchess.cmd)
-    set_target_properties(wchess PROPERTIES FOLDER "libs")
-endif()
--- a/examples/wchess/README.md
+++ b/examples/wchess/README.md
@ -1,45 +0,0 @@
-# wchess
-
-Voice-controlled chess using Whisper
-
-Online demo: https://whisper.ggerganov.com/wchess/
-
-https://github.com/ggerganov/whisper.cpp/assets/1991296/c2b2f03c-9684-49f3-8106-357d2d4e67fa
-
-## Command-line tool
-
-```bash
-mkdir build && cd build
-cmake -DWHISPER_SDL2=1 ..
-make -j
-
-./bin/wchess -m ../models/ggml-base.en.bin
-
-Move: start
-
-a b c d e f g h
-r n b q k b n r 8
-p p p p p p p p 7
-. * . * . * . * 6
-* . * . * . * . 5
-. * . * . * . * 4
-* . * . * . * . 3
-P P P P P P P P 2
-R N B Q K B N R 1
-
-White's turn
-[(l)isten/(p)ause/(q)uit]: 
-```
-
-## TODO
-
-- Fix bugs in the chess moves logic
-- Improve web-browser audio capture - sometimes it does not record the voice properly
-- Add support for more languages by making the generated grammar string multilingual
-- Explore ways to improve the dynamic grammar to be narrower
-
-PRs welcome!
-
-## Thanks
-
-- [chessboardjs](https://chessboardjs.com) for the neat chessboard JS library used in this demo
--- a/examples/wchess/libwchess/CMakeLists.txt
+++ b/examples/wchess/libwchess/CMakeLists.txt
@ -1,19 +0,0 @@
-add_library(wchess-core STATIC
-    WChess.cpp
-    WChess.h
-    Chessboard.cpp
-    Chessboard.h
-)
-
-target_link_libraries(wchess-core
-    PUBLIC
-    whisper
-    common
-)
-
-target_include_directories(wchess-core
-    PUBLIC
-    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
-)
-
-# add_executable(test-chessboard test-chessboard.cpp Chessboard.cpp)
--- a/examples/wchess/libwchess/Chessboard.cpp
+++ b/examples/wchess/libwchess/Chessboard.cpp
@ -1,803 +0,0 @@
-#include "Chessboard.h"
-
-#include <array>
-#include <vector>
-#include <algorithm>
-#include <cstring>
-#include <set>
-#include <list>
-#include <chrono>
-
-namespace {
-constexpr std::array<const char*, 64> positions = {
-    "a1", "b1", "c1", "d1", "e1", "f1", "g1", "h1",
-    "a2", "b2", "c2", "d2", "e2", "f2", "g2", "h2",
-    "a3", "b3", "c3", "d3", "e3", "f3", "g3", "h3",
-    "a4", "b4", "c4", "d4", "e4", "f4", "g4", "h4",
-    "a5", "b5", "c5", "d5", "e5", "f5", "g5", "h5",
-    "a6", "b6", "c6", "d6", "e6", "f6", "g6", "h6",
-    "a7", "b7", "c7", "d7", "e7", "f7", "g7", "h7",
-    "a8", "b8", "c8", "d8", "e8", "f8", "g8", "h8",
-};
-// Sentinel board index meaning "no square"; equals positions.size(), i.e. one past the last valid index.
-constexpr char INVALID_POS = positions.size();
-// R and F index the two characters of a coordinate string such as "e4":
-// R selects the letter ('a'..'h') and F selects the digit ('1'..'8').
-// NOTE(review): in usual chess terminology the letter is the file and the digit
-// is the rank, so these names look swapped; the code uses them consistently.
-constexpr int R = 0; // rank index
-constexpr int F = 1; // file index
-#define FILE (c[F] - '1')
-#define RANK (c[R] - 'a')
-// Converts a two-character coordinate ("a1".."h8") into an index into `positions`
-// (digit offset * 8 + letter offset). Yields INVALID_POS when fewer than two
-// characters are supplied or when either character is outside its range.
-constexpr char operator ""_P(const char * c, size_t size) {
-    return size < 2 || RANK < 0 || RANK > 7 ||
-        FILE < 0 || FILE > 7 ? INVALID_POS : FILE * 8 + RANK;
-}
-#undef FILE
-#undef RANK
-
-// Minimal non-owning view over a character range (pointer + length).
-struct sview {
-    const char * ptr = nullptr;
-    size_t size = 0;
-
-    sview() = default;
-    sview(const char * p, size_t s) : ptr(p), size(s) {}
-    // Views the characters of `s` without copying them.
-    sview(const std::string& s) : ptr(s.data()), size(s.size()) {}
-
-    // Returns the index of the first `del` at or after `pos`, or std::string::npos if there is none.
-    size_t find(char del, size_t pos) {
-        while (pos < size && ptr[pos] != del) ++pos;
-        return pos < size ? pos : std::string::npos;
-    }
-};
-
-// Splits `str` on `del` and returns the pieces as views into the same buffer.
-// A space found at the start of a piece is skipped before the next delimiter search.
-std::vector<sview> split(sview str, char del) {
-    std::vector<sview> res;
-    size_t cur = 0;
-    size_t last = 0;
-    while (cur != std::string::npos) {
-        // NOTE(review): str.ptr[last] is read without comparing `last` to str.size;
-        // presumably callers always pass a null-terminated buffer — confirm.
-        if (str.ptr[last] == ' ') {
-            ++last;
-            continue;
-        }
-        cur = str.find(del, last);
-        // the final piece runs to the end of the view
-        size_t len = cur == std::string::npos ? str.size - last : cur - last;
-        res.emplace_back(str.ptr + last, len);
-        last = cur + 1;
-    }
-    return res;
-}
-
-char strToPos(sview str) {
-    return operator ""_P(str.ptr, str.size);
-}
-
-constexpr std::array<const char*, 6> pieceNames =  {
-    "pawn", "knight", "bishop", "rook", "queen", "king",
-};
-
-static constexpr std::array<char, 6> blackShort =  {
-    'p', 'n', 'b', 'r', 'q', 'k',
-};
-static constexpr std::array<char, 6> whiteShort =  {
-    'P', 'N', 'B', 'R', 'Q', 'K',
-};
-
-// Maps a piece name to its index in pieceNames; returns pieceNames.size() when nothing matches.
-// Only the first str.size characters are compared, so a prefix of a name also
-// matches, and the first matching entry in pieceNames order wins.
-char strToType(sview str) {
-    auto it = std::find_if(pieceNames.begin(), pieceNames.end(), [str] (const char* name) { return strncmp(name, str.ptr, str.size) == 0; });
-    return it != pieceNames.end() ? it - pieceNames.begin() : pieceNames.size();
-}
-
-// directions
-using Direction = std::array<char, 2>;
-
-constexpr Direction N   = {(char)  0, (char)  1};
-constexpr Direction NNE = {(char)  1, (char)  2};
-constexpr Direction NE  = {(char)  1, (char)  1};
-constexpr Direction ENE = {(char)  2, (char)  1};
-constexpr Direction E   = {(char)  1, (char)  0};
-constexpr Direction ESE = {(char)  2, (char) -1};
-constexpr Direction SE  = {(char)  1, (char) -1};
-constexpr Direction SSE = {(char)  1, (char) -2};
-constexpr Direction S   = {(char)  0, (char) -1};
-constexpr Direction SSW = {(char) -1, (char) -2};
-constexpr Direction SW  = {(char) -1, (char) -1};
-constexpr Direction WSW = {(char) -2, (char) -1};
-constexpr Direction W   = {(char) -1, (char)  0};
-constexpr Direction WNW = {(char) -2, (char)  1};
-constexpr Direction NW  = {(char) -1, (char)  1};
-constexpr Direction NNW = {(char) -1, (char)  2};
-
-// Returns the square reached from `pos` by one step of `d`, or INVALID_POS when the step leaves the board.
-// `pos` is used to index `positions` without a bounds check, so it must be a valid square.
-char makeStep(char pos, const Direction& d) {
-    char next[2] = { char(positions[pos][R] + d[R]) , char(positions[pos][F] + d[F]) };
-    return strToPos(sview{next, sizeof(next)});
-}
-
-// Walks from `pos` along `d` for at most `count` steps, calling `m` on each square reached.
-// The walk ends early when a step leaves the board or when `m` returns true.
-// Returns the last square computed, which is INVALID_POS if the walk left the board.
-template<class Modifier>
-char traverse(char pos, const Direction& d, const Modifier& m, int count = 8) {
-    while (--count >= 0) {
-        pos = makeStep(pos, d);
-        // the INVALID_POS test comes first so `m` is never called with an off-board index
-        if (pos == INVALID_POS || m(pos)) break;
-    }
-    return pos;
-}
-
-// Reduces each component of `distance` to its sign (-1, 0 or 1).
-Direction normalize(const Direction& distance) {
-    //return {char((distance[R] > 0) - (distance[R] < 0)), char((distance[F] > 0) - (distance[F] < 0))};
-    const int drp = distance[R] > 0 ? 1 : 0;
-    const int drn = distance[R] < 0 ? 1 : 0;
-    const int dfp = distance[F] > 0 ? 1 : 0;
-    const int dfn = distance[F] < 0 ? 1 : 0;
-    return {char(drp - drn), char(dfp - dfn)};
-}
-
-struct Pin {
-    Direction d;
-    Piece* pinner;
-    Piece* pinned;
-};
-using Pins = std::list<Pin>;
-using Board = std::array<Piece*, 64>;
-
-// Restricts `directions` according to the pin direction `pin`.
-// A zero `pin` ({0, 0}) returns every direction unchanged; otherwise a direction
-// is kept only when each of its components equals the matching `pin` component
-// or its negation.
-std::vector<Direction> filter(const Direction& pin, std::initializer_list<Direction> directions) {
-    if (pin[R] == 0 && pin[F] == 0) return directions;
-    std::vector<Direction> result;
-    for (auto& d : directions) {
-        if ((d[R] == pin[R] || d[R] == -pin[R]) && (d[F] == pin[F] || d[F] == -pin[F])) result.push_back(d);
-    }
-    return result;
-}
-}
-
-// Abstract base for all chess pieces: stores the type, the color, the current
-// square and a cached set of squares the piece is allowed to move to.
-class Piece {
-public:
-    enum Types : char {
-        Pawn,
-        Knight,
-        Bishop,
-        Rook,
-        Queen,
-        King,
-        // number of piece types; also used as the "unknown type" value
-        NUM_PIECES
-    };
-
-    enum Colors : char {
-        White,
-        Black,
-    };
-
-    // Name of the piece type, taken from pieceNames.
-    const char* name() const;
-    // One-letter symbol: from whiteShort for White, from blackShort for Black.
-    char initial() const;
-    Types type() const { return m_type; }
-    Colors color() const { return m_color; }
-    char pos() const { return m_pos; }
-    // Moves the piece to `pos` and marks its cached moves as stale.
-    void setPos(char pos) {
-        m_pos = pos;
-        invalidate();
-    }
-    // Coordinate string of the current square, or "" when the position is INVALID_POS.
-    const char* coord() const;
-    const std::set<char>& allowed() const { return m_allowed; }
-    // True when `pos` fits movePattern() and is present in the cached allowed set.
-    bool canReach(char pos) const;
-    // Whether `pos` fits this piece type's movement shape from its current square.
-    virtual bool movePattern(char pos) const = 0;
-    // Removes the piece from play: position becomes INVALID_POS and the allowed set is emptied.
-    void take();
-    // Recomputes the allowed set from `state`; implementations do nothing unless invalidate() was called.
-    virtual void reinit(const State& state) = 0;
-    // Flags the cached allowed set as stale so the next reinit() recomputes it.
-    void invalidate();
-protected:
-    Piece(Types type, Colors color, char pos, std::set<char> allowed)
-        : m_type(type), m_color(color), m_pos(pos), m_allowed(std::move(allowed)) {}
-    Piece(const Piece&) = delete;
-    ~Piece() = default;
-
-    const Types m_type;
-    const Colors m_color;
-    char m_pos;               // index into `positions`, or INVALID_POS once taken
-    std::set<char> m_allowed; // cached destination squares
-    bool m_update = false;    // set by invalidate(), cleared by reinit()
-};
-
-// Pawn piece; `next` supplies the initially allowed destination squares.
-struct Pawn : public Piece {
-    Pawn(Colors color, char pos, std::set<char> next) : Piece(Types::Pawn, color, pos, std::move(next)) {}
-
-    // True while the pawn stands on its starting row: '7' for Black (non-zero color), '2' for White.
-    bool is_first_move() const {
-        return m_color ? coord()[F] == '7' : coord()[F] == '2';
-    }
-
-    // Accepts one step forward with at most one column of sideways offset, or a
-    // straight two-step advance from the starting row. Occupancy is not checked here.
-    virtual bool movePattern(char pos) const override {
-        if (m_pos == INVALID_POS) return false;
-        auto cur = coord();
-        auto next = positions[pos];
-        Direction distance = {char(next[R] - cur[R]), char(next[F] - cur[F])};
-        // Black advances towards lower digits, White towards higher digits
-        char forward = m_color ? -1 : 1;
-        return (forward == distance[F] && distance[R] * distance[R] <= 1)
-            || (is_first_move() && 2 * forward == distance[F] && distance[R] == 0);
-    }
-
-    virtual void reinit(const State& state) override;
-};
-
-struct Knight : public Piece {
-    Knight(Colors color, char pos, std::set<char> next) : Piece(Types::Knight, color, pos, std::move(next)) {}
-
-    virtual bool movePattern(char pos) const override {
-        if (m_pos == INVALID_POS) return false;
-        auto cur = coord();
-        auto next = positions[pos];
-        Direction diff = {char(next[R] - cur[R]), char(next[F] - cur[F])};
-        return diff[R]*diff[R] + diff[F]*diff[F] == 5;
-    }
-
-    virtual void reinit(const State& state) override;
-};
-
-struct Bishop : public Piece {
-    Bishop(Colors color, char pos) : Piece(Types::Bishop, color, pos, {}) {}
-
-    virtual bool movePattern(char pos) const override {
-        if (m_pos == INVALID_POS) return false;
-        auto cur = coord();
-        auto next = positions[pos];
-        return cur[R] - cur[F] == next[R] - next[F] || cur[R] + cur[F] == next[R] + next[F];
-    }
-
-    virtual void reinit(const State& state) override;
-};
-
-struct Rook : public Piece {
-    Rook(Colors color, char pos) : Piece(Types::Rook, color, pos, {}) {}
-
-    virtual bool movePattern(char pos) const override {
-        if (m_pos == INVALID_POS) return false;
-        auto cur = coord();
-        auto next = positions[pos];
-        return cur[R] == next[R] || cur[F] == next[F];
-    }
-
-    virtual void reinit(const State& state) override;
-};
-
-struct Queen : public Piece {
-    Queen(Colors color, char pos) : Piece(Types::Queen, color, pos, {}) {}
-
-    virtual bool movePattern(char pos) const override {
-        if (m_pos == INVALID_POS) return false;
-        auto cur = coord();
-        auto next = positions[pos];
-        return cur[R] == next[R] || cur[F] == next[F] || cur[R] - cur[F] == next[R] - next[F] || cur[R] + cur[F] == next[R] + next[F];
-    }
-
-    virtual void reinit(const State& state) override;
-};
-
-// King piece; starts with an empty allowed set.
-struct King : public Piece {
-    King(Colors color, char pos) : Piece(Types::King, color, pos, {}) {}
-
-    // Accepts any square whose squared distance from the king is at most 2,
-    // i.e. the eight neighbouring squares and the king's own square.
-    virtual bool movePattern(char pos) const override {
-        if (m_pos == INVALID_POS) return false;
-        auto cur = coord();
-        auto next = positions[pos];
-        Direction diff = {char(next[R] - cur[R]), char(next[F] - cur[F])};
-        return diff[R]*diff[R] + diff[F]*diff[F] <= 2;
-    }
-
-    virtual void reinit(const State& state) override;
-};
-
-// The sixteen pieces of one side, stored as consecutive members: eight pawns
-// followed by the eight back-row pieces.
-// NOTE(review): begin()/end()/operator[] step through the members with Piece*
-// arithmetic. This presumably relies on all derived piece types having the same
-// size as Piece and on the members being laid out contiguously — confirm on all
-// target compilers.
-struct PieceSet {
-    Piece* begin() { return &p1; }
-    Piece* end() { return &r2 + 1; }
-    const Piece* begin() const { return &p1; }
-    const Piece* end() const { return &r2 + 1; }
-    Piece& operator[](int i) { return *(begin() + i); }
-    const Piece& operator[](int i) const { return *(begin() + i); }
-
-    Pawn   p1;
-    Pawn   p2;
-    Pawn   p3;
-    Pawn   p4;
-    Pawn   p5;
-    Pawn   p6;
-    Pawn   p7;
-    Pawn   p8;
-    Rook   r1;
-    Knight n1;
-    Bishop b1;
-    Queen  q;
-    King   k;
-    Bishop b2;
-    Knight n2;
-    Rook   r2;
-};
-
-struct State {
-    State();
-    PieceSet blacks;
-    PieceSet whites;
-    Board board;
-    Pins blackPins;
-    Pins whitePins;
-};
-
-// Returns the direction of the pin recorded for `piece` in its own color's pin
-// list, or {0, 0} when the piece is not listed as pinned.
-Direction findPin(const Piece& piece, const State& state) {
-    auto& pins = piece.color() ? state.blackPins : state.whitePins;
-    auto it = std::find_if(pins.begin(), pins.end(), [&] (const Pin& pin) { return pin.pinned == &piece; });
-    if (it != pins.end()) return it->d;
-    return {0, 0};
-}
-
-struct Find {
-    Find(const Board& board) : m_board(board) {}
-    bool operator() (char pos) const { return m_board[pos]; }
-    const Board& m_board;
-};
-
-// Visitor for traverse(): collects reachable squares into `moves`.
-// A square is recorded when it is empty or holds a piece whose color differs
-// from `color`; the call returns true (stop walking) as soon as a square is occupied.
-struct Add {
-    Add(const Board& board, std::set<char>& moves, Piece::Colors color) : m_board(board), m_moves(moves), m_color(color) {}
-    bool operator() (char pos) const {
-        if (!m_board[pos] || m_board[pos]->color() != m_color) m_moves.insert(pos);
-        return m_board[pos];
-    }
-    const Board& m_board;
-    std::set<char>& m_moves;
-    Piece::Colors m_color;
-};
-
-// Recomputes the pawn's allowed squares from `state`.
-// Does nothing for a taken pawn or when the pawn has not been invalidated.
-void Pawn::reinit(const State& state) {
-    if (m_pos == INVALID_POS) return;
-    if (!m_update) return;
-    m_update = false;
-    m_allowed.clear();
-
-    auto pin = findPin(*this, state);
-
-    // capture directions depend on color: Black (non-zero) moves towards lower digits
-    auto & left = m_color ? SW : NW;
-    auto & right = m_color ? SE : NE;
-
-    // diagonal squares are allowed only when they hold an enemy piece
-    for (auto& direction : filter(pin, { left, right })) {
-        auto pos = makeStep(m_pos, direction);
-        if (pos != INVALID_POS && state.board[pos] && state.board[pos]->color() != m_color) m_allowed.insert(pos);
-    }
-
-    auto & forward = m_color ? S : N;
-    if (!filter(pin, {forward}).empty()) {
-        // walk at most two squares forward; stop at the first occupied square, or
-        // after one square when the pawn is no longer on its starting row
-        traverse(m_pos, forward, [&] (char pos) {
-                if (!state.board[pos]) m_allowed.insert(pos);
-                return state.board[pos] || !is_first_move();
-            }, 2);
-    }
-}
-
-void Knight::reinit(const State& state) {
-    if (m_pos == INVALID_POS) return;
-    if (!m_update) return;
-    m_update = false;
-    m_allowed.clear();
-    auto pin = findPin(*this, state);
-    if (pin[R] != 0 || pin[F] != 0) return;
-    for (auto& direction : { NNE, ENE, ESE, SSE, SSW, WSW, WNW, NNW }) {
-        auto pos = makeStep(m_pos, direction);
-        if (pos != INVALID_POS && (!state.board[pos] || state.board[pos]->color() != m_color)) m_allowed.insert(pos);
-    }
-}
-
-void Bishop::reinit(const State& state) {
-    if (m_pos == INVALID_POS) return;
-    if (!m_update) return;
-    m_update = false;
-    m_allowed.clear();
-    auto pin = findPin(*this, state);
-    for (auto& direction : filter(pin, { NE, SE, SW, NW })) {
-        traverse(m_pos, direction, Add(state.board, m_allowed, m_color));
-    }
-}
-
-void Rook::reinit(const State& state) {
-    if (m_pos == INVALID_POS) return;
-    if (!m_update) return;
-    m_update = false;
-    m_allowed.clear();
-    auto pin = findPin(*this, state);
-    for (auto& direction : filter(pin, { N, E, S, W })) {
-        traverse(m_pos, direction, Add(state.board, m_allowed, m_color));
-    }
-}
-
-void Queen::reinit(const State& state) {
-    if (m_pos == INVALID_POS) return;
-    if (!m_update) return;
-    m_update = false;
-    m_allowed.clear();
-    auto pin = findPin(*this, state);
-    for (auto& direction : filter(pin, { N, NE, E, SE, S, SW, W, NW })) {
-        traverse(m_pos, direction, Add(state.board, m_allowed, m_color));
-    }
-}
-
-// Recomputes the king's allowed squares from `state`.
-// A neighbouring square is accepted when it is on the board, not occupied by a
-// piece of the king's own color, and no enemy piece is found to attack it.
-void King::reinit(const State& state) {
-    if (m_pos == INVALID_POS) return;
-    if (!m_update) return;
-    m_update = false;
-    m_allowed.clear();
-    auto& enemyPieces = m_color ? state.whites : state.blacks;
-    // offsets from a candidate square to an enemy pawn that would attack it
-    auto& pawnAttackLeft = m_color ? SW : NW;
-    auto& pawnAttackRight = m_color ? SE : NE;
-    for (auto& direction : { N, NE, E, SE, S, SW, W, NW }) {
-        auto pos = makeStep(m_pos, direction);
-        bool accept = pos != INVALID_POS && !(state.board[pos] && state.board[pos]->color() == m_color);
-        if (accept) {
-            for (auto& p : enemyPieces) {
-                if (!p.movePattern(pos)) continue;
-                // knights and kings attack every square matching their pattern
-                if (p.type() == Piece::Knight || p.type() == Piece::King) {
-                    accept = false;
-                    break;
-                }
-                else if (p.type() == Piece::Pawn) {
-                    // a pawn attacks only diagonally, so compare the exact offset
-                    auto from = positions[pos];
-                    auto to = p.coord();
-                    Direction d {char(to[R] - from[R]), char(to[F] - from[F])};
-                    if (d == pawnAttackLeft || d == pawnAttackRight) {
-                        accept = false;
-                        break;
-                    }
-                }
-                else {
-                    // sliding piece: walk from the candidate square towards it and
-                    // check whether it is the first piece encountered
-                    auto from = positions[pos];
-                    auto to = p.coord();
-                    Direction d = normalize({char(to[R] - from[R]), char(to[F] - from[F])});
-                    auto reached = traverse(pos, d, Find(state.board));
-                    if (p.pos() == reached) {
-                        accept = false;
-                        break;
-                    }
-                }
-            }
-        }
-        if (accept) m_allowed.insert(pos);
-    }
-}
-
-const char* Piece::name() const {
-    static_assert(pieceNames.size() == Piece::NUM_PIECES, "Mismatch between piece names and types");
-    return pieceNames[m_type];
-}
-
-char Piece::initial() const {
-    static_assert(blackShort.size() == Piece::NUM_PIECES, "Mismatch between piece names and types");
-    static_assert(whiteShort.size() == Piece::NUM_PIECES, "Mismatch between piece names and types");
-    return m_color ? blackShort[m_type] : whiteShort[m_type];
-}
-
-void Piece::invalidate() {
-    m_update = true;
-}
-
-
-const char* Piece::coord() const {
-    if (m_pos == INVALID_POS) return "";
-    return positions[m_pos];
-}
-
-bool Piece::canReach(char pos) const {
-    return movePattern(pos) && m_allowed.count(pos);
-}
-
-void Piece::take() {
-    m_pos = INVALID_POS;
-    m_allowed = {};
-}
-
-State::State()
-    : blacks {
-        {Piece::Black, "a7"_P, {"a5"_P, "a6"_P} },
-        {Piece::Black, "b7"_P, {"b5"_P, "b6"_P} },
-        {Piece::Black, "c7"_P, {"c5"_P, "c6"_P} },
-        {Piece::Black, "d7"_P, {"d5"_P, "d6"_P} },
-        {Piece::Black, "e7"_P, {"e5"_P, "e6"_P} },
-        {Piece::Black, "f7"_P, {"f5"_P, "f6"_P} },
-        {Piece::Black, "g7"_P, {"g5"_P, "g6"_P} },
-        {Piece::Black, "h7"_P, {"h5"_P, "h6"_P} },
-        {Piece::Black, "a8"_P},
-        {Piece::Black, "b8"_P, {"a6"_P, "c6"_P} },
-        {Piece::Black, "c8"_P},
-        {Piece::Black, "d8"_P},
-        {Piece::Black, "e8"_P},
-        {Piece::Black, "f8"_P},
-        {Piece::Black, "g8"_P, {"f6"_P, "h6"_P} },
-        {Piece::Black, "h8"_P},
-    }
-    , whites {
-        {Piece::White, "a2"_P, {"a3"_P, "a4"_P} },
-        {Piece::White, "b2"_P, {"b3"_P, "b4"_P} },
-        {Piece::White, "c2"_P, {"c3"_P, "c4"_P} },
-        {Piece::White, "d2"_P, {"d3"_P, "d4"_P} },
-        {Piece::White, "e2"_P, {"e3"_P, "e4"_P} },
-        {Piece::White, "f2"_P, {"f3"_P, "f4"_P} },
-        {Piece::White, "g2"_P, {"g3"_P, "g4"_P} },
-        {Piece::White, "h2"_P, {"h3"_P, "h4"_P} },
-        {Piece::White, "a1"_P},
-        {Piece::White, "b1"_P, {"a3"_P, "c3"_P} },
-        {Piece::White, "c1"_P},
-        {Piece::White, "d1"_P},
-        {Piece::White, "e1"_P},
-        {Piece::White, "f1"_P},
-        {Piece::White, "g1"_P, {"f3"_P, "h3"_P} },
-        {Piece::White, "h1"_P},
-    }
-    , board {{
-        &whites[ 8],  &whites[ 9],  &whites[10],  &whites[11],  &whites[12],  &whites[13],  &whites[14],  &whites[15],
-        &whites[ 0],  &whites[ 1],  &whites[ 2],  &whites[ 3],  &whites[ 4],  &whites[ 5],  &whites[ 6],  &whites[ 7],
-        nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,
-        nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,
-        nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,
-        nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,      nullptr,
-        &blacks[ 0],  &blacks[ 1],  &blacks[ 2],  &blacks[ 3],  &blacks[ 4],  &blacks[ 5],  &blacks[ 6],  &blacks[ 7],
-        &blacks[ 8],  &blacks[ 9],  &blacks[10],  &blacks[11],  &blacks[12],  &blacks[13],  &blacks[14],  &blacks[15],
-    }}
-{}
-
-Chessboard::Chessboard()
-    : m_state(new State())
-{
-    setGrammar();
-}
-
-Chessboard::~Chessboard() = default;
-
-void Chessboard::setPrompt(const std::string& prompt) {
-    m_prompt = prompt;
-    setGrammar();
-}
-
-// Rebuilds m_grammar for the side to move from the pieces' cached allowed squares.
-// The grammar text defines a `move` rule plus `piece`, `frompos` and `topos`
-// alternatives. m_grammar is left empty when no piece of the side to move has a
-// usable move.
-void Chessboard::setGrammar() {
-    m_grammar.clear();
-
-    std::string result;
-    if (m_prompt.empty()) {
-        result += "move ::= \" \" ((piece | frompos) \" \" \"to \"?)? topos\n";
-        //result += "move ::= \" \" frompos \" \" \"to \"? topos\n";
-    }
-    else {
-        // result += "move ::= prompt \" \" ((piece | frompos) \" \" \"to \"?)? topos\n"
-        result += "move ::= prompt \" \" frompos \" \" \"to \"? topos\n"
-        "prompt ::= \" " + m_prompt + "\"\n";
-    }
-
-    std::set<Piece::Types> pieceTypes;
-    std::set<char> from_pos;
-    std::set<char> to_pos;
-    // odd move counter means Black is to move
-    auto& pieces =  m_moveCounter % 2 ? m_state->blacks : m_state->whites;
-    // NOTE(review): `flags` is never used in this function.
-    std::set<size_t> flags;
-    for (auto& p : pieces) {
-        if (p.allowed().empty()) continue;
-        bool addPiece = false;
-        if (!m_inCheck || p.type() == Piece::King) {
-            to_pos.insert(p.allowed().begin(), p.allowed().end());
-            addPiece = !p.allowed().empty();
-        }
-        else {
-            // in check: a non-king piece contributes only moves listed in m_allowedInCheck
-            for (auto move : p.allowed()) {
-                if (m_allowedInCheck.count(move)) {
-                    to_pos.insert(move);
-                    addPiece = true;
-                }
-            }
-        }
-        if (addPiece) {
-            pieceTypes.insert(p.type());
-            from_pos.insert(p.pos());
-        }
-    }
-    if (pieceTypes.empty()) return;
-
-    // each alternative is appended with a trailing '|', which pop_back() removes
-    result += "piece ::= (";
-    for (auto& p : pieceTypes) result += " \"" + std::string(pieceNames[p]) + "\" |";
-    result.pop_back();
-    result += ")\n\n";
-
-    result += "frompos ::= (";
-    for (auto& p : from_pos) result += " \"" + std::string(positions[p]) + "\" |";
-    result.pop_back();
-    result += ")\n";
-
-    result += "topos ::= (";
-    for (auto& p : to_pos) result += " \"" + std::string(positions[p]) + "\" |";
-    result.pop_back();
-    result += ")\n";
-
-    m_grammar = std::move(result);
-}
-
-// Renders the board as text: a header line "a b c d e f g h", then eight rows
-// from row 8 down to row 1, each ending with its row number. Occupied squares
-// show the piece's initial(); empty squares alternate between '.' and '*'.
-std::string Chessboard::stringifyBoard() {
-    std::string result;
-    result.reserve(16 + 2 * 64 + 16);
-    for (char rank = 'a'; rank <= 'h'; ++rank) {
-        result.push_back(rank);
-        result.push_back(' ');
-    }
-    // replace the space after 'h' with the line break
-    result.back() = '\n';
-    for (int i = 7; i >= 0; --i) {
-        for (int j = 0; j < 8; ++j) {
-            auto p = m_state->board[i * 8 + j];
-            if (p) result.push_back(p->initial());
-            else result.push_back((i + j) % 2 ? '.' : '*');
-            result.push_back(' ');
-        }
-        result.push_back('0' + i + 1);
-        result.push_back('\n');
-    }
-    return result;
-}
-
-// Parses and applies one move command for the side to move.
-// Returns "" when the command cannot be parsed or the move is rejected.
-// Otherwise returns the move as "<from>-<to>" (e.g. "e2-e4"), with '#' appended
-// when the rebuilt grammar is empty, i.e. the next side has no usable move.
-std::string Chessboard::process(const std::string& command) {
-    const auto t_start = std::chrono::high_resolution_clock::now();
-    auto color = Piece::Colors(m_moveCounter % 2);
-    Piece* piece = nullptr;
-    auto pos_to = INVALID_POS;
-    if (!parseCommand(command, piece, pos_to)) return "";
-
-    // remember the origin before move() changes the piece's position
-    auto pos_from = piece->pos();
-
-    if (!move(*piece, pos_to)) return "";
-
-    flagUpdates(pos_from, pos_to);
-
-    detectChecks();
-
-    auto& enemyPieces = color ? m_state->whites : m_state->blacks;
-    for (auto& p : enemyPieces) p.reinit(*m_state); // only enemy moves needed next
-
-    std::string result = {positions[pos_from][R], positions[pos_from][F], '-', positions[pos_to][R], positions[pos_to][F]};
-    ++m_moveCounter;
-    setGrammar();
-    const auto t_end = std::chrono::high_resolution_clock::now();
-    auto t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
-    fprintf(stdout, "%s: Move '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", result.data(), "\033[0m", (int) t_ms);
-    if (m_grammar.empty()) result.push_back('#');
-    return result;
-}
-
-// Interprets a spoken/typed command and resolves it to a piece and a target square.
-// Accepted shapes, judging by the token handling below: "<to>" (treated as a
-// pawn move), "<from> ... <to>" or "<piece name> ... <to>".
-// On success sets `piece` and `pos_to` and returns true; returns false when the
-// command is empty, a square or piece cannot be resolved, or the piece found
-// does not belong to the side to move. `pos_to` may be modified even on failure.
-bool Chessboard::parseCommand(const std::string& command, Piece*& piece, char& pos_to) {
-    auto color = Piece::Colors(m_moveCounter % 2);
-    fprintf(stdout, "%s: Command to %s: '%s%.*s%s'\n", __func__, (color ? "Black" : "White"), "\033[1m", int(command.size()), command.data(), "\033[0m");
-
-    if (command.empty()) return false;
-    auto tokens = split(command, ' ');
-    auto pos_from = INVALID_POS;
-    auto type = Piece::Types::NUM_PIECES;
-    if (tokens.size() == 1) {
-        // a lone square is taken to be a pawn move
-        type = Piece::Types::Pawn;
-        pos_to = strToPos(tokens.front());
-    }
-    else {
-        // first token is either a source square or, failing that, a piece name
-        pos_from = strToPos(tokens.front());
-        if (pos_from == INVALID_POS) type = Piece::Types(strToType(tokens.front()));
-        pos_to = strToPos(tokens.back());
-    }
-    if (pos_to == INVALID_POS) return false;
-    if (pos_from == INVALID_POS) {
-        if (type == Piece::Types::NUM_PIECES) return false;
-        // pick the first piece of the requested type that can reach the target
-        auto& pieces = color ? m_state->blacks : m_state->whites;
-        for (auto& p : pieces) {
-            if (p.type() == type && p.canReach(pos_to)) {
-                pos_from = p.pos();
-                break;
-            }
-        }
-    }
-    if (pos_from == INVALID_POS) return false;
-    if (m_state->board[pos_from] == nullptr) return false;
-    piece = m_state->board[pos_from];
-    if (piece->color() != color) return false;
-    return true;
-}
-
-void Chessboard::flagUpdates(char pos_from, char pos_to) {
-    auto color = Piece::Colors(m_moveCounter % 2);
-    auto& enemyPieces = color ? m_state->whites : m_state->blacks;
-    auto& ownPieces = color ? m_state->blacks : m_state->whites;
-    for (auto& p : enemyPieces) {
-        if (p.movePattern(pos_to) || p.movePattern(pos_from)) {
-            updatePins(p);
-            p.invalidate();
-        }
-    }
-
-    for (auto& p : ownPieces) {
-        if (p.movePattern(pos_to) || p.movePattern(pos_from)) {
-            updatePins(p);
-            p.invalidate();
-        }
-    }
-}
-
-// Refreshes the pin that `piece` may exert on an enemy piece standing in front
-// of the enemy king. Pawns, knights and kings are ignored.
-// Any pin previously recorded with `piece` as pinner is removed first and its
-// pinned piece is invalidated.
-void Chessboard::updatePins(Piece& piece) {
-    if (piece.type() == Piece::Pawn || piece.type() == Piece::Knight || piece.type() == Piece::King) return;
-    auto& enemyPieces = piece.color() ? m_state->whites : m_state->blacks;
-    auto& enemyPins = piece.color() ? m_state->whitePins : m_state->blackPins;
-    auto& king = enemyPieces.k;
-    auto it = std::find_if(enemyPins.begin(), enemyPins.end(), [&] (const Pin& pin) { return pin.pinner == &piece; });
-    if (it != enemyPins.end()) {
-        it->pinned->invalidate();
-        enemyPins.erase(it);
-    }
-    if (piece.movePattern(king.pos())) {
-        auto to = positions[king.pos()];
-        auto from = piece.coord();
-        Direction d = normalize({char(to[R] - from[R]), char(to[F] - from[F])});
-
-        // first piece met when walking from `piece` towards the king
-        auto reached = traverse(piece.pos(), d, Find(m_state->board));
-        auto foundPiece = m_state->board[reached];
-        if (&king == foundPiece) {
-            // check
-            king.invalidate();
-        }
-        else if (foundPiece && foundPiece->color() != piece.color()) {
-            // an enemy piece is in the way: it is pinned if the next piece behind it is the king
-            reached = traverse(reached, d, Find(m_state->board));
-            if (&king == m_state->board[reached]) {
-                enemyPins.push_back({d, &piece, foundPiece});
-                foundPiece->invalidate();
-            }
-        }
-    }
-}
-
-// Determines whether the side selected by m_moveCounter (the mover, since
-// process() calls this before incrementing the counter) attacks the enemy king.
-// Sets m_inCheck when it does. For the first attacker found, m_allowedInCheck
-// receives the attacker's square (plus, for sliding pieces, the squares collected
-// on the way to the king). If a further attacker is found, the set is emptied.
-void Chessboard::detectChecks() {
-    auto color = Piece::Colors(m_moveCounter % 2);
-    auto& enemyPieces = color ? m_state->whites : m_state->blacks;
-    auto& ownPieces = color ? m_state->blacks : m_state->whites;
-    auto& king = enemyPieces.k;
-    auto& pawnAttackLeft = color ? SW : NW;
-    auto& pawnAttackRight = color ? SE : NE;
-    for (auto& p : ownPieces) {
-        if (!p.movePattern(king.pos())) continue;
-        auto to = positions[king.pos()];
-        auto from = p.coord();
-
-        if (p.type() == Piece::Knight) {
-            if (!m_inCheck) {
-                m_allowedInCheck = { p.pos() };
-            }
-            else {
-                m_allowedInCheck.clear();
-            }
-            m_inCheck = true;
-        }
-        else if (p.type() == Piece::Pawn) {
-            // a pawn gives check only along its diagonal capture offsets
-            Direction d {char(to[R] - from[R]), char(to[F] - from[F])};
-            if (d == pawnAttackLeft || d == pawnAttackRight) {
-                if (!m_inCheck) {
-                    m_allowedInCheck = { p.pos() };
-                }
-                else {
-                    m_allowedInCheck.clear();
-                }
-                m_inCheck = true;
-            }
-        }
-        else {
-            // other pieces: walk towards the king, collecting squares in `tmp`;
-            // it is a check only when the walk ends on the king's square
-            Direction d = normalize({char(to[R] - from[R]), char(to[F] - from[F])});
-            std::set<char> tmp;
-            auto pos = traverse(p.pos(), d, Add(m_state->board, tmp, king.color()));
-            if (pos == king.pos()) {
-                tmp.insert(p.pos());
-                if (!m_inCheck) {
-                    m_allowedInCheck = std::move(tmp);
-                }
-                else {
-                    m_allowedInCheck.clear();
-                }
-                m_inCheck = true;
-            }
-        }
-    }
-}
-
-// Moves `piece` to `pos_to` if the move is permitted; returns whether it was applied.
-// The move is rejected when `pos_to` is not in the piece's allowed set, when the
-// side is in check and a non-king piece targets a square outside m_allowedInCheck,
-// or when the target holds a piece of the same color. A piece on the target
-// square is taken. On success the check state is reset.
-bool Chessboard::move(Piece& piece, char pos_to) {
-    auto& allowed = piece.allowed();
-
-    if (allowed.count(pos_to) == 0 || (m_inCheck && piece.type() != Piece::King && m_allowedInCheck.count(pos_to) == 0)) return false;
-    if (m_state->board[pos_to] && m_state->board[pos_to]->color() == piece.color()) return false;
-    if (m_state->board[pos_to]) m_state->board[pos_to]->take();
-    // clear the origin square before setPos() overwrites the piece's position
-    m_state->board[piece.pos()] = nullptr;
-    m_state->board[pos_to] = &piece;
-    piece.setPos(pos_to);
-
-    m_inCheck = false;
-    m_allowedInCheck.clear();
-
-    return true;
-}
--- a/examples/wchess/libwchess/Chessboard.h
+++ b/examples/wchess/libwchess/Chessboard.h
@ -1,33 +0,0 @@
-#pragma once
-#include <string>
-#include <set>
-#include <memory>
-
-// just basic validation
-// fixme: missing en passant, castling, promotion, etc.
-struct State;
-class Piece;
-// Board model driven by textual move commands.
-// State and Piece are only forward-declared in this header; their definitions live elsewhere.
-class Chessboard {
-public:
-    Chessboard();
-    ~Chessboard();
-    // Handles one textual command and returns a result string.
-    // NOTE(review): test-chessboard.cpp expects "from-to" (e.g. "d2-d4", with a trailing '#'
-    // in one mating case) for an accepted command and "" for a rejected one — confirm in the .cpp.
-    std::string process(const std::string& command);
-    // Returns a printable representation of the current board.
-    std::string stringifyBoard();
-    // Accessors for the cached grammar and prompt strings.
-    const std::string& grammar() { return m_grammar; }
-    const std::string& prompt() { return m_prompt; }
-    void setPrompt(const std::string& prompt);
-private:
-    bool parseCommand(const std::string& command, Piece*& piece, char& pos_to);
-    // Performs the move if it is permitted; returns false otherwise.
-    bool move(Piece& piece, char pos);
-    void flagUpdates(char pos_from, char pos_to);
-    void updatePins(Piece& piece);
-    void detectChecks();
-    void setGrammar();
-
-    std::unique_ptr<State> m_state;
-    // squares a non-king piece may move to while m_inCheck is set (consulted by move())
-    std::set<char> m_allowedInCheck;
-    bool m_inCheck = false;
-    int m_moveCounter = 0;
-    std::string m_grammar;
-    std::string m_prompt;
-};
--- a/examples/wchess/libwchess/WChess.cpp
+++ b/examples/wchess/libwchess/WChess.cpp
@ -1,193 +0,0 @@
-#include "WChess.h"
-#include "Chessboard.h"
-#include "grammar-parser.h"
-#include "common.h"
-#include <thread>
-
-// Stores the whisper context pointer (not owned: the destructor is defaulted and never frees it),
-// copies the decoding parameters, callbacks and settings, and creates a fresh Chessboard.
-WChess::WChess(whisper_context * ctx,
-        const whisper_full_params & wparams,
-        callbacks cb,
-        settings s)
-        : m_ctx(ctx)
-        , m_wparams(wparams)
-        , m_cb(cb)
-        , m_settings(s)
-        , m_board(new Chessboard())
-{}
-
-// Defined in this translation unit, where Chessboard.h is included, because WChess.h only
-// forward-declares Chessboard and std::unique_ptr<Chessboard> needs the complete type to delete it.
-WChess::~WChess() = default;
-
-// Forwards the move string and its probability to the set_move callback, if one was supplied.
-void WChess::set_move(const std::string& moves, float prob) const {
-    if (!m_cb.set_move) {
-        return;
-    }
-    m_cb.set_move(moves, prob);
-}
-
-// Forwards the grammar text to the set_grammar callback, if one was supplied.
-void WChess::set_grammar(const std::string& grammar) const {
-    if (!m_cb.set_grammar) {
-        return;
-    }
-    m_cb.set_grammar(grammar);
-}
-
-// Asks the get_audio callback to fill `pcmf32`; reports false when no callback is installed.
-bool WChess::get_audio(std::vector<float>& pcmf32) const {
-    return m_cb.get_audio ? m_cb.get_audio(pcmf32) : false;
-}
-
-// Returns the board's printable representation (delegates to Chessboard::stringifyBoard).
-std::string WChess::stringify_board() const {
-    return m_board->stringifyBoard();
-}
-
-// Returns a copy of the board's current grammar string.
-std::string WChess::get_grammar() const {
-    return m_board->grammar();
-}
-
-// Main processing loop: keeps pulling audio through the get_audio callback until it returns
-// false, transcribes each non-empty buffer under the board's current grammar, hands the
-// recognised command to the Chessboard and reports the result through the callbacks.
-// The loop also stops once the board's grammar becomes empty.
-//
-// NOTE(review): have_prompt is initialised to true and never reset, so as written k_prompt is
-// always "", the `!have_prompt` branch and both "Say the following phrase" blocks never run,
-// and pcmf32_prompt stays empty.
-void WChess::run() {
-    bool have_prompt  = true;
-    bool ask_prompt   = !have_prompt;
-
-    float logprob_min  = 0.0f;
-
-    float logprob_sum  = 0.0f;
-
-    int n_tokens  = 0;
-
-    std::vector<float> pcmf32_cur;
-    std::vector<float> pcmf32_prompt;
-
-    const std::string k_prompt = have_prompt ? "" : "rook to d4, f3";
-    int64_t t_ms = 0;
-
-    if (ask_prompt) {
-        fprintf(stdout, "\n");
-        fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
-        fprintf(stdout, "\n");
-
-        ask_prompt = false;
-    }
-
-    while (get_audio(pcmf32_cur)) {
-        if (!pcmf32_cur.empty()) {
-            // fprintf(stdout, "%s: Processing ...\n", __func__);
-
-            if (!have_prompt) {
-                const auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
-
-                fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
-
-                const float sim = similarity(txt, k_prompt);
-
-                // accept only if the length is within +/-20% of the prompt and similarity is at least 0.8
-                if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
-                    fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
-                    ask_prompt = true;
-                } else {
-                    fprintf(stdout, "\n");
-                    fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
-                    fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
-                    fprintf(stdout, "\n");
-
-                    // save the audio for the prompt
-                    pcmf32_prompt = pcmf32_cur;
-                    have_prompt = true;
-                    m_board->setPrompt(k_prompt);
-                }
-            } else {
-                // prepend the saved prompt audio (if any), then left-pad with zeros so the buffer
-                // holds at least 1.2 * WHISPER_SAMPLE_RATE samples
-                if (!pcmf32_prompt.empty()) pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
-                constexpr size_t MIN_SIZE = 1.2 * WHISPER_SAMPLE_RATE;
-                if (MIN_SIZE > pcmf32_cur.size()) pcmf32_cur.insert(pcmf32_cur.begin(), MIN_SIZE - pcmf32_cur.size(), 0.0f);
-
-                // fprintf(stdout, "%s: grammar rules:\n'%s'\n", __func__, m_board->grammar().c_str());
-
-                // re-parse the grammar on every iteration since it changes after each accepted move
-                auto grammar_parsed = grammar_parser::parse(m_board->grammar().c_str());
-                auto grammar_rules  = grammar_parsed.c_rules();
-
-                // m_wparams points into the local grammar_rules, which must outlive the
-                // transcribe() call below; the pointer dangles once this iteration ends
-                m_wparams.grammar_rules   = grammar_rules.data();
-                m_wparams.n_grammar_rules = grammar_rules.size();
-
-                // NOTE(review): .at() presumably throws if the grammar defines no "move" symbol — confirm
-                m_wparams.i_start_rule    = grammar_parsed.symbol_ids.at("move");
-                auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
-
-                // percentage derived from the lowest per-token log-probability
-                const float p = 100.0f * std::exp(logprob_min);
-
-                fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
-
-                // find the prompt in the text
-                float best_sim = 0.0f;
-                size_t best_len = 0;
-                for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
-                    const auto prompt = txt.substr(0, n);
-
-                    const float sim = similarity(prompt, k_prompt);
-
-                    //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
-
-                    if (sim > best_sim) {
-                        best_sim = sim;
-                        best_len = n;
-                    }
-                }
-
-                fprintf(stdout, "%s:   DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
-                // NOTE(review): substr(best_len) throws std::out_of_range if best_len > txt.size();
-                // only possible with a non-empty k_prompt
-                std::string command = ::trim(txt.substr(best_len));
-
-                fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
-                fprintf(stdout, "\n");
-
-                if (!command.empty()) {
-                    set_move(m_board->process(command), p);
-                    set_grammar(m_board->grammar());
-                }
-                if (m_board->grammar().empty()) {
-                    fprintf(stdout, "%s: No more moves possible\n", __func__);
-                    break;
-                }
-            }
-        }
-
-        if (ask_prompt) {
-            fprintf(stdout, "\n");
-            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
-            fprintf(stdout, "\n");
-
-            ask_prompt = false;
-        }
-    }
-}
-
-// Runs whisper_full() over `pcmf32` with the stored parameters and returns the concatenated
-// text of all segments.
-//
-// Out-parameters (all reset to zero on entry):
-//   logprob_min - smallest token log-probability seen (never above 0)
-//   logprob_sum - sum of the token log-probabilities
-//   n_tokens    - number of tokens visited
-//   t_ms        - wall-clock duration in milliseconds; only written on the success path
-//
-// Returns an empty string if whisper_full() fails or if any token reports a positive
-// log-probability.
-std::string WChess::transcribe(
-                const std::vector<float> & pcmf32,
-                float & logprob_min,
-                float & logprob_sum,
-                int & n_tokens,
-                int64_t & t_ms) {
-    using hr_clock = std::chrono::high_resolution_clock;
-    const auto started = hr_clock::now();
-
-    t_ms        = 0;
-    n_tokens    = 0;
-    logprob_sum = 0.0f;
-    logprob_min = 0.0f;
-
-    const int status = whisper_full(m_ctx, m_wparams, pcmf32.data(), pcmf32.size());
-    if (status != 0) {
-        return {};
-    }
-
-    std::string transcript;
-
-    const int seg_count = whisper_full_n_segments(m_ctx);
-    for (int seg = 0; seg < seg_count; ++seg) {
-        transcript += whisper_full_get_segment_text(m_ctx, seg);
-
-        const int tok_count = whisper_full_n_tokens(m_ctx, seg);
-        for (int tok = 0; tok < tok_count; ++tok) {
-            const auto data = whisper_full_get_token_data(m_ctx, seg, tok);
-
-            // a log-probability above zero is treated as an invalid result
-            if (data.plog > 0.0f) return {};
-
-            logprob_sum += data.plog;
-            logprob_min = std::min(logprob_min, data.plog);
-            ++n_tokens;
-        }
-    }
-
-    const auto elapsed = hr_clock::now() - started;
-    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
-
-    return transcript;
-}
--- a/examples/wchess/libwchess/WChess.h
+++ b/examples/wchess/libwchess/WChess.h
@ -1,63 +0,0 @@
-#pragma once
-#include "whisper.h"
-#include <string>
-#include <vector>
-#include <memory>
-
-class Chessboard;
-
-// Glue between a whisper context and a Chessboard: run() pulls audio through a callback,
-// transcribes it and reports the resulting moves/grammar through further callbacks.
-class WChess {
-public:
-    // Plain function-pointer callback types.
-    // CheckRunningCb and ClearAudioCb are declared but not referenced elsewhere in this header.
-    using CheckRunningCb = bool (*)();
-    using GetAudioCb = bool (*)(std::vector<float> &);
-    using SetMovesCb = void (*)(const std::string &, float);
-    using SetGrammarCb = void (*)(const std::string &);
-    using ClearAudioCb = void (*)();
-
-    // Optional hooks; a null entry is skipped by the corresponding private forwarder.
-    struct callbacks {
-        GetAudioCb get_audio = nullptr;
-        SetMovesCb set_move = nullptr;
-        SetGrammarCb set_grammar = nullptr;
-    };
-
-    // Audio/VAD tuning values.
-    // NOTE(review): stored in m_settings but not read by the WChess.cpp code visible here — confirm usage.
-    struct settings {
-        int32_t vad_ms     = 2000;
-        int32_t prompt_ms  = 5000;
-        int32_t command_ms = 4000;
-        float vad_thold    = 0.2f;
-        float freq_thold   = 100.0f;
-        bool print_energy  = false;
-    };
-
-    // `ctx` is stored as a raw pointer; `wparams`, `cb` and `s` are copied.
-    WChess(
-        whisper_context * ctx,
-        const whisper_full_params & wparams,
-        callbacks cb,
-        settings s
-    );
-    ~WChess();
-
-    // Processing loop; returns when get_audio reports false or the board's grammar is empty.
-    void run();
-
-    std::string stringify_board() const;
-
-    std::string get_grammar() const;
-
-private:
-    // Thin wrappers around the optional callbacks.
-    bool get_audio(std::vector<float>& pcmf32) const;
-    void set_move(const std::string& moves, float prob) const;
-    void set_grammar(const std::string& grammar) const;
-
-    // Transcribes `pcmf32`; the reference parameters receive token statistics and timing.
-    std::string transcribe(
-                    const std::vector<float> & pcmf32,
-                    float & logprob_min,
-                    float & logprob_sum,
-                    int & n_tokens,
-                    int64_t & t_ms);
-
-    whisper_context * m_ctx;
-    whisper_full_params m_wparams;
-    const callbacks m_cb;
-    const settings m_settings;
-    std::unique_ptr<Chessboard> m_board;
-};
--- a/examples/wchess/libwchess/test-chessboard.cpp
+++ b/examples/wchess/libwchess/test-chessboard.cpp
@ -1,117 +0,0 @@
-#include "Chessboard.h"
-
-// Minimal test assertion: when `x` is false, prints the file, line and expression text to
-// stderr, flushes, and terminates the process with exit status 1.
-// The do { ... } while (0) wrapper makes the macro behave like a single statement.
-#define ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
-            exit(1); \
-        } \
-    } while (0)
-
-
-// Drives Chessboard::process() through several scripted games, each on a fresh board.
-// Every ASSERT pins the exact string returned for a spoken-style command: "from-to" when the
-// command is expected to be accepted, "" when it is expected to be rejected.
-int main() {
-    // scenario 1: mixed command forms ("pawn to d4", "e5", "c1 h6", "queen h4"), two rejections
-    {
-        Chessboard chess;
-
-        ASSERT(chess.process("pawn to d4") == "d2-d4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("c1 h6") == "c1-h6");
-        ASSERT(chess.process("queen h4") == "d8-h4");
-        ASSERT(chess.process("bishop to g5") == "h6-g5");
-        ASSERT(chess.process("bishop to b4") == "f8-b4");
-        ASSERT(chess.process("c4") == "");
-        ASSERT(chess.process("knight c3") == "b1-c3");
-        ASSERT(chess.process("knight c6") == "b8-c6");
-        ASSERT(chess.process("f3") == "");
-    }
-
-    // scenario 2: queen sorties with several rejected pawn moves in between
-    {
-        Chessboard chess;
-
-        ASSERT(chess.process("d4") == "d2-d4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("e4") == "e2-e4");
-        ASSERT(chess.process("queen h4") == "d8-h4");
-        ASSERT(chess.process("queen h5") == "d1-h5");
-        ASSERT(chess.process("f5") == "");
-        ASSERT(chess.process("g6") == "g7-g6");
-        ASSERT(chess.process("knight e2") == "g1-e2");
-        ASSERT(chess.process("f5") == "f7-f5");
-        ASSERT(chess.process("knight g3") == "e2-g3");
-        ASSERT(chess.process("g5") == "");
-        ASSERT(chess.process("king e7") == "e8-e7");
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("g5") == "g6-g5");
-    }
-
-    // scenario 3: long pawn/king sequence with accepted and rejected king moves
-    {
-        Chessboard chess;
-
-        ASSERT(chess.process("e4") == "e2-e4");
-        ASSERT(chess.process("c5") == "c7-c5");
-        ASSERT(chess.process("e5") == "e4-e5");
-        ASSERT(chess.process("c4") == "c5-c4");
-        ASSERT(chess.process("e6") == "e5-e6");
-        ASSERT(chess.process("c3") == "c4-c3");
-        ASSERT(chess.process("e7") == "");
-        ASSERT(chess.process("f7") == "e6-f7");
-        ASSERT(chess.process("d2") == "");
-        ASSERT(chess.process("king to f7") == "e8-f7");
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("d2") == "c3-d2");
-        ASSERT(chess.process("f5") == "");
-        ASSERT(chess.process("king to e2") == "e1-e2");
-        ASSERT(chess.process("king to g6") == "f7-g6");
-        ASSERT(chess.process("f5") == "f4-f5");
-        ASSERT(chess.process("e6") == "");
-        ASSERT(chess.process("king to h5") == "g6-h5");
-        ASSERT(chess.process("g4") == "g2-g4");
-        ASSERT(chess.process("king to g5") == "h5-g5");
-        ASSERT(chess.process("h4") == "h2-h4");
-        ASSERT(chess.process("king to h5") == "");
-        ASSERT(chess.process("king to g6") == "");
-        ASSERT(chess.process("king to h6") == "g5-h6");
-        ASSERT(chess.process("bishop to d2") == "c1-d2");
-        ASSERT(chess.process("king to g5") == "");
-        ASSERT(chess.process("g5") == "g7-g5");
-    }
-
-    // scenario 4: the queen move is reported with a '#' suffix; afterwards no command is
-    // accepted and the grammar is expected to be empty
-    {
-        Chessboard chess;
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("g4") == "g2-g4");
-        ASSERT(chess.process("queen to h4") == "d8-h4#");
-        ASSERT(chess.process("knight f3") == "");
-        ASSERT(chess.grammar().empty());
-    }
-
-    // scenario 5: similar opening, but here the queen move carries no '#' and the grammar
-    // must remain non-empty
-    {
-        Chessboard chess;
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("g4") == "g2-g4");
-        ASSERT(chess.process("d5") == "d7-d5");
-        ASSERT(chess.process("g1 f3") == "g1-f3");
-        ASSERT(chess.process("queen to h4") == "d8-h4");
-        ASSERT(!chess.grammar().empty());
-    }
-
-    // scenario 6: knight manoeuvres, ending with two rejected king moves
-    {
-        Chessboard chess;
-        ASSERT(chess.process("knight c3") == "b1-c3");
-        ASSERT(chess.process("knight c6") == "b8-c6");
-        ASSERT(chess.process("knight b5") == "c3-b5");
-        ASSERT(chess.process("knight f6") == "g8-f6");
-        ASSERT(chess.process("knight d6") == "b5-d6");
-        ASSERT(chess.process("knight d4") == "");
-        ASSERT(chess.process("d6") == "c7-d6");
-        ASSERT(chess.process("e4") == "e2-e4");
-        ASSERT(chess.process("knight d4") == "c6-d4");
-        ASSERT(chess.process("d3") == "d2-d3");
-        ASSERT(chess.process("knight e4") == "f6-e4");
-        ASSERT(chess.process("king to e2") == "");
-        ASSERT(chess.process("king to d2") == "");
-    }
-}
--- a/examples/wchess/wchess.cmd/CMakeLists.txt
+++ b/examples/wchess/wchess.cmd/CMakeLists.txt
@ -1,8 +0,0 @@
-# Command-line wchess executable; only configured when WHISPER_SDL2 is enabled,
-# since it links against common-sdl.
-if (WHISPER_SDL2)
-    set(TARGET wchess)
-    add_executable(${TARGET} wchess.cmd.cpp)
-
-    include(DefaultTargetOptions)
-
-    target_link_libraries(${TARGET} PRIVATE wchess-core common-sdl ${CMAKE_THREAD_LIBS_INIT})
-endif ()
--- a/examples/wchess/wchess.cmd/wchess.cmd.cpp
+++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp
@ -1,247 +0,0 @@
-// Command line voice assisted chess
-//
-// Speak chess move commands to the microphone.
-// The moves will be translated to chessboard positions.
-//
-//
-
-#include "WChess.h"
-#include "common-sdl.h"
-#include <iostream>
-
-#include <memory>
-#include <thread>
-
-// command-line parameters
-// Defaults for every command-line option; whisper_params_parse() overwrites the fields it is given.
-// NOTE(review): main() in this file hard-codes several decoding values (max_tokens, audio_ctx,
-// language, grammar_penalty) instead of reading them from here, and forwards only prompt_ms,
-// command_ms, vad_thold, freq_thold, print_energy, use_gpu, capture_id, model and context.
-struct whisper_params {
-    // at most 4 threads, fewer if the hardware reports less
-    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t prompt_ms  = 5000;
-    int32_t command_ms = 8000;
-    int32_t capture_id = -1;
-    int32_t max_tokens = 32;
-    int32_t audio_ctx  = 0;
-
-    float vad_thold  = 0.6f;
-    float freq_thold = 100.0f;
-
-    float grammar_penalty = 100.0f;
-
-    bool speed_up      = false;
-    bool translate     = false;
-    bool print_special = false;
-    bool print_energy  = false;
-    bool no_timestamps = true;
-    bool use_gpu       = true;
-
-    std::string language  = "en";
-    std::string model     = "models/ggml-base.en.bin";
-    std::string fname_out;
-    std::string commands;
-    std::string prompt;
-    std::string context;
-    std::string grammar;
-};
-
-// Prints the option summary to stderr. The bracketed column shows the current value of each
-// field in `params`, i.e. the default unless an earlier argument already changed it.
-// `argv[0]` is used as the program name; `argc` is unused.
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,         --help           [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,       --threads N      [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -pms N,     --prompt-ms N    [%-7d] prompt duration in milliseconds\n",             params.prompt_ms);
-    fprintf(stderr, "  -cms N,     --command-ms N   [%-7d] command duration in milliseconds\n",            params.command_ms);
-    fprintf(stderr, "  -c ID,      --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -mt N,      --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
-    // the flag disables the GPU, so the displayed value is the negation of use_gpu
-    fprintf(stderr, "  -ng,        --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  -l LANG,    --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -m FNAME,   --model FNAME    [%-7s] model path\n",                                  params.model.c_str());
-    fprintf(stderr, "  -f FNAME,   --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
-    fprintf(stderr, "  -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n",             params.commands.c_str());
-    fprintf(stderr, "  -p,         --prompt         [%-7s] the required activation prompt\n",              params.prompt.c_str());
-    fprintf(stderr, "  -ctx,       --context        [%-7s] sample text to help the transcription\n",       params.context.c_str());
-    fprintf(stderr, "  --grammar-penalty N          [%-7.1f] scales down logits of nongrammar tokens\n",   params.grammar_penalty);
-    fprintf(stderr, "\n");
-}
-
-// Parses the command line into `params`.
-//
-// Returns true when every argument was consumed. Returns false when an option that needs a
-// value is the last token on the command line (previously this read argv[argc], a null
-// pointer, and handed it to std::stoi / std::string). "-h/--help" and unknown arguments
-// print the usage text and terminate the process, as before.
-//
-// NOTE(review): std::stoi / std::stof still throw on non-numeric values; that is not handled here.
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        // Fetches the token following the current option and advances `i`. When none is left,
-        // records the problem and returns a harmless placeholder so the conversion below
-        // stays well-defined; the error is reported right after the option chain.
-        bool missing_value = false;
-        auto next_value = [&]() -> const char * {
-            if (i + 1 < argc) {
-                return argv[++i];
-            }
-            missing_value = true;
-            return "0";
-        };
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(next_value()); }
-        else if (arg == "-pms" || arg == "--prompt-ms")     { params.prompt_ms     = std::stoi(next_value()); }
-        else if (arg == "-cms" || arg == "--command-ms")    { params.command_ms    = std::stoi(next_value()); }
-        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(next_value()); }
-        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(next_value()); }
-        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(next_value()); }
-        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(next_value()); }
-        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(next_value()); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
-        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
-        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
-        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
-        else if (arg == "-ng"  || arg == "--no-gpu")        { params.use_gpu       = false; }
-        else if (arg == "-l"   || arg == "--language")      { params.language      = next_value(); }
-        else if (arg == "-m"   || arg == "--model")         { params.model         = next_value(); }
-        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = next_value(); }
-        else if (arg == "-cmd" || arg == "--commands")      { params.commands      = next_value(); }
-        else if (arg == "-p"   || arg == "--prompt")        { params.prompt        = next_value(); }
-        else if (arg == "-ctx" || arg == "--context")       { params.context       = next_value(); }
-        else if (                 arg == "--grammar-penalty") { params.grammar_penalty = std::stof(next_value()); }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-
-        if (missing_value) {
-            fprintf(stderr, "error: missing value for argument: %s (see --help)\n", arg.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-std::unique_ptr<WChess> g_wchess;
-int g_moveCount = 0;
-// set_move callback: reports the outcome of the last command, prints the board and whose
-// turn it is. An empty `move` means the command was rejected and the turn does not change.
-// The probability argument is ignored.
-void set_move(const std::string & move, float) {
-    if (!move.empty()) {
-        g_moveCount++;
-        fprintf(stdout, "Move: %s\n\n", move.c_str());
-    }
-    else fprintf(stdout, "Move rejected\n\n");
-    fprintf(stdout, "%s\n", g_wchess->stringify_board().c_str());
-    // main() issues an initial set_move("start", 0), so the counter is 1 before White's first
-    // move: an odd count means White is to move, an even count means Black. Testing the
-    // counter for non-zero instead of its parity always printed "White's turn".
-    fprintf(stdout, "%s\n", g_moveCount % 2 ? "White's turn" : "Black's turn");
-}
-
-audio_async g_audio(30*1000);
-bool g_listening = false;
-std::vector<float> g_pcmf32;
-
-// Prompts on stdout and reads one whitespace-delimited token from stdin.
-//   first char 'q' -> returns false (quit)
-//   first char 'l' -> starts capturing: clears g_pcmf32, resumes and clears g_audio
-//   anything else  -> stops capturing and moves the recorded audio into g_pcmf32
-// Returns true for every command except quit. End-of-file or a read error on stdin is treated
-// as quit; previously the failed read left `input` empty, the function kept returning true
-// and the caller looped forever.
-bool read_input() {
-    std::string input;
-
-    fprintf(stdout, "[(l)isten/(p)ause/(q)uit]: ");
-    const bool have_input = static_cast<bool>(std::cin >> input);
-    fprintf(stdout, "\n");
-
-    // a successful extraction always yields at least one character
-    if (!have_input || input[0] == 'q') {
-        fprintf(stdout, "Quitting\n");
-        return false;
-    }
-
-    if (input[0] == 'l') {
-        if (!g_listening) {
-            fprintf(stdout, "Listening\n");
-            g_listening = true;
-            g_pcmf32.clear();
-            g_audio.resume();
-            g_audio.clear();
-        }
-        else fprintf(stdout, "Still listening\n");
-        return true;
-    }
-
-    if (g_listening) {
-        g_listening = false;
-        g_audio.get(0, g_pcmf32);
-        g_audio.pause();
-        fprintf(stdout, "Processing\n");
-    }
-    else fprintf(stdout, "Not listening\n");
-    return true;
-}
-
-// get_audio callback: blocks on read_input() for the next user command. Returns false when the
-// user quits; otherwise hands over the captured samples (moved out of g_pcmf32) or an empty
-// buffer when nothing was captured.
-bool get_audio(std::vector<float> & pcmf32_cur) {
-    if (!read_input()) {
-        return false;
-    }
-
-    if (g_pcmf32.empty()) {
-        pcmf32_cur.clear();
-    } else {
-        pcmf32_cur = std::move(g_pcmf32);
-    }
-
-    return true;
-}
-
-// Entry point: parses options, loads the whisper model, opens the capture device, builds the
-// decoding parameters and runs the interactive WChess loop until the user quits or no moves
-// remain. Returns 1 on argument, model or audio initialisation failure, 0 otherwise.
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    // NOTE(review): an unknown language exits with status 0 although it is an error
-    if (whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context_params cparams = whisper_context_default_params();
-    cparams.use_gpu = params.use_gpu;
-
-    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-    if (!ctx) {
-        fprintf(stderr, "%s: whisper_init_from_file_with_params() failed!\n", __func__);
-        return 1;
-    }
-
-    // init audio
-
-    if (!g_audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        // release the already-loaded model instead of leaking it on this error path
-        whisper_free(ctx);
-        return 1;
-    }
-
-    // decoding setup: greedy sampling, a single segment, no timestamps
-    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
-    wparams.offset_ms        = 0;
-    wparams.translate        = false;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.print_realtime   = false;
-    wparams.print_progress   = false;
-    wparams.print_timestamps = true;
-    wparams.print_special    = false;
-    wparams.no_timestamps    = true;
-
-    // NOTE(review): these values are hard-coded; the matching command-line options
-    // (max_tokens, audio_ctx, language, grammar_penalty) are parsed but not applied here
-    wparams.max_tokens       = 32;
-    wparams.audio_ctx        = 768; // partial encoder context for better performance
-
-    wparams.temperature     = 0.0f;
-    wparams.temperature_inc = 2.0f;
-    wparams.greedy.best_of  = 1;
-
-    wparams.beam_search.beam_size = 1;
-
-    wparams.language         = "en";
-
-    wparams.grammar_penalty = 100.0;
-
-    // points into params.context, which outlives the run() call below
-    wparams.initial_prompt = params.context.data();
-
-    WChess::callbacks cb;
-    cb.get_audio = get_audio;
-    cb.set_move = set_move;
-
-    WChess::settings s;
-    s.vad_ms = 2000;
-    s.prompt_ms = params.prompt_ms;
-    s.command_ms = params.command_ms;
-    s.vad_thold = params.vad_thold;
-    s.freq_thold = params.freq_thold;
-    s.print_energy = params.print_energy;
-
-    g_wchess.reset(new WChess(ctx, wparams, cb, s));
-    // prints the initial board; also bumps g_moveCount because the string is non-empty
-    set_move("start", 0);
-    g_wchess->run();
-
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
-
-    return 0;
-}
--- a/examples/wchess/wchess.wasm/CMakeLists.txt
+++ b/examples/wchess/wchess.wasm/CMakeLists.txt
@ -1,51 +0,0 @@
-# WebAssembly build of the wchess example.
-set(TARGET wchess.wasm)
-
-add_executable(${TARGET}
-    wchess.wasm.cpp
-    )
-
-include(DefaultTargetOptions)
-
-target_link_libraries(${TARGET} PRIVATE
-    common
-    wchess-core
-    )
-
-unset(EXTRA_FLAGS)
-
-# Optional single-file mode: adds SINGLE_FILE=1 to the link flags and, after the build,
-# copies the generated ${TARGET}.js to js/chess.js in the output directory.
-if (WHISPER_WASM_SINGLE_FILE)
-    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
-    message(STATUS "Embedding WASM inside chess.js")
-
-    add_custom_command(
-        TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy
-        ${CMAKE_BINARY_DIR}/bin/${TARGET}.js
-        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/js/chess.js
-        )
-endif()
-
-# Linker flags: embind, pthreads with a pool of 8, 1024MB of memory, the filesystem API,
-# and the runtime methods exported to JavaScript.
-set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
-    --bind \
-    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
-    -s TOTAL_MEMORY=1024MB \
-    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
-    ${EXTRA_FLAGS} \
-    ")
-
-
-# After the build, copy the bundled chessboardjs directory and the jQuery file next to the output.
-add_custom_command(
-        TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy_directory
-        ${CMAKE_CURRENT_SOURCE_DIR}/chessboardjs-1.0.0
-        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/
-        COMMAND ${CMAKE_COMMAND} -E copy
-        ${CMAKE_CURRENT_SOURCE_DIR}/jquery-3.7.1.min.js
-        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/js/
-    )
-
-# Generate index.html and helpers.js at configure time; @ONLY limits substitution to @VAR@ references.
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
-configure_file(${CMAKE_SOURCE_DIR}/examples/helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/js/helpers.js @ONLY)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	5031f54717	whisper : try to fix the parallel whisper_state functionality (#1479 ) * whisper : try to fix the parallel whisper_state functionality * whisper : fix multi-state Metal * whisper : free backend instances in whisper_state	2023-11-12 14:52:38 +02:00
Georgi Gerganov	40c66036b6	whisper : fix UB with measure buffers	2023-11-11 18:35:23 +02:00
Georgi Gerganov	fc8565d0e2	whisper : fixes	2023-11-11 17:39:30 +02:00
Georgi Gerganov	b618229340	whisper : factor out graph compute in common function	2023-11-11 17:06:21 +02:00
Georgi Gerganov	b27726da93	whisper : add note that ggml_mul_mat_pad does not work with CUDA	2023-11-11 13:04:58 +02:00
Georgi Gerganov	0867e696a7	whisper : avoid whisper_model_data wrapper	2023-11-11 11:46:54 +02:00
Georgi Gerganov	66bb2e9401	ggml : im2col opts	2023-11-11 10:41:00 +02:00
Georgi Gerganov	3bfc43e3e3	quantize-all : fix	2023-11-10 23:33:40 +02:00
Georgi Gerganov	f53e1388f5	whisper : clean-up	2023-11-10 22:31:44 +02:00
Georgi Gerganov	933c5bef97	whisper : support ggml_conv with CUDA and Metal (#1473 ) * ggml : add CUDA support for ggml_conv * whisper : remove ggml_repeat for conv bias + single backend * cuda : fix im2col kernel * metal : add im2col support + mul mat-vec f16 x f16 * bench-all : add q4 models	2023-11-10 22:26:50 +02:00
Georgi Gerganov	c99e290a7f	talk : fix compile warning	2023-11-10 13:54:02 +02:00
Georgi Gerganov	728e1785f0	Merge branch 'master' into ggml-backend-no-sched	2023-11-10 13:51:31 +02:00
Georgi Gerganov	d6dad64fbf	make : clean-up	2023-11-10 13:45:07 +02:00
Georgi Gerganov	a54d8c9dec	whisper : fix CoreML	2023-11-10 13:24:06 +02:00
Georgi Gerganov	0ab5025316	Merge branch 'master' into ggml-backend-no-sched	2023-11-10 13:21:47 +02:00
Georgi Gerganov	3f5c1b7ee0	whisper : print when CUDA is enabled	2023-11-10 13:17:02 +02:00
Georgi Gerganov	12030358ee	whisper : free backends + fix compile warning	2023-11-10 12:45:26 +02:00
Georgi Gerganov	dcf9511dbb	whisper : fix beam-search with CUDA	2023-11-10 12:41:11 +02:00
Georgi Gerganov	3dfbe64911	whisper : fix tensor allocation during load	2023-11-10 11:51:55 +02:00
Georgi Gerganov	7e01486b61	whisper : fix logit reading	2023-11-10 11:02:29 +02:00
Georgi Gerganov	659757329d	whisper : migrate to ggml-backend	2023-11-10 10:54:06 +02:00