Compare commits


54 Commits

Author SHA1 Message Date
3c50be2217 whisper : remove comment 2023-09-10 13:27:06 +03:00
37de5dcf2b command : enable beam-search, add "no_timestamps", add "context", add p 2023-09-10 12:22:57 +03:00
7a2abb311d grammars : add assistant + update comments 2023-09-09 20:24:58 +03:00
54d168db67 command : grammar-related improvements
- option to read grammar from file
- add sample grammars for colors and chess moves
- fine-tune the performance further
2023-09-09 20:05:57 +03:00
b8f34d1ed7 whisper : fine-tuning grammar functionality 2023-09-06 17:05:05 +03:00
97ebb48b99 command : fix exception when recognizing the command 2023-09-06 15:05:17 +03:00
b0306cd5cf build : fix after master merge 2023-09-06 14:28:25 +03:00
afc84b35b0 Merge branch 'master' into HEAD 2023-09-06 13:33:34 +03:00
c3f319d7c2 ggml : sync latest llama.cpp (view_src + alloc improvements) (#1247)
* ggml : sync latest llama.cpp (view_src + alloc improvements)

* ggml : fix build
2023-09-05 20:57:27 +03:00
ba3c333611 make : improve cpuinfo handling on x86 hosts (#1238)
* make : simplify and correct x86 ISA extensions detection on the host

It got broken in commit c5f9acf4b7 for Haiku and Mac OS (Intel),
which report CPU features in upper case.

Now we find the names in a case-insensitive manner and as whole words.
SSE3 detection has been corrected for Linux, which uses PNI for that
(Prescott New Instructions).

* make : use dmesg.boot in FreeBSD/DragonFlyBSD to detect x86 ISA extensions on the host

* make : enable x86 ISA extensions on the host both in CFLAGS and CXXFLAGS

* make : correct AVX x86 ISA extension detection on macOS (Intel) host

It got broken in commit c5f9acf4b7.  macOS calls it AVX1.0.
2023-09-05 14:58:47 +03:00
59a3d0cb57 ggml : sync (ggml-alloc, GPU, eps, etc.) (#1220)
* ggml : sync (ggml-alloc, GPU, eps, etc.)

* ggml : fix build

* wasm : fix build
2023-09-05 13:54:40 +03:00
6780c98e19 readme : update CMake build commands (#1231)
* Update README.md

* Update README.md: `vcpkg install opencl clblast`

* readme : update build commands

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-09-05 13:53:34 +03:00
2f52783a08 OSSRH_USERNAME -> JIRA_USER 2023-08-31 14:54:02 +10:00
7dec9d8cc4 build-root-directory: bindings/java 2023-08-31 12:04:16 +10:00
31476ccc0e whisper : add grammar-based sampling 2023-08-30 20:00:31 -04:00
fb0a24fba2 ci : enable java package publishing (#1228) 2023-08-31 09:57:43 +10:00
8e30bf3c02 ggml : fix compilation errors incurred by -Werror (#1227)
The -Werror warning option turns all warnings into errors. This PR makes
the compiler happy to build ggml.c and whisper.cpp with the stricter option.
2023-08-30 22:09:15 +03:00
99d3c105f5 whisper.android : fix cmake multiple libraries build (#1224)
* whisper.android : fix multiple libraries build

* fix flags for default target
2023-08-30 14:45:13 +03:00
18e9889418 coreml : wrap inference call in @autoreleasepool to fix memory leak (#1218) 2023-08-29 15:44:38 +03:00
8e46ba80d3 make : use cpuinfo in MSYS2 to enable x86 ISA extensions on the host (#1216) 2023-08-28 13:28:26 +03:00
b0d35995c4 make : add support for building on DragonFlyBSD/NetBSD/OpenBSD (#1212) 2023-08-27 21:38:46 +03:00
25466aa1c3 ggml : fix compiling when SSE3 is available but not SSSE3 (#1210)
It got broken in commit 3998465721.
2023-08-27 21:37:31 +03:00
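To illustrate the class of problem this fixes, a minimal, hypothetical C++ sketch (not the actual patch): SSE3 and SSSE3 are distinct ISA extensions, so SSSE3-only headers and intrinsics must be guarded separately, or a build compiled with -msse3 but without -mssse3 breaks.

```cpp
// Hypothetical sketch, not the actual patch: guard each extension by
// its own predefined macro, since a host can provide SSE3 without SSSE3.
#if defined(__SSE3__)
#include <pmmintrin.h>   // SSE3 intrinsics
#endif
#if defined(__SSSE3__)
#include <tmmintrin.h>   // SSSE3 intrinsics (e.g. _mm_shuffle_epi8)
#endif
```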
601c2d2181 ggml : detect SSSE3 (#1211)
* ggml : add ggml_cpu_has_ssse3

* whisper : show SSSE3 in system info

* make : detect SSSE3 via cpuinfo
2023-08-27 21:36:41 +03:00
175ffa64ee examples : vim plugin and LSP server (#1144)
* Initial proof of concept Vim plugin

At present, this is likely only slightly better than feature parity with
the existing whisper.nvim

Known issues:
 Trailing whitespace
 Up to an existing length (5 seconds) of speech may be processed when
  listening is enabled
 CPU cycles are spent processing speech even when not listening.

Fixing these issues is likely dependent upon future efforts to create a
dedicated library instead of wrapping examples/stream

* Support $WHISPER_CPP_HOME environment variable

A minor misunderstanding of the whisper.nvim implementation resulted in
a plugin that was functional, but not a drop-in replacement, as it should
now be.

* Initial progress on LSP implementation

Libcall is nonviable because the library is immediately freed after a
call is made. Further investigation has shown Language Server Protocol as
a promising alternative that both simplifies the required logic on the
vimscript side and increases the ease with which plugins for other
editors could be made in the future. This is a very large undertaking
and my progress has slowed substantially.

Work is far from being in a usable state, but I wish to keep track of
major refactors for organizational purposes.

* Rewrite audio windowing of guided transcription

One of the defining goals of this venture is allowing consecutive
commands to be rattled off without the deadzones of the current
implementation.

* Add unguided_transcription. Cleanup.

The unguided transcription implementation heavily borrows from existing
example implementations and the guided_transcription logic.

A high level pass was done to check that method arguments are accurate
to what inputs are actually required.

A first attempt at cancellation support was added for record keeping,
but will be deleted in a future commit.

* Fix compilation.

Resolves a large number of compilation errors.
No testing has been done yet for execution errors.

Update Makefile and .gitignore

* Functional unguided_transcription

* Functional guided_transcription

Fix commandset_list being passed by value
Properly register the first token of a multitoken command

* Minor changes before time fix

I've apparently made an awfully major mistake in thinking that unix time
was in milliseconds and will be changing all timekeeping code to use
standardized methods.

In preparation for this, a number of minor bugfixes are included:
Output is manually flushed.
An echo method has been added.
registerCommandset now wraps the returned index.

* Swap timekeeping to use std::chrono

* Add work in progress lsp backed whisper.vim plugin

Current progress blockers are
 Adding modality awareness to the command processing
  (specifically, motion prompting)
 Improving the VAD to be a little more responsive
  (testing start of activity)

* Reworked vim plugin command loop

* Fix change inside

Multiple bug fixes that, crucially, bring the plugin to the point where a
demonstration video is possible

Add better echo messaging so whisper_log isn't required
 Add loading complete message as indicator when listening has started
Insert/append are actually included in command sets
Some more heavy-handed corrections to prevent a double exit when leaving
insert mode
As a somewhat hacky fix, the very first space is removed when inserting.
 This cleans up most use cases, but leaves me unsatisfied with the few
 cases where it would be desired.

* Forcibly set commandset_index to 0 after subinsert

Also remove unnecessary ! to use builtin vim command

* Fix upper

A minor scope mistake was causing upper'd inputs to be eaten.
This was fixed and echoing was slightly improved for clarity.

* Fix formatting

Corrects indentation to 4 spaces as project standard
Slightly better error support for malformed json input

* Remove obsolete vim plugin

* Add json.hpp library

The same library that is used for the llama.cpp server

* Minor cleanups

Add lsp to the make clean directive.
Remove a redundant params definition.
Reorder whisper.vim logging for subtranscriptions.
Corrections to unlets (variables of argument scope appear immutable).

* Fix indentation. Fallback for subTranscription

Indentation has been changed to 4 spaces.

Unit testing has been set up; I'm opting not to include it in the
repository for now.
It has, however, revealed a bug in the state logic where a
subtranscription can be initiated without having a saved command.
When this occurs, append is added as a fallback.

* Move audio polling logic to a subfunction

While work on the improved VAD will continue, it's grown to be a little
out of scope. Instead, a future commit will perform multiple detection
passes at substretches of audio when a backlog of audio exists.

To facilitate this, and prevent code duplication, the vad code has been
moved into a subfunction shared by both the unguided and guided
transcription functions.

* Test for voice over subchunks if backlog > 1s

As the existing VAD implementation only checks for a falling edge at the
end of an audio chunk, it fails to detect voice in cases where the
recorded voice is only at the beginning of the audio.

To ameliorate this, when the timestamp would cause analysis of audio
over a second in length, it is split into 1 second length subchunks
which are individually tested.

Results are promising, but there seems to be a remaining bug with
unguided transcription, likely related to saving context.

* Limit the maximum length of audio input.

The existing VAD implementation only detects falling edges, which
means any gap in the user's speech is processed for transcription.
This simply establishes a constant maximum length depending on the type
of transcription: unguided gets a generous 10 seconds and guided, 2.

While quick testing showed that commands are generally around half a
second to a second, limiting commands to an even second resulted in
extreme degradation of quality (seemingly always the same output for a
given commandset).

* Unguided timestamp tracking, cleanup

Unguided transcriptions were not set up to allow passing timestamp
data forward, but this has been corrected.

No_context is now always set to false. While conceptually desirable for
the quality of guided transcription, it was seemingly responsible for
prior command inputs ghosting in unguided transcription.

Save and Run are now tracked by command number instead of command text.
While command_text was provided for convenience, I wish to keep the
command index authoritative. This gives greater consistency and
potentially allows end users to rename or even translate the spoken
versions of these commands.

* By default, maintain mode.

Previously, mode was reset to 0 unless otherwise set.
In addition to causing some edge cases, this didn't mesh well with
the existing approach to visual mode.

With this change, initial tests indicate visual mode is functional.

* Add undo breaks before subtranscriptions

Subtranscriptions use undo as a hack to allow for partial responses to
be displayed. However, scripts don't cause an undo break mid execution
unless specifically instructed to. This meant that multiple
unguided transcriptions from a single session would cause a later one to
undo an earlier one.

This is now fixed and undo should be reasonably usable as a command.

* Append instead of insert for new undo sequence

When entering and leaving insert mode with `i`, the cursor shifts one
column to the left. This is remedied by using append instead of insert
for setting these breaks in the undo sequence

`-` was also added to the pronunciation dictionary to be pronounced as
minus as it was causing a particularly high failure rate.

* Move undo sequence breaks to command execution

Previously, undo sequence breaks were triggered when there was a command
that caused a move to insert mode. This caused commands that changed
state (like delete or paste) to be bundled together into the last
command that caused text to be entered.

* Fix repeat. Add space, carrot, dollar commands

 Repeat (.) wasn't being tracked properly, just like undo, and is being
 manually tracked now.

 While efforts have been made to properly handle spaces, it was
 particularly finicky to add a single space when one is needed. A
 special 'space' command has been added to insert a single space and move
 the cursor after it.

 Carrot and Dollar commands have been added for start of line and end of
 line respectively. These are both simple to implement, and just a
 matter of defining a pronunciation.

* Return error on duplicate in commandset

Not every command in the commandset tokenizes to a single token.
Because of this, it's possible that two commands could resolve to
the same single token after subsequent tokens are discarded.

This commit adds a simple check for duplicates when a commandset is
registered and returns an error if so (a sketch of the idea follows
this commit log entry).

Additional code will be required later on the vim side to actually
process this error.

* Add support for user-defined commands

This adds a user-definable dictionary from spoken keys to strings or
funcrefs. All keys are added to the commandlist and, when spoken, trigger
the corresponding function.

Like "save" and "run", these user commands are only available when the
command buffer is empty.

* Add readme, update cmake

* Add area commandset. Refactor spoken_dict

Area commands (inside word, around sentence...) have been given a
commandset as considered earlier.

Verbose definitions for spoken_dict entries now use dicts instead of
lists. This shortens the definition for most keys that require it and
scales better with the addition of further commandsets

* Add mark, jump. Fix change under visual.

Mark (m) and jump (') have been added.

When a visual selection is executed with a command that initiates a
subtranscription (change), the area of the visual selection is not
properly tracked, which causes the attempt to stream in the partial
response to fail. This is solved by disabling partial transcriptions
from being streamed when a subtranscription is started while in visual
mode.

* Accommodate ignorecase. Fix change.

From testing on different, older versions of vim, the test for
distinguishing an 'R' replace all from an 'r' replace could fail if
ignorecase was set. The comparison has been changed to explicitly
require case matching.

Change detection has been moved to the execution section as it was missing the
change+motion case.

* Support registers. Fix README typo

There's no logic to prevent doubled register entry, but the functional
result is equivalent to typing the same key sequence into vim.

A minor typo in the readme: I'd misremembered the mnemonic for 't' as
'to' instead of 'till', but 'to' can't be used as it's a homophone of
'2'. While there was no mistake in the actual logic, using 'to' in the
readme was misleading.
2023-08-27 21:35:06 +03:00
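A minimal sketch of the duplicate check described in the "Return error on duplicate in commandset" entry above, using hypothetical names (the plugin's actual code differs): commands are matched by their first token only, so two phrases that share a first token must be rejected at registration time.

```cpp
// Hypothetical sketch: reject a commandset in which two commands
// collapse to the same first token after tokenization.
#include <set>
#include <vector>

// first_tokens holds the first token id of each spoken command.
// Returns -1 on a duplicate, 0 otherwise.
static int check_commandset(const std::vector<int> & first_tokens) {
    std::set<int> seen;
    for (const int tok : first_tokens) {
        if (!seen.insert(tok).second) {
            return -1; // two commands resolve to the same token
        }
    }
    return 0;
}
```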
cb5fb0a12d whisper : initial hipBLAS support (#1209) 2023-08-27 20:03:58 +03:00
b5bb5c85d4 whisper : allow whisper_full from mel spectrogram - no audio (#1214)
Co-authored-by: jbrough <jamie1612@gmail.com>
2023-08-27 20:02:57 +03:00
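A minimal usage sketch, assuming the `whisper_set_mel()` API and that calling `whisper_full()` with no audio samples now skips the internal spectrogram computation (the exact semantics are defined by the PR):

```cpp
// Sketch under the assumptions above: run inference directly on a
// precomputed log-mel spectrogram instead of raw PCM audio.
#include "whisper.h"

int run_on_mel(struct whisper_context * ctx,
               const float * mel, int n_len, int n_mel) {
    // Hand the precomputed spectrogram to the context ...
    if (whisper_set_mel(ctx, mel, n_len, n_mel) != 0) {
        return -1;
    }
    struct whisper_full_params params =
        whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    // ... then run the full pipeline with no audio samples, so the
    // internal mel computation is skipped.
    return whisper_full(ctx, params, nullptr, 0);
}
```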
7e54df414e whisper : significantly improve the inference quality (#1148)
* Fix MSVC compile error C3688

Instead of simply using 'add_compile_options(/utf-8)' to address the MSVC compile error C3688, a better approach would be to handle it in a way that prevents passing '/utf-8' to NVCC.

* Significantly improve inference quality

In the function `log_mel_spectrogram_worker_thread`, there's an array out-of-bounds issue occurring during the calculation of complex number moduli. This issue is causing disruptions in the FFT spectrum, which, in turn, is reducing the quality of inference.

* Significantly improve inference quality

At last, I've pinpointed the actual source of the problem. Given that the frequency spectrum generated from real input data is symmetrical around the Nyquist frequency, there's a for-loop within the `log_mel_spectrogram_worker_thread` function that attempts to fold the frequency spectrum. Regrettably, a bug within this for-loop is causing a frame shift in the frequency spectrum. The previous attempt to remedy this, which involved using `fft_size + 1` when calculating the modulus, was merely a band-aid solution and did not address the underlying issue.

* Addressed a few minor issues

Fixed the issue of `fft_out` continuously expanding. Resolved the fallback caused by using 'break' instead of `fft_in[j] = 0`.

* Significantly improve inference quality

Thanks for your patience everyone. It's finally sorted out. Now, the right side of the FFT spectrum is being flipped over to the left, and the amplitudes at corresponding positions on the left and right are added together (the spectrum on the left needs to be shifted by one position), then the average is calculated. FFT_OUT[0] is no longer discarded, making full use of the limited space to pack in more information. (A sketch of this folding follows this commit log entry.)

* Add annotation and performance improvement

* Calculate FFT only when fft_in are not all zero

* Some minor performance improvement

* Fixed a bug impacting inference quality

* The first version after all the analysis is completed.

* Fix some bugs and add debug mode

* Fixed several bugs

* Temporarily disable speed-up mode and add debug mode.

* Add debug mode

* Disable speed-up mode and add debug mode

* Fix CI error (#1)

* Fix error

* Fix error

* Fixed several bugs including [BLANK_AUDIO] problem

* Remove Hard-coded hann window

* Some Final Fix (#2)

* Fix error

* Fix error

* Probably the last commit

* Probably the last commit

* whisper : minor coding style changes

* whisper : remove debug from public API

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-27 19:51:33 +03:00
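A sketch of the spectrum folding described in the "Significantly improve inference quality" entry above. This is illustrative of the symmetry being exploited rather than the exact patch: for real input the FFT spectrum is symmetric around the Nyquist bin, so the right half can be folded onto the left and the mirrored values averaged, with bin 0 kept rather than discarded.

```cpp
// Illustrative sketch (not the exact patch): fold the symmetric
// halves of a real-input FFT power spectrum into n/2 + 1 bins.
static void fold_spectrum(const float * re, const float * im,
                          int n, float * power /* size n/2 + 1 */) {
    for (int k = 0; k <= n / 2; k++) {
        const int m = (n - k) % n; // mirrored bin (m == k at 0 and Nyquist)
        const float a = re[k] * re[k] + im[k] * im[k];
        const float b = re[m] * re[m] + im[m] * im[m];
        power[k] = 0.5f * (a + b); // average the mirrored powers
    }
}
```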
20a80972f4 whisper.android : migrate from ndk-build to CMake (#1204) 2023-08-27 19:35:16 +03:00
7ef3f3837e main : log probs to text file (#1205)
* token/probability file generated with -ls

* code comment cleaning

* main : indentations

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-27 19:09:06 +03:00
aad2dad38a whisper : minor fixes (#1154) 2023-08-27 19:02:00 +03:00
66f2078878 build : fix OpenBLAS detection under Arch Linux (#1173) 2023-08-25 19:26:34 +03:00
8ce20f0f3d make : fix Linux machines supporting AVX1 not AVX2 (#1162)
e.g. ancient CPU E5-2670 (v1)

See issue #1126

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-25 15:52:22 +03:00
c84cf87261 whisper : add precalculated values of sin/cos for speeding up FFT (#1142)
* Add sin/cos precalculated values to speedup FFT

* Update whisper.cpp

Co-authored-by: bobqianic <129547291+bobqianic@users.noreply.github.com>

* Update whisper.cpp

Co-authored-by: bobqianic <129547291+bobqianic@users.noreply.github.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: bobqianic <129547291+bobqianic@users.noreply.github.com>
2023-08-25 15:51:14 +03:00
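The idea behind the change, as a hedged sketch with an assumed table size (the actual constant and layout in whisper.cpp may differ): compute the twiddle factors once at startup so the FFT inner loops index a table instead of calling `sin`/`cos` repeatedly.

```cpp
// Sketch with a hypothetical table size: sin/cos values precomputed
// once and shared by all FFT invocations.
#include <cmath>

constexpr int kSinCosCount = 400; // hypothetical, e.g. the FFT size
constexpr double kPi = 3.14159265358979323846;

static float g_sin[kSinCosCount];
static float g_cos[kSinCosCount];

static void fill_sin_cos_table() {
    for (int i = 0; i < kSinCosCount; i++) {
        const double theta = 2.0 * kPi * i / kSinCosCount;
        g_sin[i] = (float) std::sin(theta);
        g_cos[i] = (float) std::cos(theta);
    }
}
```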
c5f9acf4b7 make : simplify Makefile (#1147)
* Simplify Architecture specific in Makefile

* unified OS specific check
2023-08-25 15:20:44 +03:00
7decc85eb7 cmake : fix PowerPC build failures introduced in #1174 (#1196) 2023-08-25 15:19:48 +03:00
21e8c67a4f Fix AVX etc. under GCC/CMake (#1174) 2023-08-19 21:39:03 +03:00
a4bb2df36a quantize : fix load vocab crash when len is 128 (#1160)
* quantize : fix load vocab crash when len is 128

* ci : add quantize job
2023-08-06 11:04:42 +03:00
b948361956 examples : add tinydiarization support for streaming (#1137) 2023-08-03 11:24:07 +03:00
a792c4079c cmake : fix MSVC compile error C3688 (#1136)
Instead of simply using 'add_compile_options(/utf-8)' to address the MSVC compile error C3688, a better approach would be to handle it in a way that prevents passing '/utf-8' to NVCC.
2023-07-26 18:57:25 +03:00
7b374c9ac9 Revert "cmake : fix MSVC compile error C3688 on non-unicode Windows (#1110)"
This reverts commit fe5c1a7341.
2023-07-26 10:25:09 +03:00
a32c4aa482 whisper : fix visibility warning of struct whisper_full_params by declaring in advance (#1124) 2023-07-25 19:15:57 +03:00
a195bf899a cmake : enable OpenBLAS on Windows (#1128)
Fixed the issue of OpenBLAS not being found on the Windows platform. Even though the previously released binary was named whisper-blas-bin-x64.zip, BLAS was not actually enabled. With it enabled, inference speed can increase by 3-4 times.
2023-07-25 19:15:08 +03:00
ded17dc1cf make : fix CLBlast build on MacOS (#1120) 2023-07-25 19:12:03 +03:00
a0bb409f51 make : check nvcc version and set flag (#1115) 2023-07-25 19:10:54 +03:00
a2684cd93a go : implement SetSplitOnWord (#1114)
* Go binding: Implement SetSplitOnWord

* Add comment for consistency
2023-07-25 19:10:12 +03:00
1450346214 make : tests can be called as "make tests base.en" (#1113) 2023-07-25 19:09:38 +03:00
fe5c1a7341 cmake : fix MSVC compile error C3688 on non-unicode Windows (#1110)
Co-authored-by: Gang Chen <cg@upiot.net>
2023-07-25 19:08:37 +03:00
1fa360fc6e readme : add OpenVINO support details (#1112) 2023-07-25 19:07:59 +03:00
41bf19f613 opencl : sync opencl compilation fix in ggml (#1111) 2023-07-25 19:07:08 +03:00
9ad35bd740 samples : add a larger (30min) sample (#1092)
Co-authored-by: Vadim Peretokin <vadim.peretokin@carasent.com>
2023-07-25 19:00:45 +03:00
fabf79fc67 whisper : expose API to let user control log output (#1060)
* expose api to let user control log output

Add
  whisper_set_log_callback()
that lets user set a callback for log messages.

Change all the
  fprintf(stderr, ...)
to call via the above.

* whisper : add <cstdarg>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-25 18:58:25 +03:00
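A usage sketch of the new API, assuming the callback receives each formatted log line as a single string (as the commit message implies):

```cpp
// Sketch under the assumption above: redirect whisper.cpp's log
// output away from stderr.
#include <cstdio>
#include "whisper.h"

static void my_log(const char * line) {
    std::fprintf(stdout, "[whisper] %s", line); // lines arrive pre-formatted
}

int main() {
    whisper_set_log_callback(my_log);
    // ... load a model and run inference as usual ...
    return 0;
}
```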
925915ae37 whisper : move progress calculation out of whisper.cpp (#1081)
The current `progress_step` was hardcoded into whisper.cpp; this resulted in
bindings only being able to observe progress at that step, even though the
progress callback was being called at every iteration.

With this change we get finer-grained progress reporting from whisper.cpp,
and bindings/implementations can define their own progress step.
2023-07-25 18:53:34 +03:00
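A sketch of what this enables on the consumer side, with the callback signature assumed from the `whisper.h` of this period: the callback now fires every iteration, and the binding picks its own reporting granularity.

```cpp
// Sketch (signature assumed): pick a custom progress step in the
// consumer instead of relying on a step hardcoded in whisper.cpp.
#include <cstdio>
#include "whisper.h"

static void on_progress(struct whisper_context * /*ctx*/,
                        struct whisper_state * /*state*/,
                        int progress, void * /*user_data*/) {
    if (progress % 10 == 0) { // report every 10%, our own choice
        std::printf("progress: %d%%\n", progress);
    }
}

// Before calling whisper_full():
//   params.progress_callback = on_progress;
```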
97f4a7fee0 examples : add Vim plugin (#1131)
* Initial proof of concept Vim plugin

At present, this is likely only slightly better than feature parity with
the existing whisper.nvim

Known issues:
 Trailing whitespace
 Up to an existing length (5 seconds) of speech may be processed when
  listening is enabled
 CPU cycles are spent processing speech even when not listening.

Fixing these issues is likely dependent upon future efforts to create a
dedicated library instead of wrapping examples/stream

* Support $WHISPER_CPP_HOME environment variable

A minor misunderstanding of the whisper.nvim implementation resulted in
a plugin that was functional, but not a drop-in replacement, as it should
now be.
2023-07-25 18:34:23 +03:00
3998465721 ci : more platforms coverage (#1101)
* add multi platform

* add image name

* fix

* fix /bin/sh path

* add missing \

* add all platforms for check

* remove platforms

* remove s390x

* - add arm v6
- format run cmd

* remove arm v6

* - bump checkout to v3
- use setup emsdk action
- add arch to all ubuntu jobs

* mymindstorm/setup-emsdk to v12

* add missing QEMU step

* add fail-fast: false for debug

* add freebsd

* remark all jobs except freebsd for test

* add sudo

* enable all tests again

* format

* check __AVX__ support before include immintrin.h

* try auto detect flag by cmake

* fix check for immintrin.h

* fix include check for immintrin.h

* Remove all platforms for sanitizer build except amd64

We have no clue why they failed.

---------

Co-authored-by: Alon Faraj <alon.faraj@mapcore.com>
2023-07-16 23:00:34 +03:00
53 changed files with 39686 additions and 5029 deletions


@@ -1,31 +1,41 @@
name: CI
on: [push, pull_request]
env:
ubuntu_image: "ubuntu:22.04"
jobs:
ubuntu-latest:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install libsdl2-dev
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Build
- name: Build ${{ matrix.arch }}
run: |
make
make stream
docker run --platform ${{ matrix.arch }} --rm \
-v ${{ github.workspace }}:/workspace \
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
apt update
apt install -y build-essential libsdl2-dev
make
make stream'
macOS-latest:
runs-on: macOS-latest
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
run: |
@@ -37,82 +47,104 @@ jobs:
make
make stream
freeBSD-latest:
runs-on: macos-12
steps:
- name: Clone
uses: actions/checkout@v3
- name: Build
uses: cross-platform-actions/action@v0.15.0
with:
operating_system: freebsd
version: '13.2'
run: |
sudo pkg update
sudo pkg install -y gmake sdl2
gmake
gmake stream
ubuntu-latest-gcc:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
build: [Debug, Release]
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Build ${{ matrix.arch }}
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Configure
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
- name: Build
run: |
make
ctest -L gh --output-on-failure
docker run --platform ${{ matrix.arch }} --rm \
-v ${{ github.workspace }}:/workspace \
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
apt update
apt install -y build-essential cmake libsdl2-dev
cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
make
ctest -L gh --output-on-failure'
ubuntu-latest-clang:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
build: [Debug, Release]
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Build ${{ matrix.arch }}
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Configure
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
- name: Build
run: |
make
ctest -L gh --output-on-failure
docker run --platform ${{ matrix.arch }} --rm \
-v ${{ github.workspace }}:/workspace \
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
apt update
apt install -y build-essential cmake libsdl2-dev
cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
make
ctest -L gh --output-on-failure'
ubuntu-latest-gcc-sanitized:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
arch: [linux/amd64]
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Build ${{ matrix.arch }}
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
- name: Configure
run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
- name: Build
run: |
make
ctest -L gh --output-on-failure
docker run --platform ${{ matrix.arch }} --rm \
-v ${{ github.workspace }}:/workspace \
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
apt update
apt install -y build-essential cmake
cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
make
ctest -L gh --output-on-failure'
windows:
runs-on: windows-latest
@@ -134,7 +166,7 @@
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
@@ -195,7 +227,7 @@
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
@@ -243,10 +275,10 @@
with:
name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
windows-cublas:
runs-on: windows-latest
strategy:
matrix:
build: [Release]
@@ -258,40 +290,40 @@
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Install CUDA Toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.10
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_CUBLAS=1
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
@@ -308,24 +340,16 @@
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
run: |
wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
tar -xvf master.tar.gz
emsdk-master/emsdk update
emsdk-master/emsdk install latest
emsdk-master/emsdk activate latest
- name: Setup emsdk
uses: mymindstorm/setup-emsdk@v12
- name: Configure
run: echo "tmp"
- name: Verify
run: emcc -v
- name: Build
run: |
pushd emsdk-master
source ./emsdk_env.sh
popd
emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
make
@@ -338,7 +362,7 @@
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Configure
run: |
@@ -347,7 +371,7 @@
- name: Build objc example
run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
- name: Build swiftui example
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphonesimulator build
@@ -356,14 +380,14 @@
steps:
- name: Clone
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Install Java
uses: actions/setup-java@v3
with:
distribution: zulu
java-version: 17
- name: Setup Android SDK
uses: android-actions/setup-android@v2
@@ -376,7 +400,7 @@
needs: [ 'windows' ]
runs-on: windows-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v3
- name: Install Java
uses: actions/setup-java@v1
@@ -402,11 +426,27 @@
name: whispercpp.jar
path: bindings/java/build/libs/whispercpp-*.jar
# - name: Publish package
# if: ${{ github.ref == 'refs/heads/master' }}
# uses: gradle/gradle-build-action@v2
# with:
# arguments: publish
# env:
# MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
# MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
- name: Publish package
if: ${{ github.ref == 'refs/heads/master' }}
uses: gradle/gradle-build-action@v2
with:
arguments: publish
build-root-directory: bindings/java
env:
MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
# MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
# MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
quantize:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v3
- name: Test quantize
run: |
./models/download-ggml-model.sh tiny.en
make quantize
./quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0

.gitignore

@@ -11,6 +11,7 @@ build/
build-em/
build-debug/
build-release/
build-rwdi/
build-static/
build-cublas/
build-no-accel/
@@ -24,6 +25,7 @@ build-sanitize-thread/
/talk-llama
/bench
/quantize
/lsp
arm_neon.h
sync.sh


@@ -65,6 +65,7 @@ else()
option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
endif()
@@ -136,22 +137,34 @@ if (WHISPER_OPENBLAS)
endif()
if (WHISPER_BLAS)
set(BLA_STATIC 1)
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
# set(BLA_PREFER_PKGCONFIG 1)
set(BLA_SIZEOF_INTEGER 8)
find_package(BLAS)
if (WIN32)
if(DEFINED ENV{OPENBLAS_PATH})
set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
message(STATUS "Libraries ${BLAS_LIBRARIES}")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
include_directories($ENV{OPENBLAS_PATH}/include)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else ()
message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
endif ()
else ()
set(BLA_STATIC 1)
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
# set(BLA_PREFER_PKGCONFIG 1)
set(BLA_SIZEOF_INTEGER 8)
find_package(BLAS)
if(BLAS_FOUND)
message(STATUS "BLAS compatible library found")
message(STATUS "Libraries ${BLAS_LIBRARIES}")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
include_directories(${BLAS_INCLUDE_DIRS})
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else()
message(WARNING "BLAS library was not found")
endif()
if(BLAS_FOUND)
message(STATUS "BLAS compatible library found")
message(STATUS "Libraries ${BLAS_LIBRARIES}")
find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
include_directories(${BLAS_INCLUDE_DIRS})
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else()
message(WARNING "BLAS library was not found")
endif()
endif ()
endif ()
if (WHISPER_CUBLAS)
@@ -179,6 +192,37 @@ if (WHISPER_CUBLAS)
endif()
endif()
if (WHISPER_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip)
find_package(hipblas)
find_package(rocblas)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
if (WHISPER_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()
if (WHISPER_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -237,20 +281,25 @@ message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
message(STATUS "PowerPC detected")
else()
message(STATUS "x86 detected")
if (MSVC)
if(NOT WHISPER_NO_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
if(NOT WHISPER_NO_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
endif()
endif()
else()
if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")

Makefile

@@ -12,6 +12,12 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
ifndef NVCC_VERSION
ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0)
NVCC_VERSION := $(shell nvcc --version | egrep -o "V[0-9]+.[0-9]+.[0-9]+" | cut -c2-)
endif
endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
@@ -51,19 +57,7 @@ endif
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$(UNAME_S))
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
@@ -71,66 +65,56 @@ endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
ifeq ($(UNAME_S),Darwin)
CFLAGS += -mf16c
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring FMA,$(AVX1_M)))
CFLAGS += -mfma
endif
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
CPUINFO_CMD := sysctl machdep.cpu.features
else ifeq ($(UNAME_S),Linux)
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell grep "fma " /proc/cpuinfo)
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
endif
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
CPUINFO_CMD := grep Features /var/run/dmesg.boot
else ifeq ($(UNAME_S),Haiku)
AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell sysinfo -cpu | grep "FMA ")
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell sysinfo -cpu | grep "F16C ")
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
endif
else
CFLAGS += -mfma -mf16c -mavx -mavx2
CPUINFO_CMD := sysinfo -cpu
endif
ifdef CPUINFO_CMD
AVX_M := $(shell $(CPUINFO_CMD) | grep -iwE 'AVX|AVX1.0')
ifneq (,$(AVX_M))
CFLAGS += -mavx
CXXFLAGS += -mavx
endif
AVX2_M := $(shell $(CPUINFO_CMD) | grep -iw 'AVX2')
ifneq (,$(AVX2_M))
CFLAGS += -mavx2
CXXFLAGS += -mavx2
endif
FMA_M := $(shell $(CPUINFO_CMD) | grep -iw 'FMA')
ifneq (,$(FMA_M))
CFLAGS += -mfma
CXXFLAGS += -mfma
endif
F16C_M := $(shell $(CPUINFO_CMD) | grep -iw 'F16C')
ifneq (,$(F16C_M))
CFLAGS += -mf16c
CXXFLAGS += -mf16c
endif
SSE3_M := $(shell $(CPUINFO_CMD) | grep -iwE 'PNI|SSE3')
ifneq (,$(SSE3_M))
CFLAGS += -msse3
CXXFLAGS += -msse3
endif
SSSE3_M := $(shell $(CPUINFO_CMD) | grep -iw 'SSSE3')
ifneq (,$(SSSE3_M))
CFLAGS += -mssse3
CXXFLAGS += -mssse3
endif
endif
endif
ifeq ($(UNAME_M),amd64)
CFLAGS += -mavx -mavx2 -mfma -mf16c
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
@@ -162,29 +146,56 @@ endif
endif
ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
endif
ifdef WHISPER_CUBLAS
ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
CUDA_ARCH_FLAG=native
else
CUDA_ARCH_FLAG=all
endif
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
WHISPER_OBJ += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=any
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif
ifdef WHISPER_HIPBLAS
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
WHISPER_OBJ += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif
ifdef WHISPER_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
LDFLAGS += -lclblast -lOpenCL
CXXFLAGS += -DGGML_USE_CLBLAST
LDFLAGS += -lclblast
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -framework OpenCL
else
LDFLAGS += -lOpenCL
endif
WHISPER_OBJ += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifdef WHISPER_GPROF
@@ -262,7 +273,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
clean:
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
#
# Examples
@@ -286,8 +297,11 @@ quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
@@ -309,6 +323,7 @@ samples:
@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
@echo "Converting to 16-bit WAV ..."
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
@@ -318,6 +333,8 @@ samples:
@rm samples/mm1.wav
@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
@rm samples/a13.mp3
@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
@rm samples/diffusion2023-07-03.flac
#
# Models
@@ -359,4 +376,4 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
.PHONY: tests
tests:
bash ./tests/run-tests.sh
bash ./tests/run-tests.sh $(word 2, $(MAKECMDGOALS))


@@ -22,6 +22,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
- [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
Supported platforms:
@@ -60,7 +61,7 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
- Various other examples are available in the [examples](examples) folder
The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
## Quick start
@@ -286,8 +287,8 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
WHISPER_COREML=1 make -j
# using CMake
cd build
cmake -DWHISPER_COREML=1 ..
cmake -B build -DWHISPER_COREML=1
cmake --build build -j --config Release
```
- Run the examples as usual. For example:
@@ -311,6 +312,85 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
## OpenVINO support
On platforms that support [OpenVINO](https://github.com/openvinotoolkit/openvino), the Encoder inference can be executed
on OpenVINO-supported devices including x86 CPUs and Intel GPUs (integrated & discrete).
This can result in significant speedup in encoder performance. Here are the instructions for generating the OpenVINO model and using it with `whisper.cpp`:
- First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.
Windows:
```
cd models
python -m venv openvino_conv_env
openvino_conv_env\Scripts\activate
python -m pip install --upgrade pip
pip install -r openvino-conversion-requirements.txt
```
Linux and macOS:
```
cd models
python3 -m venv openvino_conv_env
source openvino_conv_env/bin/activate
python -m pip install --upgrade pip
pip install -r openvino-conversion-requirements.txt
```
- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
```
python convert-whisper-to-openvino.py --model base.en
```
This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
is the default location that the OpenVINO extension will search at runtime.
- Build `whisper.cpp` with OpenVINO support:
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
Linux:
```bash
source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
```
Windows (cmd):
```
C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
```
And then build the project using cmake:
```bash
cmake -B build -DWHISPER_OPENVINO=1
cmake --build build -j --config Release
```
- Run the examples as usual. For example:
```bash
./main -m models/ggml-base.en.bin -f samples/jfk.wav
...
whisper_ctx_init_openvino_encoder: loading OpenVINO model from 'models/ggml-base.en-encoder-openvino.xml'
whisper_ctx_init_openvino_encoder: first run on a device may take a while ...
whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = GPU, cache_dir = models/ggml-base.en-encoder-openvino-cache
whisper_ctx_init_openvino_encoder: OpenVINO model loaded
system_info: n_threads = 4 / 8 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 1 |
...
```
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
cached for the next run.
For more information about the Core ML implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
## NVIDIA GPU support via cuBLAS
With NVIDIA cards the Encoder processing can to a large extent be offloaded to the GPU through cuBLAS.
@@ -338,11 +418,9 @@ make clean
WHISPER_CLBLAST=1 make -j
CMake:
cd whisper.cpp ; mkdir build ; cd build
cmake -DWHISPER_CLBLAST=ON ..
make clean
make -j
cp bin/* ../
cd whisper.cpp
cmake -B build -DWHISPER_CLBLAST=ON
cmake --build build -j --config Release
```


@@ -19,6 +19,10 @@ func (p *Params) SetTranslate(v bool) {
p.translate = toBool(v)
}
func (p *Params) SetSplitOnWord(v bool) {
p.split_on_word = toBool(v)
}
func (p *Params) SetNoContext(v bool) {
p.no_context = toBool(v)
}


@@ -81,6 +81,10 @@ func (context *context) SetSpeedup(v bool) {
context.params.SetSpeedup(v)
}
func (context *context) SetSplitOnWord(v bool) {
context.params.SetSplitOnWord(v)
}
// Set number of threads to use
func (context *context) SetThreads(v uint) {
context.params.SetThreads(int(v))


@@ -42,6 +42,7 @@ type Context interface {
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSpeedup(bool) // Set speedup flag
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters


@@ -1 +1 @@
"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:function(f){(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f)},postMessage:function(msg){parentPort.postMessage(msg)},performance:global.performance||{now:function(){return Date.now()}}})}var initializedJS=false;var pendingNotifiedProxyingQueues=[];function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var instance=new WebAssembly.Instance(Module["wasmModule"],info);receiveInstance(instance);Module["wasmModule"]=null;return instance.exports};self.onunhandledrejection=e=>{throw e.reason??e};self.onmessage=e=>{try{if(e.data.cmd==="load"){Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=function(){postMessage({cmd:"callHandler",handler:handler,args:[...arguments]})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module).then(function(instance){Module=instance})}else if(e.data.cmd==="run"){Module["__performance_now_clock_drift"]=performance.now()-e.data.time;Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();pendingNotifiedProxyingQueues.forEach(queue=>{Module["executeNotifiedProxyingQueue"](queue)});pendingNotifiedProxyingQueues=[];initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){if(ex instanceof Module["ExitStatus"]){if(Module["keepRuntimeAlive"]()){}else{Module["__emscripten_thread_exit"](ex.status)}}else{throw ex}}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="processProxyingQueue"){if(initializedJS){Module["executeNotifiedProxyingQueue"](e.data.queue)}else{pendingNotifiedProxyingQueues.push(e.data.queue)}}else if(e.data.cmd){err("worker.js received unknown command "+e.data.cmd);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}};
"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:f=>(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f),postMessage:msg=>parentPort.postMessage(msg),performance:global.performance||{now:Date.now}})}var initializedJS=false;function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var module=Module["wasmModule"];Module["wasmModule"]=null;var instance=new WebAssembly.Instance(module,info);return receiveInstance(instance)};self.onunhandledrejection=e=>{throw e.reason||e};function handleMessage(e){try{if(e.data.cmd==="load"){let messageQueue=[];self.onmessage=e=>messageQueue.push(e);self.startWorker=instance=>{Module=instance;postMessage({"cmd":"loaded"});for(let msg of messageQueue){handleMessage(msg)}self.onmessage=handleMessage};Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=(...args)=>{postMessage({cmd:"callHandler",handler:handler,args:args})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module)}else if(e.data.cmd==="run"){Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["__emscripten_thread_mailbox_await"](e.data.pthread_ptr);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){throw ex}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="checkMailbox"){if(initializedJS){Module["checkMailbox"]()}}else if(e.data.cmd){err(`worker.js received unknown command ${e.data.cmd}`);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}}self.onmessage=handleMessage;

File diff suppressed because one or more lines are too long


@@ -31,10 +31,10 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
/// var_1195 as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * var_1195;
/// var_1346 as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 NS_DESIGNATED_INITIALIZER;
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
@end


@@ -39,21 +39,21 @@
@implementation whisper_decoder_implOutput
- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 {
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
self = [super init];
if (self) {
_var_1195 = var_1195;
_var_1346 = var_1346;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"var_1195"]];
return [NSSet setWithArray:@[@"var_1346"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"var_1195"]) {
return [MLFeatureValue featureValueWithMultiArray:self.var_1195];
if ([featureName isEqualToString:@"var_1346"]) {
return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
}
return nil;
}
@@ -177,7 +177,7 @@
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
if (!outFeatures) { return nil; }
return [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[outFeatures featureValueForName:@"var_1195"].multiArrayValue];
return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
}
- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@@ -192,7 +192,7 @@
NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
for (NSInteger i = 0; i < outBatch.count; i++) {
id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[resultProvider featureValueForName:@"var_1195"].multiArrayValue];
whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
[results addObject:result];
}
return results;


@@ -53,9 +53,11 @@ void whisper_coreml_encode(
error: nil
];
whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
@autoreleasepool {
whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
}
}
#if __cplusplus


@@ -23,6 +23,7 @@ add_library(${TARGET} STATIC
common.cpp
common-ggml.h
common-ggml.cpp
grammar-parser.cpp
)
include(DefaultTargetOptions)
@@ -69,4 +70,5 @@ else()
add_subdirectory(quantize)
add_subdirectory(talk)
add_subdirectory(talk-llama)
add_subdirectory(lsp)
endif()


@@ -9,6 +9,7 @@
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "grammar-parser.h"
#include <sstream>
#include <cassert>
@@ -21,6 +22,11 @@
#include <vector>
#include <map>
bool file_exists(const std::string & fname) {
std::ifstream f(fname.c_str());
return f.good();
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -30,8 +36,12 @@ struct whisper_params {
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
float grammar_penalty = 100.0f;
grammar_parser::parse_state grammar_parsed;
bool speed_up = false;
bool translate = false;
@@ -44,6 +54,8 @@ struct whisper_params {
std::string fname_out;
std::string commands;
std::string prompt;
std::string context;
std::string grammar;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -73,6 +85,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-ctx" || arg == "--context") { params.context = argv[++i]; }
else if ( arg == "--grammar") { params.grammar = argv[++i]; }
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@@ -106,16 +121,30 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
fprintf(stderr, " -ctx, --context [%-7s] sample text to help the transcription\n", params.context.c_str());
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
fprintf(stderr, "\n");
}
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
const std::string & grammar_rule,
float & logprob_min,
float & logprob_sum,
int & n_tokens,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
logprob_min = 0.0f;
logprob_sum = 0.0f;
n_tokens = 0;
t_ms = 0;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
//whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
wparams.print_progress = false;
wparams.print_special = params.print_special;
@@ -123,19 +152,37 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.no_timestamps = params.no_timestamps;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.temperature = 0.4f;
wparams.temperature_inc = 1.0f;
wparams.greedy.best_of = 5;
wparams.beam_search.beam_size = 5;
wparams.initial_prompt = params.context.data();
const auto & grammar_parsed = params.grammar_parsed;
auto grammar_rules = grammar_parsed.c_rules();
if (!params.grammar_parsed.rules.empty() && !grammar_rule.empty()) {
wparams.grammar_rules = grammar_rules.data();
wparams.n_grammar_rules = grammar_rules.size();
wparams.i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule);
wparams.grammar_penalty = params.grammar_penalty;
}
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
@@ -144,19 +191,17 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const int n = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
if(token.plog > 0.0f) exit(0);
logprob_min = std::min(logprob_min, token.plog);
logprob_sum += token.plog;
++n_tokens;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
@@ -247,7 +292,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
fprintf(stderr, " ]\n");
}
std::string k_prompt = "select one from the available words: ";
std::string k_prompt = "select one from the available words: ";
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
if (i > 0) {
k_prompt += ", ";
@@ -415,7 +460,9 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
bool is_running = true;
bool ask_prompt = true;
float prob = 0.0f;
float logprob_min = 0.0f;
float logprob_sum = 0.0f;
int n_tokens = 0;
std::vector<float> pcmf32_cur;
@@ -453,7 +500,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
// detect the commands
audio.get(params.command_ms, pcmf32_cur);
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "", logprob_min, logprob_sum, n_tokens, t_ms));
const auto words = get_words(txt);
@@ -489,18 +536,27 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
// general-purpose mode
// freely transcribe the voice into text
int process_general_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;
float prob0 = 0.0f;
float prob = 0.0f;
float logprob_min0 = 0.0f;
float logprob_min = 0.0f;
float logprob_sum0 = 0.0f;
float logprob_sum = 0.0f;
int n_tokens0 = 0;
int n_tokens = 0;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string k_prompt = "Ok Whisper, start listening for commands.";
std::string k_prompt = "Ok Whisper, start listening for commands.";
if (!params.prompt.empty()) {
k_prompt = params.prompt;
}
fprintf(stderr, "\n");
fprintf(stderr, "%s: general-purpose mode\n", __func__);
@@ -533,9 +589,11 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
// wait for activation phrase
audio.get(params.prompt_ms, pcmf32_cur);
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "prompt", logprob_min0, logprob_sum0, n_tokens0, t_ms));
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float p = 100.0f * std::exp(logprob_min0);
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms, p = %.2f%%)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms, p);
const float sim = similarity(txt, k_prompt);
@@ -556,19 +614,30 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
// we have heard the activation phrase, now detect the commands
audio.get(params.command_ms, pcmf32_cur);
//printf("len prompt: %.4f\n", pcmf32_prompt.size() / (float) WHISPER_SAMPLE_RATE);
//printf("len command: %.4f\n", pcmf32_cur.size() / (float) WHISPER_SAMPLE_RATE);
// prepend 3 second of silence
pcmf32_cur.insert(pcmf32_cur.begin(), 3.0f*WHISPER_SAMPLE_RATE, 0.0f);
// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "root", logprob_min, logprob_sum, n_tokens, t_ms));
prob = 100.0f*(prob - prob0);
//const float p = 100.0f * std::exp((logprob - logprob0) / (n_tokens - n_tokens0));
const float p = 100.0f * std::exp(logprob_min);
//fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
for (size_t n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
if (n >= txt.size()) {
break;
}
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
@@ -581,9 +650,16 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
}
}
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
if (best_len == 0) {
fprintf(stdout, "%s: WARNING: command not recognized, try again\n", __func__);
} else {
// cut the prompt from the decoded text
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
}
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
}
@@ -648,12 +724,36 @@ int main(int argc, char ** argv) {
int ret_val = 0;
if (!params.commands.empty()) {
ret_val = process_command_list(ctx, audio, params);
} else if (!params.prompt.empty()) {
ret_val = always_prompt_transcription(ctx, audio, params);
} else {
ret_val = process_general_transcription(ctx, audio, params);
if (!params.grammar.empty()) {
auto & grammar = params.grammar_parsed;
if (file_exists(params.grammar.c_str())) {
// read grammar from file
std::ifstream ifs(params.grammar.c_str());
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
grammar = grammar_parser::parse(txt.c_str());
} else {
// read grammar from string
grammar = grammar_parser::parse(params.grammar.c_str());
}
// will be empty (default) if there are parse errors
if (grammar.rules.empty()) {
ret_val = 1;
} else {
fprintf(stderr, "%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, grammar);
fprintf(stderr, "\n");
}
}
if (ret_val == 0) {
if (!params.commands.empty()) {
ret_val = process_command_list(ctx, audio, params);
} else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
ret_val = always_prompt_transcription(ctx, audio, params);
} else {
ret_val = process_general_transcription(ctx, audio, params);
}
}
audio.pause();
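
The grammar hand-off above looks up a start rule by name: "prompt" while waiting for the activation phrase and "root" for the command itself. As a minimal sketch (the color-command grammar text below is illustrative, not a sample shipped in this diff), a grammar passed via `--grammar` could define both rules and be checked with the `grammar_parser` API added here:

```cpp
// Hypothetical GBNF grammar for the command example; parse it the same way
// main() above does, and verify the rule names transcribe() will look up.
#include "grammar-parser.h"
#include <cstdio>

int main() {
    const char * src =
        "root   ::= \" \" (\"red\" | \"green\" | \"blue\") \".\"\n"
        "prompt ::= \" Ok Whisper, start listening for commands.\"\n";

    grammar_parser::parse_state g = grammar_parser::parse(src);
    if (g.rules.empty()) { // parse() returns an empty state on error
        fprintf(stderr, "failed to parse grammar\n");
        return 1;
    }

    // the same lookups transcribe() performs via symbol_ids.at(...)
    fprintf(stderr, "rule id for \"root\":   %u\n", g.symbol_ids.at("root"));
    fprintf(stderr, "rule id for \"prompt\": %u\n", g.symbol_ids.at("prompt"));

    grammar_parser::print_grammar(stderr, g);
    return 0;
}
```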


@@ -1,3 +1,5 @@
#define _USE_MATH_DEFINES // for M_PI
#include "common.h"
// third-party utilities
@@ -13,53 +15,59 @@
#include <codecvt>
#include <sstream>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// Function to check if the next argument exists
std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
if (i + 1 < argc && argv[i + 1][0] != '-') {
return argv[++i];
} else {
fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
gpt_print_usage(argc, argv, params);
exit(0);
}
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-p" || arg == "--prompt") {
params.prompt = argv[++i];
params.prompt = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "-n" || arg == "--n_predict") {
params.n_predict = std::stoi(argv[++i]);
params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--top_k") {
params.top_k = std::max(1, std::stoi(argv[++i]));
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--top_p") {
params.top_p = std::stof(argv[++i]);
params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--temp") {
params.temp = std::stof(argv[++i]);
params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--repeat-last-n") {
params.repeat_last_n = std::stof(argv[++i]);
params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--repeat-penalty") {
params.repeat_penalty = std::stof(argv[++i]);
params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-b" || arg == "--batch_size") {
params.n_batch = std::stoi(argv[++i]);
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
params.model = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "-ip" || arg == "--interactive-port") {
params.interactive = true;
params.interactive_port = std::stoi(argv[++i]);
params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
exit(0);
} else if (arg == "-f" || arg == "--file") {
if (++i > argc) {
fprintf(stderr, "Invalid file param");
break;
}
get_next_arg(i, argc, argv, arg, params);
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -70,7 +78,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.prompt.pop_back();
}
} else if (arg == "-tt" || arg == "--token_test") {
params.token_test = argv[++i];
params.token_test = get_next_arg(i, argc, argv, arg, params);
}
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -89,6 +97,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
@@ -755,3 +764,46 @@ float similarity(const std::string & s0, const std::string & s1) {
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-i" || arg == "--inp") {
params.fname_inp = argv[++i];
} else if (arg == "-o" || arg == "--out") {
params.fname_out = argv[++i];
} else if (arg == "-h" || arg == "--help") {
sam_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
sam_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void sam_print_usage(int argc, char ** argv, const sam_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -i FNAME, --inp FNAME\n");
fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str());
fprintf(stderr, " -o FNAME, --out FNAME\n");
fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}


@@ -11,7 +11,7 @@
#define COMMON_SAMPLE_RATE 16000
//
// CLI argument parsing
// GPT CLI argument parsing
//
struct gpt_params {
@@ -33,6 +33,8 @@ struct gpt_params {
bool interactive = false;
int32_t interactive_port = -1;
int32_t n_gpu_layers = 0;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -155,3 +157,20 @@ bool vad_simple(
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);
//
// SAM argument parsing
//
struct sam_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path
std::string fname_inp = "img.jpg";
std::string fname_out = "img.out";
};
bool sam_params_parse(int argc, char ** argv, sam_params & params);
void sam_print_usage(int argc, char ** argv, const sam_params & params);

examples/grammar-parser.cpp Normal file

@@ -0,0 +1,423 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from whisper.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<whisper_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({WHISPER_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum whisper_gretype start_type = WHISPER_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = WHISPER_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum whisper_gretype type = last_sym_start < out_elements.size()
? WHISPER_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({WHISPER_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({WHISPER_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<whisper_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({WHISPER_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({WHISPER_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
pos = parse_space(pos + 1, is_nested);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<whisper_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({WHISPER_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({WHISPER_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
bool is_char_element(whisper_grammar_element elem) {
switch (elem.type) {
case WHISPER_GRETYPE_CHAR: return true;
case WHISPER_GRETYPE_CHAR_NOT: return true;
case WHISPER_GRETYPE_CHAR_ALT: return true;
case WHISPER_GRETYPE_CHAR_RNG_UPPER: return true;
default: return false;
}
}
void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case WHISPER_GRETYPE_END: fprintf(file, "END"); break;
case WHISPER_GRETYPE_ALT: fprintf(file, "ALT"); break;
case WHISPER_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case WHISPER_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case WHISPER_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case WHISPER_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case WHISPER_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
}
switch (elem.type) {
case WHISPER_GRETYPE_END:
case WHISPER_GRETYPE_ALT:
case WHISPER_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case WHISPER_GRETYPE_CHAR:
case WHISPER_GRETYPE_CHAR_NOT:
case WHISPER_GRETYPE_CHAR_RNG_UPPER:
case WHISPER_GRETYPE_CHAR_ALT:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != WHISPER_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with WHISPER_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
whisper_grammar_element elem = rule[i];
switch (elem.type) {
case WHISPER_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case WHISPER_GRETYPE_ALT:
fprintf(file, "| ");
break;
case WHISPER_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case WHISPER_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case WHISPER_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case WHISPER_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"WHISPER_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case WHISPER_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"WHISPER_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case WHISPER_GRETYPE_CHAR_ALT:
case WHISPER_GRETYPE_CHAR_RNG_UPPER:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const whisper_grammar_element *> parse_state::c_rules() const{
std::vector<const whisper_grammar_element *> ret;
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}
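
The repetition operators in `parse_sequence` are implemented purely by rewriting, per the `S*`/`S+`/`S?` rules in the comments above. A small sketch of what that desugaring produces (the `_1` suffix follows `generate_symbol_id`'s naming; exact ids depend on insertion order):

```cpp
// Parsing "root ::= [0-9]+" and printing the grammar should show the S+
// rewrite as a synthesized recursive rule, roughly:
//
//   root   ::= root_1
//   root_1 ::= [0-9] root_1 | [0-9]
#include "grammar-parser.h"
#include <cstdio>

int main() {
    grammar_parser::parse_state g = grammar_parser::parse("root ::= [0-9]+\n");
    grammar_parser::print_grammar(stderr, g);
    return 0;
}
```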

examples/grammar-parser.h Normal file

@@ -0,0 +1,29 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by whisper.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "whisper.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<whisper_grammar_element>> rules;
std::vector<const whisper_grammar_element *> c_rules() const;
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}
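
`c_rules()` bridges the C++ `parse_state` into the array-of-pointers form consumed by the grammar fields of `whisper_full_params` shown in the command example above (`grammar_rules`, `n_grammar_rules`, `i_start_rule`, `grammar_penalty`). A condensed sketch of that hand-off; note that the `parse_state` and the vector returned by `c_rules()` must outlive the decode, since `c_rules()` only collects `rule.data()` pointers:

```cpp
// Sketch, assuming the whisper_full_params grammar fields from this diff.
#include "grammar-parser.h"
#include "whisper.h"
#include <string>
#include <vector>

static void apply_grammar(
        whisper_full_params & wparams,
        const grammar_parser::parse_state & parsed,
        std::vector<const whisper_grammar_element *> & rules, // must stay alive
        const std::string & start_rule,
        float penalty) {
    rules = parsed.c_rules();
    wparams.grammar_rules   = rules.data();
    wparams.n_grammar_rules = rules.size();
    wparams.i_start_rule    = parsed.symbol_ids.at(start_rule);
    wparams.grammar_penalty = penalty;
}
```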


@@ -0,0 +1,9 @@
if (WHISPER_SDL2)
# stream
set(TARGET lsp)
add_executable(${TARGET} lsp.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif ()

examples/lsp/README.md Normal file

@@ -0,0 +1,104 @@
# Language Server
This example consists of a simple language server that exposes both unguided
and guided (command) transcription by exchanging JSON messages over stdin/stdout,
as well as a rather robust vim plugin that makes use of the language server.
## Vim plugin quick start
Compile the language server with
```bash
make lsp
```
Install the plugin itself by copying or symlinking whisper.vim into ~/.vim/autoload/
In your vimrc, set the path of your whisper.cpp directory and optionally add some keybinds.
```vim
let g:whisper_dir = "~/whisper.cpp"
" Start listening for commands when Ctrl - g is pressed in normal mode
nnoremap <C-G> <Cmd>call whisper#requestCommands()<CR>
" Start unguided transcription when Ctrl - g is pressed in insert mode
inoremap <C-G> <Cmd>call whisper#doTranscription()<CR>
```
## Vim plugin usage
The vim plugin was designed to closely follow the mnemonics of vim.
`s:spoken_dict` is used to translate keys to their spoken form.
Keys corresponding to a string use that spoken value normally and when a motion is expected, but use the key itself when a character is expected.
Keys corresponding to a dict, like `i`, can have manual definitions given for each possible commandset.
0 is normal (insert), 1 is motion (inside), 2 is its usage as a single key ([till] i), and 3 is its usage in an area selection (s -> [around] sentence).
Some punctuation items, like `-`, are explicitly given pronunciations to prevent them from being picked up as punctuation instead of as an actual command word.
Not all commands tokenize to a single token, and this can interfere with interpretation. "yank", for example, takes multiple tokens and is correspondingly detected more accurately when only the first "ya" is used. While it could be changed to something else that is a single token ("copy"), value was placed on maintaining vim mnemonics.
Commands that would normally move the editor into insert mode (insert, append, open, change) will begin unguided transcription.
Unguided transcription will end when a speech segment ends in "exit".
The presence of punctuation can be designated by whether or not you add a pause between the previous speech segment and "exit".
Exiting only occurs if "exit" is the last word, so "Take the first exit on your right" would not cause transcription to end.
After a command is evaluated, the plugin will continue listening for the next command.
While in command mode, "Exit" will end listening.
A best-effort approach is taken to keep track of audio recorded while a previous chunk is still being processed, and to interpret it immediately afterwards, but the current voice detection still needs a fairly sizable gap to determine when a command has been spoken.
Log information is sent to a special `whisper_log` buffer and can be accessed with
```vim
:e whisper_log
```
## Vim plugin configuration
`g:whisper_dir`
A full path to the whisper.cpp repo. It can be expanded in the definition like so:
```vim
let g:whisper_dir = expand("~/whisper.cpp/")
```
(The WHISPER_CPP_HOME environment variable is also checked for users of the existing whisper.nvim script)
`g:whisper_lsp_path`
Can be used to manually set the path to the language server.
If not defined, it will be inferred from the above whisper_dir.
`g:whisper_model_path`
A full path to the model to load. If not defined, it will default to ggml-base.en.bin.
`g:whisper_user_commands`
A dictionary of spoken commands that correspond to either strings or funcrefs.
This can be used to create connections with other user plugins, for example
```vim
let g:whisper_user_commands = {"gen": "llama#doLlamaGen"}
```
will trigger the llama.cpp plugin to begin generation when "gen" is spoken.
## Language server methods
`registerCommandset`
`params` is a list of strings that should be checked for with this commandset. The server prepends a space to these strings before tokenizing.
Responds with
`result.index` an integer index for the commandset registered, which should be included when initiating a guided transcription to select this commandset.
Will return an error if any of the commands in the commandset have duplicate tokenizations.
`guided`
`params.commandset_index` An index returned by a corresponding commandset registration. If not set, the most recently registered commandset is used.
`params.timestamp` A positive unsigned integer which designates the point in time from which audio should begin processing. If left blank, the start point of audio processing will be the moment the message is received. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.command_index` The numerical index (starting from 0) of the detected command in the selected commandset
`result.command_text` A string containing the command as provided in the commandset
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
`unguided`
`params.no_context` Sets the corresponding whisper `no_context` param. Defaults to true. Might provide more accurate results for consecutive unguided transcriptions if those after the first are set to false.
`params.prompt` If provided, sets the initial prompt used during transcription.
`params.timestamp` A positive unsigned integer which designates the point in time from which audio should begin processing. If left blank, the start point of audio processing will be the moment the message is received. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.transcription` A string containing the transcribed text. N.B. This will almost always start with a space due to how text is tokenized.
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
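
For concreteness, here is a hedged sketch of a client sending a `registerCommandset` request using the same `Content-Length` framing that `process_loop` in `lsp.cpp` parses (the command words are illustrative):

```cpp
// Minimal client-side request writer; pipe its output into the lsp binary.
#include "json.hpp"
#include <cstdio>
#include <string>

int main() {
    nlohmann::json req = {
        {"jsonrpc", "2.0"},
        {"id", 1},
        {"method", "registerCommandset"},
        {"params", {"up", "down", "left", "right"}}
    };
    const std::string body = req.dump();
    // header line, blank line, then the JSON body
    fprintf(stdout, "Content-Length: %zu\r\n\r\n%s", body.length(), body.c_str());
    fflush(stdout);
    return 0;
}
```

The server replies with the same framing, e.g. a `result.index` of 0 for the first commandset registered.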

examples/lsp/json.hpp Normal file

File diff suppressed because it is too large

examples/lsp/lsp.cpp Normal file

@@ -0,0 +1,458 @@
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "json.hpp"
#include <iostream>
#include <cassert>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <deque>
#include <set>
using json = nlohmann::json;
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 8000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
};
struct command {
std::vector<whisper_token> tokens;
std::string plaintext;
};
struct commandset {
std::vector<struct command> commands;
std::vector<whisper_token> prompt_tokens;
// TODO: Store longest command?
// Multi-token commands should have probabilities of subsequent logits
// given that the prior logit is correct.
// In this case, all commands must be iterated.
// This however, is likely highly involved as different tokens
// almost certainly have different spoken lengths
// It would also have performance implications equivalent to a beam search
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
}
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
using namespace std::chrono;
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
uint64_t start_time = time_now;
if (jparams.contains("timestamp")) {
start_time = jparams.at("timestamp");
}
if(time_now - start_time < 500) {
//wait for a backlog of audio
std::this_thread::sleep_for(milliseconds(500 - (time_now - start_time)));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
} else if (time_now - start_time > 1000) {
audio.get(time_now-start_time, pcmf32);
size_t max_offset = pcmf32.size() - WHISPER_SAMPLE_RATE;
for(size_t offset=0;offset < max_offset;offset+=WHISPER_SAMPLE_RATE/10) {
std::vector<float> audio_chunk(&pcmf32[offset], &pcmf32[offset+WHISPER_SAMPLE_RATE]);
if(::vad_simple(audio_chunk, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
pcmf32.resize(offset+WHISPER_SAMPLE_RATE);
if (offset*1000/WHISPER_SAMPLE_RATE+1000 > maxlength_ms) {
//remove samples from the beginning
pcmf32.erase(pcmf32.begin(),pcmf32.end()-(maxlength_ms*WHISPER_SAMPLE_RATE/1000));
fprintf(stderr, "Shortened samples");
}
return start_time + offset*1000/WHISPER_SAMPLE_RATE+1000;
}
}
}
size_t window_duration = std::max((uint64_t)1000, time_now-start_time);
audio.get(window_duration, pcmf32);
while (!::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
std::this_thread::sleep_for(milliseconds(100));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
window_duration = std::max((uint64_t)1000,time_now-start_time);
audio.get(window_duration, pcmf32);
}
if (time_now - start_time > maxlength_ms) {
audio.get(maxlength_ms, pcmf32);
} else {
audio.get(time_now - start_time, pcmf32);
}
return time_now;
}
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
std::vector<whisper_token> prompt_tokens;
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
if (jparams.contains("prompt")) {
// unlikely to see much use. Under normal circumstances, no_context would be set to false
std::string prompt = jparams.at("prompt");
prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, prompt.c_str(), prompt_tokens.data(), 1024);
prompt_tokens.resize(n);
wparams.prompt_tokens = prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.size();
}
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = jparams.value("no_context", true);
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}
};
}
std::string result = whisper_full_get_segment_text(ctx,0);
return json {
{"transcription", result},
{"timestamp", unprocessed_audio_timestamp}
};
}
// command-list mode
// guide the transcription to match the most likely command from a provided list
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
fprintf(stderr, "%s: Speech detected! Processing ...\n", __func__);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = 1;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
// TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts
wparams.prompt_tokens = cs.prompt_tokens.data();
wparams.prompt_n_tokens = cs.prompt_tokens.size();
// TODO: properly expose as option
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}//TODO: format string (sprintf?)
};
}
// estimate command probability
// NOTE: not optimal
{
const auto * logits = whisper_get_logits(ctx);
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
// compute probs from logits via softmax
{
float max = -1e9;
for (int i = 0; i < (int) probs.size(); ++i) {
max = std::max(max, logits[i]);
}
float sum = 0.0f;
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] = expf(logits[i] - max);
sum += probs[i];
}
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] /= sum;
}
}
std::vector<std::pair<float, int>> probs_id;
// In my testing, the most verbose token is always the desired.
// TODO: Trim commandset struct once efficacy has been verified
for (int i = 0; i < (int) cs.commands.size(); ++i) {
probs_id.emplace_back(probs[cs.commands[i].tokens[0]], i);
}
// sort descending
{
using pair_type = decltype(probs_id)::value_type;
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
return a.first > b.first;
});
}
int id = probs_id[0].second;
return json{
{"command_index", id},
{"command_text", cs.commands[id].plaintext},
{"timestamp", unprocessed_audio_timestamp},
};
}
}
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
// TODO: check for token collision
struct commandset cs;
std::string k_prompt = " select one from the available words: ";
std::set<whisper_token> token_set;
whisper_token tokens[32];
for (std::string s : jparams) {
std::vector<whisper_token> token_vec;
// The existing command implementation uses a nested for loop to tokenize single characters
// I fail to see the purpose of this when ' a' has a wholly different pronunciation than the start of ' apple'
const int n = whisper_tokenize(ctx, (" " + s).c_str(), tokens, 32);
if (n < 0) {
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, s.c_str());
return 3;
}
token_vec.push_back(tokens[0]);
if (!token_set.insert(tokens[0]).second) {
fprintf(stderr, "%s: warning: %s is a duplicate of an existing token\n", __func__, s.c_str());
throw json{
{"code",-31000},
{"message", "Duplicate token in token set: " + s}
};
}
if (n > 1) {// empty string if n=0? Should never occur
fprintf(stderr, "%s: error: command is more than a single token: %s\n", __func__, s.c_str());
}
struct command command = {token_vec, s};
cs.commands.push_back(command);
k_prompt += s;
}
k_prompt = k_prompt.substr(0,k_prompt.length()-2) + ". Selected word:";
cs.prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, k_prompt.c_str(), cs.prompt_tokens.data(), 1024);
cs.prompt_tokens.resize(n);
// prepare response
int index = commandset_list.size();
commandset_list.push_back(cs);
return json{{"index",index}};
}
json seek(struct whisper_context * ctx, audio_async &audio, json params) {
// whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this another pass once everything else is implemented,
// but for now, it's unsupported
throw json{
{"code", -32601},
{"message", "Seeking is not yet supported."}
};
}
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
// See: https://www.jsonrpc.org/specification
json id = body.at("id");
try {
std::string version = body.at("jsonrpc");
if (version != "2.0") {
// unsupported version
throw json{
{"code", -3260},
{"message", "invalid jsonrpc version"}
};
}
std::string method = body.at("method");
json jparams = json{{"dummy", "dummy"}};
if (body.contains("params"))
jparams = body.at("params");
json res;
// TODO: be consistent about argument order
fprintf(stderr, "Dispatching a job\n");
if (method == "unguided") { res = unguided_transcription(ctx, audio, jparams, params); }
else if (method == "guided") { res = guided_transcription(ctx, audio, params, jparams, commandset_list); }
else if (method == "seek") { res = seek(ctx, audio, jparams); }
else if (method == "registerCommandset") { res = register_commandset(ctx, jparams, commandset_list); }
else if (method == "echo") { res = jparams; }
return json{
{"jsonrpc", "2.0"},
{"result", res},
{"id", id}
};
} catch(json ex) {
return json {
{"jsonrpc", "2.0"},
{"error", ex},
{"id", id}
};
}
}
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
std::deque<json> jobqueue;
std::vector<struct commandset> commandset_list;
while (true) {
// For eventual cancellation support, shouldn't block if job exists
if (std::cin.rdbuf()->in_avail() > 22 || jobqueue.size() == 0) {
int content_length;
if (scanf("Content-Length: %d", &content_length) != 1) {
fprintf(stderr, "Could not read input: %d", std::cin.peek());
return;
}
// scanf leaves the new lines intact
std::cin.ignore(2);
if (std::cin.peek() != 13) {
// Content-Type. jsonrpc necessitates utf8.
std::cin.ignore(200,10);
}
std::cin.ignore(2);
// A message is being sent and blocking is acceptable
std::string content(content_length,'\0');
std::cin.read(&content[0], content_length);
json job = json::parse(content);
// TODO: Some messages(cancellation) should skip queue here
if (job.is_array()) {
// response must also be batched. Will implement later
// for (subjob : job.begin())
// TODO: At the very least respond with an unsupported error.
} else {
jobqueue.push_back(job);
}
}
assert(jobqueue.size() > 0);
json job = jobqueue.front();
json resp = parse_job(job, ctx, audio, params, commandset_list);
if (resp != "unfinished") {
jobqueue.pop_front();
// send response
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
std::cout.flush();
}
}
}
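// Messages on stdin/stdout use LSP-style framing: a Content-Length header, an
// optional Content-Type header, a blank line, then the JSON-RPC body. A
// hypothetical "guided" request would arrive as the bytes
//   Content-Length: 75\r\n
//   \r\n
//   {"jsonrpc":"2.0","id":2,"method":"guided","params":{"commandset_index":0}}
// (75 is the byte length of the JSON body). Note that when replying, the
// server adds 1 to Content-Length to account for the trailing newline it appends.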
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// TODO: Investigate why this is required. An extra second of startup latency is not great
// wait for 1 second to avoid any buffered noise
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
audio.clear();
// TODO: consider some sort of indicator to designate loading has finished?
// Potentially better for the client to just start with a non-blocking message (register commands)
process_loop(ctx, audio, params);
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

examples/lsp/whisper.vim (new file, 362 lines)

@ -0,0 +1,362 @@
if !exists("g:whisper_dir")
let g:whisper_dir = expand($WHISPER_CPP_HOME)
if g:whisper_dir == ""
echoerr "Please provide a path to the whisper.cpp repo in either the $WHISPER_CPP_HOME environment variable, or g:whisper_dir"
endif
endif
if !exists("g:whisper_lsp_path")
let g:whisper_lsp_path = g:whisper_dir .. "lsp"
if !filereadable(g:whisper_lsp_path)
echoerr "Was not able to locate a lsp executable at: " .. g:whisper_lsp_path
throw "Executable not found"
endif
endif
if !exists("g:whisper_model_path")
" TODO: allow custom paths relative to the repo dir
let g:whisper_model_path = g:whisper_dir .. "models/ggml-base.en.bin"
if !filereadable(g:whisper_model_path)
echoerr "Could not find model at: " .. g:whisper_model_path
throw "Model not found"
endif
endif
let s:output_buffer = bufnr("whisper_log", v:true)
call setbufvar(s:output_buffer,"&buftype","nofile")
let s:lsp_command = [g:whisper_lsp_path,"-m",g:whisper_model_path]
" For faster execution. TODO: server load multiple models/run multiple servers?
" let s:lsp_command = [g:whisper_lsp_path, "-m", g:whisper_dir .. "models/ggml-tiny.en.bin", "-ac", "128"]
" requestCommands([params_dict])
func whisper#requestCommands(...)
let l:req = {"method": "guided", "params": {"commandset_index": 0}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback", [l:req.params, 0])})
endfunction
" doTranscription([params_dict])
func whisper#doTranscription(...)
let l:req = {"method": "unguided", "params": {}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:transcriptionCallback", [function("s:insertText"),function("s:endTranscription")])})
endfunction
" For testing
func whisper#uppertest(cha)
echo tr(a:cha, s:c_lowerkeys, s:c_upperkeys)
endfunction
" (upper, exit, count, motion, command, insert/append, save run) "base"
" (upper, exit, count, motion, command, inside/around) "motion/visual"
" (upper, exit, count, motion, line, inside/around) "command already entered"
" (upper, exit, key, ) "from/till"
" upper and lower keys is used to translate between cases with tr
" Must be sunchronized
let s:c_lowerkeys = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./\""
let s:c_upperkeys = "!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?'"
let s:c_count = split("1234567890\"",'\zs')
let s:c_command = split("ryuogpdxcv.iam", '\zs')
let s:c_motion = split("wetf'hjklnb$^)",'\zs')
" object words: Word, Sentence, Paragraph, [, (, <, Tag, {. ", '
let s:c_area = split("wsp])>t}\"'",'\zs')
"Special commands.
let s:c_special_always = ["exit", "upper"]
let s:c_special_normal = ["save", "run", "space"]
" If not in dict, key is spoken word,
" If key resolves to string, value is used for normal/motion, but key for chars
" If key resolves to dict, {0: "normal",1: "motion",2:"single char",3: "area"}
" Missing entries fall back as follows {0: "required", 1: 0, 2: "key", 3: 0}
let s:spoken_dict = {"w": "word", "e": "end", "r": "replace", "t": {0: "till", 3: "tag"}, "y": "yank", "u": "undo", "i": {0: "insert", 1: "inside"}, "o": "open", "p": {0: "paste", 3: "paragraph"}, "a": {0: "append", 1: "around"}, "s": {0: "substitute", 3: "sentence"}, "d": "delete", "f": "from", "g": "go", "h": "left", "j": "down", "k": "up", "l": "right", "c": "change", "v": "visual", "b": "back", "n": "next", "m": "mark", ".": {0: "repeat", 2: "period"}, "]": {0: "bracket", 2: "bracket"}, "'": {0: "jump", 2: "apostrophe", 3: "apostrophe"}, '"': {0: 'register', 2: "quotation", 3: "quotation"}, "-": {0: "minus", 2: "minus"}, "$": {0: "dollar", 2: "dollar"}, "^": {0: "carrot", 2: "carrot"}, ")": {0: "sentence", 2: "parenthesis", 3: "parenthesis"}, "}": {0: "paragraph", 2: "brace", 3: "brace"}, ">": {0: "indent", 2: "angle", 3: "angle"}}
" Give this another pass. This seems overly hacky even if it's functional
let s:sub_tran_msg = ""
func s:subTranProg(msg)
if s:sub_tran_msg != ""
let s:sub_tran_msg = s:sub_tran_msg .. a:msg
if mode() !=? 'v'
exe "normal" "u" .. s:sub_tran_msg
endif
else
if s:command_backlog == ""
" this should not occur
call s:logCallback(0, "Warning: Encountered sub transcription without prior command")
let s:command_backlog = "a"
endif
if a:msg[0] == ' '
let s:sub_tran_msg = s:command_backlog .. a:msg[1:-1]
else
let s:sub_tran_msg = s:command_backlog .. a:msg
endif
if mode() !=? 'v'
exe "normal" s:sub_tran_msg
endif
endif
call appendbufline(s:output_buffer, "$", s:sub_tran_msg .. ":" .. string(a:msg ))
endfunction
func s:subTranFinish(params, timestamp)
let s:repeat_command = s:sub_tran_msg
" Visual selection is lot if used with streaming, so streaming of partial
" transcriptions is disabled in visual mode
if mode() ==? 'v'
exe "normal" s:sub_tran_msg
endif
let s:sub_tran_msg = ""
let s:command_backlog = ""
exe "normal a\<C-G>u"
let l:params = a:params
let l:params.timestamp = a:timestamp
if exists("l:params.commandset_index")
unlet l:params.commandset_index
endif
call whisper#requestCommands(a:params)
endfunction
func s:logCallback(channel, msg)
call appendbufline(s:output_buffer,"$",a:msg)
endfunction
func s:transcriptionCallback(progressCallback, finishedCallback, channel, msg)
let l:tr = a:msg.result.transcription
let l:ex_ind = match(tolower(l:tr),"exit", len(l:tr)-6)
" The worst case I've observed so far is " Exit.", which is 6 characters
if l:ex_ind != -1
call a:progressCallback(strpart(l:tr,0,l:ex_ind-1))
call a:finishedCallback(a:msg.result.timestamp)
else
call a:progressCallback(l:tr)
let req = {"method": "unguided", "params": {"timestamp": a:msg.result.timestamp, "no_context": v:true}}
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [a:progressCallback, a:finishedCallback])})
endif
endfunc
func s:insertText(msg)
exe "normal a" .. a:msg
endfunction
func s:endTranscription(timestamp)
call appendbufline(s:output_buffer, "$", "Ending unguided transcription")
endfunction
" If a command does not include a whole actionable step, attempting to execute
" it discards the remainder of things. There is likely a simpler solution,
" but it can be made functional now by storing a backbuffer until actionable
let s:command_backlog = ""
let s:repeat_command = ""
let s:preceeding_upper = v:false
func s:commandCallback(params, commandset_index, channel, msg)
let l:command_index = a:msg.result.command_index
let l:do_execute = v:false
let l:next_mode = a:commandset_index
let l:command = s:commandset_list[a:commandset_index][l:command_index]
call s:logCallback(0, string(a:msg) .. " " .. a:commandset_index .. " " .. l:command)
if l:command_index == 0
"exit
"if s:command_backlog == ""
call s:logCallback(0,"Stopping command mode")
echo "No longer listening"
let s:command_backlog = ""
return
"else
" Legacy code to clear an existing buffer with exit.
" Was found to be rarely desired and is better introduced as a
" standalone command (clear?)
" call s:logCallback(0,"Clearing command_backlog" .. s:command_backlog)
" let s:command_backlog = ""
" let s:preceeding_upper = v:false
" endif
elseif l:command_index == 1
" upper
let s:preceeding_upper = !s:preceeding_upper
elseif l:command == "save"
" save and run can only happen in commandset 0,
exe "w"
elseif l:command == "run"
exe "make run"
elseif l:command == "space"
exe "normal i \<ESC>l"
elseif has_key(s:c_user, l:command)
let Userfunc = s:c_user[l:command]
if type(Userfunc) == v:t_string
let Userfunc = function(Userfunc)
endif
call Userfunc()
else
if s:preceeding_upper
" Upper should keep commandset
let s:preceeding_upper = v:false
let l:visual_command = tr(l:command, s:c_lowerkeys, s:c_upperkeys)
else
let l:visual_command = l:command
endif
echo s:command_backlog .. " - " .. l:visual_command
let s:command_backlog = s:command_backlog .. l:visual_command
if a:commandset_index == 2 || a:commandset_index == 3
" single key, either completes motion, replace, or register
" Should move to execute unless part of a register
" Change will be caught at execute
if s:command_backlog[-2:-2] !=# '"'
call s:logCallback(0,"not register")
let l:do_execute = v:true
end
let l:next_mode = 0
" commandset index only matters for a/i
elseif (l:command == "a" || l:command == "i") && a:commandset_index == 1
" inside/around. Is commandset 3
let l:next_mode = 3
elseif l:command ==# '"'
let l:next_mode = 2
elseif index(s:c_count, l:command) != -1
let l:next_mode = a:commandset_index
elseif index(s:c_motion, l:command) != -1
if l:command == 't' || l:command == 'f' || l:command == "'"
" prompt single key
let l:next_mode = 2
else
let l:do_execute = v:true
let l:next_mode = 0
endif
elseif index(s:c_command, l:command) != -1
if index(["y","g","d","c"], s:command_backlog[-1:-1]) != -1 && s:command_backlog[-1:-1] != s:command_backlog[-2:-2] && mode() !=? 'v'
" need motion or repeated command
" Potential for bad state here if disparaging command keys are
" entered (i.e. yd), but vim can handle checks for this at exe
" And checking for cases like y123d would complicate things
let l:next_mode = 1
elseif index(["i","a","c", "o", "s"], l:command) != -1 || s:command_backlog[-1:-1] ==# 'R'
"'Insert' mode, do general transcription
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
elseif l:command == 'r' || l:command == 'm'
let l:next_mode = 2
elseif l:command == '.'
let l:next_mode = 0
let l:do_execute = v:true
let s:command_backlog = s:command_backlog[0:-2] .. s:repeat_command
else
if l:command ==? 'v'
let l:next_mode = 1
else
let l:next_mode = 0
endif
let l:do_execute = v:true
endif
else
throw "Invalid command state: " .. l:command .. " " .. a:commandset_index .. " " .. s:command_backlog
endif
endif
if l:do_execute
if mode() ==?'v' && l:next_mode == 0
let l:next_mode = 1
elseif match(s:command_backlog, 'c') != -1
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
endif
exe "normal" s:command_backlog
if index(s:c_motion + ["u"],l:command) == -1
exe "normal a\<C-G>u"
let s:repeat_command = s:command_backlog
call s:logCallback(0, s:command_backlog)
endif
let s:command_backlog = ""
endif
let l:req = {"method": "guided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.commandset_index = l:next_mode
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback",[a:params, l:next_mode])})
endfunction
func s:loadedCallback(channel, msg)
echo "Loading complete"
call s:logCallback(a:channel, a:msg)
endfunction
func s:registerCommandset(commandlist, is_final)
let req = {"method": "registerCommandset"}
let req.params = a:commandlist
call s:logCallback(0, join(a:commandlist))
call add(g:whisper_commandlist_spoken, a:commandlist)
if a:is_final
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:loadedCallback"})
else
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:logCallback"})
endif
endfunction
func s:registerAllCommands()
let l:normal = s:c_special_always + s:c_special_normal + s:c_count + s:c_command + s:c_motion + keys(s:c_user)
let l:visual = s:c_special_always + s:c_count + s:c_command + s:c_motion
" Currently the same as visual.
" let l:post_command = s:c_special_always + s:c_count + s:c_command + s:c_motion
let l:single_key = s:c_special_always + split(s:c_lowerkeys, '\zs')
let l:area = s:c_special_always + s:c_area
" Used only for compatibility with the testing script
let g:whisper_commandlist_spoken = []
let s:commandset_list = [l:normal, l:visual, l:single_key, l:area]
call s:registerCommandset(s:commandsetToSpoken(l:normal, 0), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:visual, 1), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:single_key, 2), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:area, 3), v:true)
endfunction
func s:commandsetToSpoken(commandset, spoken_index)
let l:spoken_list = []
for l:command in a:commandset
if has_key(s:spoken_dict, l:command)
let l:spoken_value = s:spoken_dict[l:command]
if type(l:spoken_value) == v:t_dict
if has_key(l:spoken_value, a:spoken_index)
let l:spoken_value = l:spoken_value[a:spoken_index]
else
if a:spoken_index == 2
let l:spoken_value = l:command
else
let l:spoken_value = l:spoken_value[0]
endif
endif
else
if a:spoken_index == 2
let l:spoken_value = l:command
endif
endif
else
let l:spoken_value = l:command
endif
call add(l:spoken_list, l:spoken_value)
endfor
return l:spoken_list
endfunction
" TODO: Check lifetime. If the script is resourced, is the existing
" s:lsp_job dropped and therefore killed?
" This seems to not be the case and I've had to deal with zombie processes
" that survive exiting vim, even though said behavior conflicts with my
" understanding of the provided documentation
let s:lsp_opts = {"in_mode": "lsp", "out_mode": "lsp", "err_mode": "nl", "err_io": "buffer", "err_buf": s:output_buffer}
if !exists("g:lsp_job")
if exists("g:whisper_user_commands")
let s:c_user = g:whisper_user_commands
else
let s:c_user = {}
endif
let g:lsp_job = job_start(s:lsp_command, s:lsp_opts)
if job_status(g:lsp_job) == "fail"
echoerr "Failed to start whisper job"
endif
call s:registerAllCommands()
endif


@ -59,6 +59,7 @@ struct whisper_params {
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t best_of = 2;
@ -69,6 +70,7 @@ struct whisper_params {
float logprob_thold = -1.00f;
bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
bool diarize = false;
@ -86,6 +88,7 @@ struct whisper_params {
bool print_colors = false;
bool print_progress = false;
bool no_timestamps = false;
bool log_score = false;
std::string language = "en";
std::string prompt;
@ -133,7 +136,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
@ -158,6 +162,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@ -187,7 +192,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
@ -211,6 +217,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
fprintf(stderr, "\n");
}
@ -218,6 +225,7 @@ struct whisper_print_user_data {
const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s;
int progress_prev;
};
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
@ -252,6 +260,14 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
return speaker;
}
void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
*progress_prev += progress_step;
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
}
}
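// with the default progress_step of 5, this prints a progress line at roughly every 5% of completion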
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
@ -476,6 +492,25 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
const int n_segments = whisper_full_n_segments(ctx);
// fprintf(stderr,"segments: %d\n",n_segments);
for (int i = 0; i < n_segments; ++i) {
const int n_tokens = whisper_full_n_tokens(ctx, i);
// fprintf(stderr,"tokens: %d\n",n_tokens);
for (int j = 0; j < n_tokens; j++) {
auto token = whisper_full_get_token_text(ctx, i, j);
auto probability = whisper_full_get_token_p(ctx, i, j);
fout << token << '\t' << probability << std::endl;
// fprintf(stderr,"token: %s %f\n",token,probability);
}
}
return true;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
int indent = 0;
@ -883,6 +918,7 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
@ -895,7 +931,7 @@ int main(int argc, char ** argv) {
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
whisper_print_user_data user_data = { &params, &pcmf32s };
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
// this callback is called on each new segment
if (!wparams.print_realtime) {
@ -903,6 +939,11 @@ int main(int argc, char ** argv) {
wparams.new_segment_callback_user_data = &user_data;
}
if (wparams.print_progress) {
wparams.progress_callback = whisper_print_progress_callback;
wparams.progress_callback_user_data = &user_data;
}
// example for abort mechanism
// in this example, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted
@ -967,6 +1008,12 @@ int main(int argc, char ** argv) {
const auto fname_lrc = fname_out + ".lrc";
output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
}
// output to score file
if (params.log_score) {
const auto fname_score = fname_out + ".score.txt";
output_score(ctx, fname_score.c_str(), params, pcmf32s);
}
}
}


@ -138,7 +138,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
// return false;
//}
char word[128];
char word[129];
for (int i = 0; i < n_vocab; i++) {
uint32_t len;


@ -47,6 +47,7 @@ struct whisper_params {
bool print_special = false;
bool no_context = true;
bool no_timestamps = false;
bool tinydiarize = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
@ -80,6 +81,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@ -113,6 +116,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, "\n");
}
@ -299,6 +303,8 @@ int main(int argc, char ** argv) {
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
// disable temperature fallback
//wparams.temperature_inc = -1.0f;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
@ -344,10 +350,19 @@ int main(int argc, char ** argv) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "] " + text;
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
output += " [SPEAKER_TURN]";
}
output += "\n";
printf("%s", output.c_str());
fflush(stdout);
if (params.fname_out.length() > 0) {
fout << "[" << to_timestamp(t0) << " --> " << to_timestamp(t1) << "] " << text << std::endl;
fout << output;
}
}
}


@ -1164,7 +1164,7 @@ static bool llama_eval_internal(
const llama_token * tokens,
const int n_tokens,
const int n_past,
const int n_threads) {
int n_threads) {
// enforce that the first token is BOS
if (n_past == 0 && tokens[0] != llama_token_bos()) {
@ -1190,6 +1190,8 @@ static bool llama_eval_internal(
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_embd/hparams.n_head;
const float eps = 5e-6f; // TODO: take from hparams
auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;
@ -1204,7 +1206,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
@ -1221,7 +1223,7 @@ static bool llama_eval_internal(
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, eps);
// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@ -1329,7 +1331,7 @@ static bool llama_eval_internal(
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, eps);
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@ -1367,7 +1369,7 @@ static bool llama_eval_internal(
// norm
{
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, eps);
// inpL = inpL*norm(broadcasted)
inpL = ggml_mul(ctx0, inpL, model.norm);
@ -1384,8 +1386,8 @@ static bool llama_eval_internal(
//inpL = ggml_soft_max_inplace(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
ggml_build_forward_expand (&gf, inpL);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
@ -2488,8 +2490,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
}
struct ggml_cgraph gf = ggml_build_forward(r);
gf.n_threads = n_threads;
ggml_graph_compute(lora_ctx, &gf);
ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
@ -2635,7 +2636,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kout3d->data = out;
@ -2655,7 +2655,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
ggml_graph_compute(cpy_ctx, &gf);
ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
ggml_free(cpy_ctx);
}
@ -2743,7 +2743,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kin3d->data = (void *) inp;
@ -2763,7 +2762,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
ggml_graph_compute(cpy_ctx, &gf);
ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
ggml_free(cpy_ctx);
}
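Throughout this llama.cpp sync, the removed gf.n_threads field is replaced by passing the thread count at compute time; a minimal sketch of the new call sequence (assuming a context ctx0, a graph gf, and an output tensor inpL as in the hunks above):

ggml_build_forward_expand (&gf, inpL);              // record the ops into the graph
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);  // thread count is now an explicit argument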


@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
@ -420,7 +420,6 @@ bool gpt2_eval(
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {};
gf.n_threads = n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@ -442,7 +441,7 @@ bool gpt2_eval(
// norm
{
// [ 768, N]
cur = ggml_norm(ctx0, inpL);
cur = ggml_norm(ctx0, inpL, 1e-5f);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
@ -589,7 +588,7 @@ bool gpt2_eval(
{
// norm
{
cur = ggml_norm(ctx0, inpFF);
cur = ggml_norm(ctx0, inpFF, 1e-5f);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
@ -644,7 +643,7 @@ bool gpt2_eval(
// norm
{
// [ 768, N]
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
@ -664,8 +663,8 @@ bool gpt2_eval(
//inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
ggml_build_forward_expand (&gf, inpL);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);


@ -379,6 +379,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
// TODO: sync latest version from ggml repo
bool gpt2_eval(
const gpt2_model & model,
const int n_threads,
@ -420,7 +421,6 @@ bool gpt2_eval(
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {};
gf.n_threads = n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@ -442,7 +442,7 @@ bool gpt2_eval(
// norm
{
// [ 768, N]
cur = ggml_norm(ctx0, inpL);
cur = ggml_norm(ctx0, inpL, 1e-5f);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
@ -589,7 +589,7 @@ bool gpt2_eval(
{
// norm
{
cur = ggml_norm(ctx0, inpFF);
cur = ggml_norm(ctx0, inpFF, 1e-5f);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
@ -644,7 +644,7 @@ bool gpt2_eval(
// norm
{
// [ 768, N]
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
@ -664,8 +664,8 @@ bool gpt2_eval(
//inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
ggml_build_forward_expand (&gf, inpL);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);


@ -18,6 +18,9 @@ android {
vectorDrawables {
useSupportLibrary true
}
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
}
buildTypes {
@ -42,8 +45,8 @@ android {
}
ndkVersion "25.1.8937393"
externalNativeBuild {
ndkBuild {
path 'src/main/jni/whisper/Android.mk'
cmake {
path = file("src/main/jni/whisper/CMakeLists.txt")
}
}
packagingOptions {


@ -1,26 +0,0 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper
include $(LOCAL_PATH)/Whisper.mk
include $(BUILD_SHARED_LIBRARY)
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_vfpv4
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -mfpu=neon-vfpv4
include $(BUILD_SHARED_LIBRARY)
endif
ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_v8fp16_va
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -march=armv8.2-a+fp16
include $(BUILD_SHARED_LIBRARY)
endif


@ -1 +0,0 @@
APP_STL := c++_static


@ -0,0 +1,53 @@
cmake_minimum_required(VERSION 3.10)
project(whisper.cpp)
set(CMAKE_CXX_STANDARD 11)
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
find_library(LOG_LIB log)
function(build_library target_name)
add_library(
${target_name}
SHARED
${SOURCE_FILES}
)
target_link_libraries(${target_name} ${LOG_LIB} android)
if (${target_name} STREQUAL "whisper_v8fp16_va")
target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
elseif (${target_name} STREQUAL "whisper_vfpv4")
target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
endif ()
if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_compile_options(${target_name} PRIVATE -O3)
target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
target_link_options(${target_name} PRIVATE -flto)
endif ()
endfunction()
build_library("whisper") # Default target
if (${ANDROID_ABI} STREQUAL "arm64-v8a")
build_library("whisper_v8fp16_va")
elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
build_library("whisper_vfpv4")
endif ()
include_directories(${WHISPER_LIB_DIR})


@ -1,18 +0,0 @@
WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
LOCAL_LDLIBS := -landroid -llog
# Make the final output library smaller by only keeping the symbols referenced from the app.
ifneq ($(APP_OPTIM),debug)
LOCAL_CFLAGS += -O3
LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
LOCAL_CFLAGS += -ffunction-sections -fdata-sections
LOCAL_LDFLAGS += -Wl,--gc-sections
LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL
LOCAL_LDFLAGS += -flto
endif
LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
LOCAL_CPPFLAGS += -std=c++11
LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
$(WHISPER_LIB_DIR)/whisper.cpp \
$(LOCAL_PATH)/jni.c

ggml-alloc.c (new file, 594 lines)

@ -0,0 +1,594 @@
#include "ggml-alloc.h"
#include "ggml.h"
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
//#define GGML_ALLOCATOR_DEBUG
//#define AT_PRINTF printf
#define AT_PRINTF(...) ((void)0)
struct hash_node {
struct ggml_tensor * t;
int n_children;
int n_views;
};
static size_t hash(void * p) {
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
}
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
size_t h = hash(t);
// linear probing
size_t i = h;
while (hash_table[i].t != NULL) {
if (hash_table[i].t == t) {
return &hash_table[i];
}
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
if (i == h) {
// hash table is full
GGML_ASSERT(false);
}
}
hash_table[i].t = t;
return &hash_table[i];
}
// TODO: GGML_PAD ?
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
assert(alignment && !(alignment & (alignment - 1))); // power of 2
size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
return offset + align;
}
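// e.g. buffer = 0x1000, offset = 36, alignment = 32:
// align = (32 - (0x1000 + 36) % 32) % 32 = 28, so the returned offset is 36 + 28 = 64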
struct free_block {
void * addr;
size_t size;
};
#define MAX_FREE_BLOCKS 128
struct ggml_allocr {
void * data;
size_t size;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size;
bool measure;
int parse_seq[GGML_MAX_CONCUR];
int parse_seq_len;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i] == NULL) {
alloc->allocated_tensors[i] = tensor;
return;
}
}
GGML_ASSERT(!"out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i] == tensor ||
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
alloc->allocated_tensors[i] = NULL;
return;
}
}
printf("tried to free tensor %s not found\n", tensor->name);
GGML_ASSERT(!"tensor not found");
}
#endif
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(alloc);
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
size_t max_avail = 0;
// find the best fitting free block besides the last block
int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_block = i;
best_fit_size = block->size;
}
}
AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return;
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr;
block->addr = (char*)block->addr + size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
alloc->n_free_blocks--;
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
tensor->data = addr;
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
size_t cur_max = (char*)addr - (char*)alloc->data + size;
if (cur_max > alloc->max_size) {
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i]) {
printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
}
}
printf("\n");
}
#endif
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
// the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it
return;
}
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
#endif
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
// check if ptr is at the end of the block
if ((char*)block->addr + block->size == ptr) {
block->size += size;
// check if we can merge with the next block
if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
block->size += alloc->free_blocks[i+1].size;
alloc->n_free_blocks--;
for (int j = i+1; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
return;
}
// check if ptr is at the beginning of the block
if ((char*)ptr + size == block->addr) {
block->addr = ptr;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
alloc->free_blocks[i-1].size += block->size;
alloc->n_free_blocks--;
for (int j = i; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
return;
}
}
// otherwise, add a new block
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
alloc->free_blocks[insert_pos].addr = ptr;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
}
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
for (int i = 0; i < n; i++) {
alloc->parse_seq[i] = list[i];
}
alloc->parse_seq_len = n;
}
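// e.g. list = {0, 2, -1, 1, 3, -1}: nodes 0 and 2 are allocated first, the -1
// barrier then releases their unused parents, then nodes 1 and 3 follow, and so on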
void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
alloc->free_blocks[0].size = alloc->size - align_offset;
}
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocr){
/*.data = */ data,
/*.size = */ size,
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ false,
/*.parse_seq = */ {0},
/*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {0},
#endif
};
ggml_allocr_reset(alloc);
return alloc;
}
// address and size of the buffer when measuring
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR,
/*.size = */ MEASURE_MAX_SIZE,
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ true,
/*.parse_seq = */ {0},
/*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {0},
#endif
};
ggml_allocr_reset(alloc);
return alloc;
}
void ggml_allocr_free(struct ggml_allocr * alloc) {
free(alloc);
}
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
return alloc->measure;
}
//////////// compute graph allocator
static bool ggml_is_view(struct ggml_tensor * t) {
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
}
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
if (a->type != b->type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (a->ne[i] != b->ne[i]) {
return false;
}
if (a->nb[i] != b->nb[i]) {
return false;
}
}
return true;
}
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
switch (t->op) {
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_VIEW:
return t->src[0];
case GGML_OP_CPY:
return t->src[1];
default:
return NULL;
}
}
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
struct ggml_tensor * parent = t;
do {
parent = get_view_parent(parent);
} while (ggml_is_view(parent));
return parent;
}
static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
case GGML_OP_DIAG_MASK_ZERO:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
case GGML_OP_UNARY:
case GGML_OP_ROPE:
case GGML_OP_RMS_NORM:
case GGML_OP_SET:
case GGML_OP_SOFT_MAX:
case GGML_OP_CONT:
case GGML_OP_ADD_REL_POS:
return true;
default:
return false;
}
}
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (ggml_is_view(node)) {
size_t offset;
switch(node->op) {
case GGML_OP_VIEW:
memcpy(&offset, node->op_params, sizeof(size_t));
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else {
// see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
// if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data ||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
return;
}
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
return;
}
}
}
}
ggml_allocr_alloc(alloc, node);
}
}
}
static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
// reset hash table
struct hash_node * ht = alloc->hash_table;
memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
// count number of children and views
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) {
struct ggml_tensor * view_src = get_view_source(node);
hash_get(ht, view_src)->n_views += 1;
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
hash_get(ht, parent)->n_children += 1;
}
}
}
// allocate tensors
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
// graph inputs are allocated first to ensure that they are not overwritten by each other
if (inputs != NULL && inputs[g] != NULL) {
for (int i = 0; inputs[g][i] != NULL; i++) {
struct ggml_tensor * input = inputs[g][i];
AT_PRINTF("input: %s\n", input->name);
allocate_node(alloc, input);
}
}
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
int last_barrier_pos = 0;
int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
for (int ind = 0; ind < n_nodes; ind++) {
// allocate a node if there is no parse_seq or this is not a barrier
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs)
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
allocate_node(alloc, parent);
}
// allocate node
allocate_node(alloc, node);
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
AT_PRINTF("%s", parent->name);
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
AT_PRINTF(", ");
}
}
AT_PRINTF("\n");
}
// update parents
// update immediately if there is no parse_seq
// update only at barriers if there is parse_seq
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) {
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
struct ggml_tensor * node = gf->nodes[node_i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
p_hn->n_children -= 1;
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent);
}
}
}
}
}
AT_PRINTF("\n");
if (alloc->parse_seq_len) {
last_barrier_pos = ind + 1;
}
}
}
// free graph outputs here that wouldn't be freed otherwise because they have no children
if (outputs != NULL && outputs[g] != NULL) {
for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output);
}
}
}
return alloc->max_size;
}
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}
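// A minimal usage sketch of this allocator (hypothetical helper, assuming a
// ggml_cgraph * gf already built): measure once to size the buffer, then
// allocate for real. In practice the graph is rebuilt between the two passes,
// and callers typically pad the measured size by one extra alignment.
static void example_alloc_graph(struct ggml_cgraph * gf) {
    const size_t alignment = 32;
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    const size_t buf_size = ggml_allocr_alloc_graph(measure, gf) + alignment;
    ggml_allocr_free(measure);
    void * buf = malloc(buf_size);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, buf_size, alignment);
    ggml_allocr_alloc_graph(alloc, gf); // assigns tensor->data for every node
    // ... compute the graph here, then release everything
    ggml_allocr_free(alloc);
    free(buf);
}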

ggml-alloc.h (new file, 26 lines)

@ -0,0 +1,26 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph is optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large.


@ -2,34 +2,44 @@
#include "ggml.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_init_cublas(void);
GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
// TODO: export these with GGML_API
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
GGML_API void ggml_cuda_free_scratch(void);
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API int ggml_cuda_get_device_count(void);
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
#ifdef __cplusplus
}


@ -24,6 +24,7 @@
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16
#define GGML_METAL_MAX_COMMAND_BUFFERS 32
struct ggml_tensor;
struct ggml_cgraph;
@ -34,9 +35,16 @@ extern "C" {
struct ggml_metal_context;
struct ggml_metal_context * ggml_metal_init(void);
// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
void * ggml_metal_host_malloc(size_t n);
void ggml_metal_host_free (void * data);
// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
@ -57,6 +65,16 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
// if the graph has been optimized for concurrent dispatch, return the length of the concur_list
int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
// output the concur_list for ggml_alloc
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
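A rough end-to-end sketch of this API (not part of the diff): ggml_metal_add_buffer is the mapping function the comment above refers to, but it is elided from this hunk, so its signature below is an assumption; data, size, max_size, gf, and result are placeholders.

struct ggml_metal_context * ctx = ggml_metal_init(1);        // one command buffer
ggml_metal_add_buffer(ctx, "weights", data, size, max_size); // map every host buffer before computing (assumed signature)
ggml_metal_graph_find_concurrency(ctx, gf, true);            // optional; re-run if the graph topology changes
ggml_metal_graph_compute(ctx, gf);                           // dispatch the graph on the GPU
ggml_metal_get_tensor(ctx, result);                          // copy the output back into host memory
ggml_metal_free(ctx);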

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
\n#if K_QUANTS_PER_ITERATION == 1\n
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
#else
\n#else\n
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
#endif
\n#endif\n
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const float d = vload_half(0, &x[i].d);
#if K_QUANTS_PER_ITERATION == 1
\n#if K_QUANTS_PER_ITERATION == 1\n
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum;
#else
\n#else\n
float sum = 0;
for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
tmp[16 * ix + tid] += sum;
#endif
\n#endif\n
}
@@ -1330,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
return;
}
cl_mem mem = (cl_mem)tensor->data;
cl_mem mem = (cl_mem)tensor->extra;
clReleaseMemObject(mem);
}
@@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
@@ -1389,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
@@ -1487,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1563,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
@@ -1693,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->data;
d_Q = (cl_mem) src0->extra;
} else {
GGML_ASSERT(false);
}
@@ -1856,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));
tensor->data = dst;
tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}

7335
ggml.c

File diff suppressed because it is too large

653
ggml.h

File diff suppressed because it is too large

57
grammars/assistant.gbnf Normal file

@@ -0,0 +1,57 @@
# - "turn on lights."
# - "set thermostat to 22."
# - "increase TV by 10."
# - "decrease oven by 50."
# - "play music."
# - "stop podcast."
# - "schedule cleaning at 3pm."
# - "cancel cleaning."
# - "remind me to buy milk at 5pm."
# - "show me security system."
# - "hide washing machine."
# - "what is the lights status?"
# - "what is the current thermostat value?"
# - "what is the security system status?"
# - "what is the door lock status?"
# - "what is the camera battery level?"
# - "what is the weather like today?"
# - "what is the forecast for tomorrow?"
# - "what is the time?"
# - "what is my schedule for today?"
# - "what tasks do I have?"
# - "what reminders do I have?"
#
# example:
#
# ./command -m ./models/ggml-tiny.en.bin -t 8 --grammar ./grammars/assistant.gbnf --prompt "Ok Whisper, start listening for commands." --context "Whisper is a home assistant. It recognizes voice commands. Time is 11pm." --grammar-penalty 10
#
root ::= init " " (command | question) "."
prompt ::= init
# leading space is very important!
init ::= " Ok Whisper, start listening for commands."
command ::= "Turn " ("on" | "off") " " device | "Set " device " to " value |
"Increase " device " by " value | "Decrease " device " by " value |
"Play " media | "Stop " media | "Schedule " task " at " time | "Cancel " task |
"Remind me to " task " at " time | "Show me " device | "Hide " device
question ::= "What is the " device " status?" | "What is the current " device " value?" |
"What is the " device " temperature?" | "What is the " device " humidity?" |
"What is the " device " power consumption?" | "What is the " device " battery level?" |
"What is the weather like today?" | "What is the forecast for tomorrow?" |
"What is the time?" | "What is my schedule for today?" | "What tasks do I have?" |
"What reminders do I have?"
device ::= "lights" | "thermostat" | "security system" | "door lock" | "camera" | "speaker" | "TV" |
"music player" | "coffee machine" | "oven" | "refrigerator" | "washing machine" |
"vacuum cleaner"
value ::= [0-9]+
media ::= "music" | "radio" | "podcast" | "audiobook" | "TV show" | "movie"
task ::= [a-zA-Z]+ (" " [a-zA-Z]+)?
time ::= [0-9] [0-9]? ("am" | "pm")?
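For reference (not part of the file), two strings the grammar above derives; root supplies the separating space and the final period:

#   " Ok Whisper, start listening for commands. Turn on lights."
#   " Ok Whisper, start listening for commands. Set thermostat to 22."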

29
grammars/chess.gbnf Normal file

@@ -0,0 +1,29 @@
# - bishop to c3
# - rook to d4
# - knight to e5
# - d4 d5 knight to c3
# - c3 queen to d4 king b1
# - pawn to a1 bishop to b2 knight to c3
#
# The prompt (--prompt) is the initial phrase that the user has to say.
# This is used to prime Whisper with how the user is expected to speak.
#
# Provide long context (--context) with sample moves to help Whisper decode the correct sequence.
# Longer context is better, but it slightly increases the processing time.
#
# example:
#
# ./command -m ./models/ggml-tiny.en.bin -t 8 --grammar ./grammars/chess.gbnf --prompt "rook to b4, f3," --context "d4 d5 knight to c3, pawn to a1, bishop to b2 king e8," --grammar-penalty 100
#
root ::= init move move? move? "."
prompt ::= init "."
# leading space is very important!
init ::= " rook to b4, f3"
move ::= ", " ((piece | pawn | king) " " "to "?)? [a-h] [1-8]
piece ::= "bishop" | "rook" | "knight" | "queen"
king ::= "king"
pawn ::= "pawn"
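For reference (not part of the file), two strings the grammar above derives; within each move the piece name and the "to" are optional, and root allows one to three moves:

#   " rook to b4, f3, knight to c3."
#   " rook to b4, f3, d4, queen to d4."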

16
grammars/colors.gbnf Normal file

@@ -0,0 +1,16 @@
# - red
# - green
# - blue
#
# example:
#
# ./command -m ./models/ggml-tiny.en.bin -t 8 --grammar ./grammars/colors.gbnf --prompt "red, green, blue," --context "green, red, blue,"
#
root ::= init color "."
prompt ::= init "."
# leading space is very important!
init ::= " red, green, blue"
color ::= ", " ("red" | "green" | "blue")
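For reference (not part of the file): every accepted utterance is init plus exactly one color, e.g. " red, green, blue, red."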


@@ -7,6 +7,7 @@ from torch import Tensor
from torch import nn
from typing import Dict
from typing import Optional
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
from coremltools.models.neural_network.quantization_utils import quantize_weights
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
from whisper import load_model
@@ -31,12 +32,12 @@ def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
return state_dict
class LayerNorm(nn.LayerNorm):
def forward(self, x: Tensor) -> Tensor:
x = x.transpose(1,3)
x = super().forward(x)
x = x.transpose(1,3)
return x
class LayerNormANE(LayerNormANEBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._register_load_state_dict_pre_hook(
correct_for_bias_scale_order_inversion)
class MultiHeadAttentionANE(MultiHeadAttention):
def __init__(self, n_state: int, n_head: int):
@@ -103,9 +104,9 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
super().__init__(n_state, n_head, cross_attention)
self.attn = MultiHeadAttentionANE(n_state, n_head)
self.attn_ln = LayerNorm(n_state)
self.attn_ln = LayerNormANE(n_state)
self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None
n_mlp = n_state * 4
self.mlp = nn.Sequential(
@@ -113,7 +114,7 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
nn.GELU(),
nn.Conv2d(n_mlp, n_state, kernel_size=1)
)
self.mlp_ln = LayerNorm(n_state)
self.mlp_ln = LayerNormANE(n_state)
class AudioEncoderANE(AudioEncoder):
@@ -123,7 +124,7 @@ class AudioEncoderANE(AudioEncoder):
self.blocks = nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
)
self.ln_post = LayerNorm(n_state)
self.ln_post = LayerNormANE(n_state)
def forward(self, x: Tensor):
"""
@@ -167,7 +168,7 @@ class TextDecoderANE(TextDecoder):
self.blocks= nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
)
self.ln= LayerNorm(n_state)
self.ln= LayerNormANE(n_state)
def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
"""


@@ -8,7 +8,7 @@
wd=$(dirname "$0")
cd "$wd/../"
python3 models/convert-whisper-to-coreml.py --model tiny.en --optimize-ane True
python3 models/convert-whisper-to-coreml.py --model tiny.en
mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/


@@ -13,7 +13,7 @@ mname="$1"
wd=$(dirname "$0")
cd "$wd/../"
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True --optimize-ane True
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
rm -rf models/ggml-${mname}-encoder.mlmodelc

File diff suppressed because it is too large


@@ -67,6 +67,7 @@ extern "C" {
struct whisper_context;
struct whisper_state;
struct whisper_full_params;
typedef int whisper_token;
@@ -95,6 +96,37 @@ extern "C" {
void (*close)(void * ctx);
} whisper_model_loader;
// grammar element type
enum whisper_gretype {
// end of rule definition
WHISPER_GRETYPE_END = 0,
// start of alternate definition for rule
WHISPER_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
WHISPER_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
WHISPER_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
WHISPER_GRETYPE_CHAR_NOT = 4,
// modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding WHISPER_GRETYPE_CHAR or
// WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
WHISPER_GRETYPE_CHAR_ALT = 6,
};
typedef struct whisper_grammar_element {
enum whisper_gretype type;
uint32_t value; // Unicode code point or rule ID
} whisper_grammar_element;
// Various functions for loading a ggml whisper model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
@@ -345,7 +377,7 @@ extern "C" {
void * user_data);
// Parameters for the whisper_full() function
// If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params()
struct whisper_full_params {
enum whisper_sampling_strategy strategy;
@@ -357,6 +389,7 @@ extern "C" {
bool translate;
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
bool no_timestamps; // do not generate timestamps
bool single_segment; // force single segment output (useful for streaming)
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
bool print_progress; // print progress information
@@ -374,6 +407,7 @@ extern "C" {
// [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
bool debug_mode; // enable debug mode: provides extra info (e.g. dump log_mel)
int audio_ctx; // overwrite the audio context size (0 = use default)
// [EXPERIMENTAL] [TDRZ] tinydiarize
@@ -429,6 +463,11 @@ extern "C" {
// called by each decoder to filter obtained logits
whisper_logits_filter_callback logits_filter_callback;
void * logits_filter_callback_user_data;
const whisper_grammar_element ** grammar_rules;
size_t n_grammar_rules;
size_t i_start_rule;
float grammar_penalty;
};
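Not part of the diff: a minimal sketch of wiring these fields up, assuming the grammar encoding follows llama.cpp (each rule is a flat array of elements terminated by WHISPER_GRETYPE_END, with characters stored as Unicode code points in value):

// rule 0:  root ::= "hi" [a-z]
static const whisper_grammar_element root_rule[] = {
    { WHISPER_GRETYPE_CHAR,           'h' },
    { WHISPER_GRETYPE_CHAR,           'i' },
    { WHISPER_GRETYPE_CHAR,           'a' }, // lower bound of [a-z] ...
    { WHISPER_GRETYPE_CHAR_RNG_UPPER, 'z' }, // ... upper bound, inclusive
    { WHISPER_GRETYPE_END,            0   },
};
static const whisper_grammar_element * rules[] = { root_rule };

struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
params.grammar_rules   = rules;
params.n_grammar_rules = 1;      // number of entries in rules
params.i_start_rule    = 0;      // decoding starts from rule 0 (root)
params.grammar_penalty = 100.0f; // assumption: logit penalty applied to tokens that break the grammar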
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
@@ -517,6 +556,11 @@ extern "C" {
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
// Control logging output; default behavior is to print to stderr
typedef void (*whisper_log_callback)(const char * line);
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
#ifdef __cplusplus
}
#endif
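Not part of the diff: a minimal sketch of the logging hook declared above, assuming the callback receives complete, newline-terminated lines:

#include <stdio.h>

static void my_whisper_log(const char * line) {
    fprintf(stderr, "[whisper] %s", line); // forward each log line to stderr with a prefix
}

// somewhere during initialization:
//     whisper_set_log_callback(my_whisper_log);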