release : v1.7.5

bench : update numbers [no ci] (#2993 )
sync : ggml
2025-06-24 09:10:57 +00:00 · 2025-04-02 16:39:48 +03:00 · 2025-04-02 16:27:36 +03:00 · 2025-04-02 15:51:57 +03:00 · 2025-04-02 15:51:57 +03:00 · 2025-04-02 15:51:57 +03:00
197 changed files with 14480 additions and 7562 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -6,17 +6,81 @@ on:
      - master
  pull_request:
    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+      pre_release_tag:
+        description: 'Pre-release tag name'
+        required: false
+        type: string
+      run_type:
+        description: 'Workflow type to run'
+        required: true
+        type: choice
+        options:
+          - full-ci
+          - release-only

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

+permissions:
+  contents: write  # for creating release
+
 env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  ubuntu_image: "ubuntu:22.04"
  VCPKG_BINARY_SOURCES: "clear;x-gha,readwrite"

 jobs:
+  determine-tag:
+    runs-on: ubuntu-latest
+    outputs:
+      tag_name: ${{ steps.tag.outputs.name }}
+
+    steps:
+      - name: Checkout with full history
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER=$(git rev-list --count HEAD)
+          SHORT_HASH=$(git rev-parse --short=7 HEAD)
+          CUSTOM_TAG="${{ github.event.inputs.pre_release_tag }}"
+
+          echo "Raw values:"
+          echo "BUILD_NUMBER: $BUILD_NUMBER"
+          echo "SHORT_HASH: $SHORT_HASH"
+          echo "BRANCH_NAME: ${{ env.BRANCH_NAME }}"
+          echo "CUSTOM_TAG: $CUSTOM_TAG"
+
+          # Use custom tag if provided
+          if [[ -n "$CUSTOM_TAG" ]]; then
+            echo "Using custom tag"
+            TAG_NAME="${CUSTOM_TAG}"
+          elif [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "Using master branch format"
+            TAG_NAME="b${BUILD_NUMBER}"
+          else
+            echo "Using non-master branch format"
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            TAG_NAME="${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}"
+          fi
+
+          echo "Final tag name: $TAG_NAME"
+          echo "name=$TAG_NAME" >> $GITHUB_OUTPUT
+
  ubuntu-22:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -43,6 +107,8 @@ jobs:
            cmake --build build --config Release -j $(nproc)'

  ubuntu-22-arm64:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -69,6 +135,8 @@ jobs:
            cmake --build build --config Release -j $(nproc)'

  ubuntu-22-arm-v7:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -95,6 +163,8 @@ jobs:
            cmake --build build --config Release -j $(nproc)'

  macOS-latest:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: macOS-latest

    strategy:
@ -129,31 +199,28 @@ jobs:
            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh

+  freeBSD-latest:
+    runs-on: macos-13

-#  freeBSD-latest:
-#    runs-on: macos-12
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Build
-#        uses: cross-platform-actions/action@v0.24.0
-#        with:
-#          operating_system: freebsd
-#          version: '13.3'
-#          run: |
-#            sudo pkg update
-#            sudo pkg install -y gmake sdl2 cmake
-#            cmake -B build
-#            cmake --build build --config Release
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: Build
+        uses: cross-platform-actions/action@v0.27.0
+        with:
+          operating_system: freebsd
+          version: '14.2'
+          run: |
+            sudo pkg update
+            sudo pkg install -y gmake sdl2 cmake git
+            cmake -B build
+            cmake --build build --config Release

  ubuntu-22-gcc:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -182,6 +249,8 @@ jobs:
            ctest -L gh --output-on-failure'

  ubuntu-22-gcc-arm64:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -210,6 +279,8 @@ jobs:
            ctest -L gh --output-on-failure'

  ubuntu-22-gcc-arm-v7:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -238,6 +309,8 @@ jobs:
            ctest -L gh --output-on-failure'

  ubuntu-22-clang:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -269,6 +342,8 @@ jobs:
            ctest -L gh --output-on-failure'

  ubuntu-22-gcc-sanitized:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -292,11 +367,15 @@ jobs:
            set -e
            apt update
            apt install -y build-essential cmake git
-            cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
+            cmake . -DCMAKE_BUILD_TYPE=Debug \
+              -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON \
+              -DGGML_OPENMP=OFF
            make
            ctest -L gh --output-on-failure'

  ubuntu-22-cmake-sycl:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -347,6 +426,8 @@ jobs:
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -397,6 +478,8 @@ jobs:
          cmake --build . --config Release -j $(nproc)

  windows-msys2:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: windows-latest

    strategy:
@ -441,6 +524,8 @@ jobs:
            cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: windows-latest

    strategy:
@ -501,6 +586,8 @@ jobs:
          path: build/bin/${{ matrix.build }}

  windows-blas:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: windows-latest

    strategy:
@ -574,6 +661,8 @@ jobs:
          path: build/bin/${{ matrix.build }}

  windows-cublas:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: windows-2019
    strategy:
      matrix:
@ -590,15 +679,134 @@ jobs:
      - name: Clone repository
        uses: actions/checkout@v4

+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ${{ github.job }}-${{ matrix.cuda-toolkit }}-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 5d
+
+      - name: Install Cuda Toolkit 11.8.0
+        if: ${{ matrix.cuda-toolkit == '11.8.0' }}
+        run: |
+          $CUDA_VERSION = ${{ matrix.cuda-toolkit }}
+          $CUDA_TOOLKIT_DIR = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_VERSION"
+          $CUDA_DOWNLOAD = "https://developer.download.nvidia.com/compute/cuda/redist"
+
+          # Components versions
+          $CUDART_VER = "11.8.89"
+          $NVCC_VER   = "11.8.89"
+          $NVRTC_VER  = "11.8.89"
+          $CUBLAS_VER = "11.8.1.74"
+          $NVTX_VER   = "11.8.86"
+          $VS_VER     = "11.8.86"
+          $NVPROF_VER = "11.8.87"
+          $CCCL_VER   = "11.8.89"
+
+          # Create the directory where the CUDA Toolkit will be installed
+          mkdir -p $CUDA_TOOLKIT_DIR
+
+          # Install unzip to extract the downloaded files
+          choco install unzip -y
+
+          # Download all the required components
+          curl -O "$CUDA_DOWNLOAD/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-${CUDART_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-${NVCC_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/libcublas/windows-x86_64/libcublas-windows-x86_64-${CUBLAS_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-${NVTX_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-${VS_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-${CCCL_VER}-archive.zip"
+
+          # Extract all the downloaded files to the CUDA Toolkit directory
+          unzip '*.zip' -d $CUDA_TOOLKIT_DIR
+
+          # Copy all the extracted files to the main CUDA Toolkit directory
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_cudart-windows-x86_64-${CUDART_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvcc-windows-x86_64-${NVCC_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\libcublas-windows-x86_64-${CUBLAS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvtx-windows-x86_64-${NVTX_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_cccl-windows-x86_64-${CCCL_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+
+          # Visual Studio integration
+          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y
+
+          # Set environment variables
+          echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "$CUDA_TOOLKIT_DIR\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V11_8=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+        
+      - name: Install Cuda Toolkit 12.2.0
+        if: ${{ matrix.cuda-toolkit == '12.2.0' }}
+        run: |
+          $CUDA_VERSION = ${{ matrix.cuda-toolkit }}
+          $CUDA_TOOLKIT_DIR = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_VERSION"
+          $CUDA_DOWNLOAD = "https://developer.download.nvidia.com/compute/cuda/redist"
+
+          # Components versions
+          $CUDART_VER   = "12.2.140"
+          $NVCC_VER     = "12.2.140"
+          $NVRTC_VER    = "12.2.140"
+          $CUBLAS_VER   = "12.2.5.6"
+          $NVTX_VER     = "12.2.140"
+          $PROFILER_VER = "12.2.140"
+          $VS_VER       = "12.2.140"
+          $NVPROF_VER   = "12.2.142"
+          $CCCL_VER     = "12.2.140"
+
+          # Create the directory where the CUDA Toolkit will be installed
+          mkdir -p $CUDA_TOOLKIT_DIR
+
+          # Install unzip to extract the downloaded files
+          choco install unzip -y
+
+          # Download all the required components
+          curl -O "$CUDA_DOWNLOAD/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-${CUDART_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-${NVCC_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/libcublas/windows-x86_64/libcublas-windows-x86_64-${CUBLAS_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-${NVTX_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-${PROFILER_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-${VS_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive.zip"
+          curl -O "$CUDA_DOWNLOAD/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-${CCCL_VER}-archive.zip"
+
+          # Extract all the downloaded files to the CUDA Toolkit directory
+          unzip -q '*.zip' -d $CUDA_TOOLKIT_DIR
+
+          # Copy all the extracted files to the main CUDA Toolkit directory
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_cudart-windows-x86_64-${CUDART_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvcc-windows-x86_64-${NVCC_VER}-archive\*"     "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive\*"   "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\libcublas-windows-x86_64-${CUBLAS_VER}-archive\*"   "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvtx-windows-x86_64-${NVTX_VER}-archive\*"     "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_cccl-windows-x86_64-${CCCL_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\cuda_profiler_api-windows-x86_64-${PROFILER_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
+
+          # Visual Studio integration
+          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y
+
+          # Set environment variables
+          echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "$CUDA_TOOLKIT_DIR\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V12_2=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
      - name: Add msbuild to PATH
        uses: microsoft/setup-msbuild@v2

-      - name: Install CUDA Toolkit
-        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.15
-        with:
-          cuda: '${{ matrix.cuda-toolkit }}'
-
      - name: Install 7-Zip
        run: choco install 7zip -y

@ -610,25 +818,30 @@ jobs:
          echo "SDL2_DIR=${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" > SDL2_PATH.txt

-      - name: Configure CMake
-        shell: cmd
-        run: |
-          cmake -S . -B ./build -A ${{ matrix.arch }} ^
-            -DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
-            -DGGML_CUDA=${{ matrix.cublas }} ^
-            -DCMAKE_CUDA_ARCHITECTURES=all  ^
-            -DWHISPER_SDL2=${{ matrix.sdl2 }} ^
-            -DSDL2_DIR="%SDL2_DIR%"
+      - name: Install cmake
+        run: choco install cmake

      - name: Build Project
        shell: cmd
        run: |
-          cd ./build
-          cmake --build . --config ${{ matrix.build }}
+          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake --version
+          where cmake
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
+            -DGGML_CUDA=${{ matrix.cublas }} ^
+            -DWHISPER_SDL2=${{ matrix.sdl2 }} ^
+            -DSDL2_DIR="%SDL2_DIR%"
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config ${{ matrix.build }} -j %NUMBER_OF_PROCESSORS%
+
+      - name: Check sccache status after build
+        run: |
+          sccache --show-stats

      - name: Copy CUDA DLLs
        run: |
-          Get-ChildItem "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/" -Filter "*.dll" |
+          Get-ChildItem "$env:CUDA_PATH\bin\" -Filter "*.dll" |
          Copy-Item -Destination "build/bin/${{ matrix.build }}"

      - name: Copy SDL2.dll
@ -642,6 +855,8 @@ jobs:
          path: build/bin/${{ matrix.build }}

  emscripten:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    strategy:
@ -665,6 +880,7 @@ jobs:

  ios-xcode-build:
    runs-on: macos-latest
+    needs: determine-tag

    strategy:
      matrix:
@ -707,7 +923,26 @@ jobs:
      - name: Build swiftui example
        run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+                github.event.inputs.create_release == 'true' ||
+                github.event.inputs.pre_release_tag != '' }}
+        run: |
+          zip --symlinks -r whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip build-apple/whisper.xcframework
+
+      - name: Upload artifacts
+        if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+                github.event.inputs.create_release == 'true' ||
+                github.event.inputs.pre_release_tag != '' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip
+          name: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework
+
  android:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    steps:
@ -736,31 +971,30 @@ jobs:
          cd whisper/examples/whisper.android
          ./gradlew assembleRelease --no-daemon

-# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
-#  android_java:
-#    runs-on: ubuntu-22.04
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: set up JDK 11
-#        uses: actions/setup-java@v4
-#        with:
-#          java-version: '11'
-#          distribution: 'temurin'
-#          cache: gradle
-#
-#      - name: Setup Android SDK
-#        uses: android-actions/setup-android@v3
-#        with:
-#          cmdline-tools-version: 9.0
-#
-#      - name: Build
-#        run: |
-#          cd examples/whisper.android.java
-#          chmod +x ./gradlew
-#          ./gradlew assembleRelease
+  android_java:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'temurin'
+          cache: gradle
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@v3
+        with:
+          cmdline-tools-version: 9.0
+
+      - name: Build
+        run: |
+          cd examples/whisper.android.java
+          chmod +x ./gradlew
+          ./gradlew assembleRelease

 # TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
 #  java:
@ -807,6 +1041,8 @@ jobs:
 #          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}

  quantize:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
    runs-on: ubuntu-22.04

    steps:
@ -819,3 +1055,95 @@ jobs:
          cmake -B build
          cmake --build build --config Release
          ./build/bin/quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
+
+  release:
+    if: ${{ github.event.inputs.create_release == 'true' || github.event.inputs.pre_release_tag != '' }}
+
+    runs-on: ubuntu-latest
+
+    needs:
+      - determine-tag
+      - ios-xcode-build
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: release
+          evict-old-files: 1d
+
+      # Downloads all the artifacts from the previous jobs
+      - name: Download artifacts
+        id: download-artifact
+        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
+
+      - name: Create release
+        id: create_release
+        uses: ggml-org/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ needs.determine-tag.outputs.tag_name }}
+          prerelease: ${{ github.event.inputs.pre_release_tag != '' }}
+
+      - name: Upload release
+        id: upload_release
+        uses: actions/github-script@v3
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const path = require('path');
+            const fs = require('fs');
+            const release_id = '${{ steps.create_release.outputs.id }}';
+            for (let file of await fs.readdirSync('./artifact/release')) {
+              if (path.extname(file) === '.zip') {
+                console.log('uploadReleaseAsset', file);
+                await github.repos.uploadReleaseAsset({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  release_id: release_id,
+                  name: file,
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
+                });
+              }
+            }
+
+  coreml-base-en:
+    if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+            github.event.inputs.create_release == 'true' ||
+            github.event.inputs.pre_release_tag != '' }}
+    runs-on: macos-latest
+    needs: determine-tag
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set environment variables
+        id: set_vars
+        run: |
+          echo "MODEL_NAME=base.en" >> $GITHUB_ENV
+          echo "GEN_MODEL_NAME=whisper-${{ needs.determine-tag.outputs.tag_name }}-ggml-base.en-encoder.mlmodelc" >> $GITHUB_ENV
+
+      - name: Download model
+        run: |
+          ./models/download-ggml-model.sh ${{ env.MODEL_NAME }}
+
+      - name: Generate CoreML model
+        run: |
+          python3.11 -m venv venv
+          source venv/bin/activate
+          pip install ane_transformers openai-whisper coremltools
+          ./models/generate-coreml-model.sh ${{ env.MODEL_NAME }}
--- a/.github/workflows/examples-wasm.yml
+++ b/.github/workflows/examples-wasm.yml
@ -0,0 +1,91 @@
+name: Examples WASM
+on:
+  push:
+    branches: ["master"]
+
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  deploy-wasm-github-pages:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Pages
+        uses: actions/configure-pages@v4
+
+      - name: Setup emsdk
+        uses: mymindstorm/setup-emsdk@v14
+
+      - name: Build WASM Examples
+        # Enable for real build later in whisper.cpp
+        run: |
+          mkdir -p build-em && cd build-em
+          emcmake cmake .. -DCMAKE_BUILD_TYPE=Release
+          make -j
+
+      - name: Create staging directory
+        run: mkdir -p staging
+
+      - name: Create .nojekyll file in staging directory
+        run: touch staging/.nojekyll
+
+      - name: Copy application files
+        run: |
+          build_dir=build-em/bin
+
+          ls ${build_dir}
+
+          # command.wasm
+          target_dir=staging/command.wasm
+          mkdir -p ${target_dir}
+          cp ${build_dir}/command.wasm/{index.html,command.js,helpers.js} ${target_dir}
+          cp ${build_dir}/libcommand.js ${target_dir}
+
+          # bench.wasm
+          target_dir=staging/bench.wasm
+          mkdir -p ${target_dir}
+          cp ${build_dir}/bench.wasm/{index.html,bench.js,helpers.js} ${target_dir}
+          cp ${build_dir}/libbench.js ${target_dir}
+
+          # stream.wasm
+          target_dir=staging/stream.wasm
+          mkdir -p ${target_dir}
+          cp ${build_dir}/stream.wasm/{index.html,stream.js,helpers.js} ${target_dir}
+          cp ${build_dir}/libstream.js ${target_dir}
+
+          # whisper.wasm (this will be the main example page)
+          target_dir=staging
+          mkdir -p ${target_dir}
+          cp ${build_dir}/whisper.wasm/{index.html,main.js,helpers.js} ${target_dir}
+          cp ${build_dir}/libmain.js ${target_dir}
+
+          # Copy Cross-Origin Isolation service worker
+          cp -v examples/coi-serviceworker.js staging/
+
+      - name: List files in staging directory (for debugging)
+        run: |
+          echo "Files in staging directory:"
+          find staging -type f | sort
+
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: ./staging
+
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
 project("whisper.cpp" C CXX)
-project("whisper.cpp" VERSION 1.7.4)
+project("whisper.cpp" VERSION 1.7.5)
 include(CheckIncludeFileCXX)

 set(SOVERSION 1)
@ -38,8 +38,13 @@ if (EMSCRIPTEN)

    # TODO: without these, we get the following error:
    #       wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
-    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s TOTAL_STACK=5242880")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s TOTAL_STACK=5242880")
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated")
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -62,7 +67,8 @@ option(WHISPER_ALL_WARNINGS           "whisper: enable all compiler warnings"
 option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)

 # build
-option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
+option(WHISPER_FATAL_WARNINGS  "whisper: enable -Werror flag"               OFF)
+option(WHISPER_USE_SYSTEM_GGML "whisper: use system-installed GGML library" OFF)

 # sanitizers
 option(WHISPER_SANITIZE_THREAD    "whisper: enable thread sanitizer"    OFF)
@ -121,7 +127,15 @@ whisper_option_depr(WARNING     WHISPER_SYCL_F16            GGML_SYCL_F16)
 #

 if (NOT TARGET ggml)
-    add_subdirectory(ggml)
+    if (WHISPER_USE_SYSTEM_GGML)
+        find_package(ggml REQUIRED)
+        if (NOT ggml_FOUND)
+            message(FATAL_ERROR "System-installed GGML library not found.")
+        endif()
+        add_library(ggml ALIAS ggml::ggml)
+    else()
+        add_subdirectory(ggml)
+    endif()
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
 add_subdirectory(src)
@ -176,8 +190,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
 #

 if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    #include(CTest)
-    #add_subdirectory(tests)
+    include(CTest)
+    add_subdirectory(tests)
 endif ()

 if (WHISPER_BUILD_EXAMPLES)
--- a/README.md
+++ b/README.md
@ -10,7 +10,7 @@
 > [!NOTE]
 > New maintenance roadmap: https://github.com/ggerganov/whisper.cpp/discussions/2788

-Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.7.5](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.5) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -184,11 +184,11 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
  ```

  - To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
-  - Python 3.10 is recommended.
+  - Python 3.11 is recommended.
  - MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
-    - To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
-    - To activate the environment, use: `conda activate py310-whisper`
+    - To create an environment, use: `conda create -n py311-whisper python=3.11 -y`
+    - To activate the environment, use: `conda activate py311-whisper`

 - Generate a Core ML model. For example, to generate a `base.en` model, use:

@ -427,7 +427,8 @@ For detailed instructions on how to use Conan, please refer to the [Conan docume

 This is a naive example of performing real-time inference on audio from your microphone.
 The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
-More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
+More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10). 
+You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly. 

 ```bash
 cmake -B build -DWHISPER_SDL2=ON
--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -11,11 +11,11 @@ UNAME_M := $(shell uname -m)
 endif

 GGML_METAL_PATH_RESOURCES := $(abspath ../..)
-BUILD_DIR := build
+BUILD_DIR := build_go
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
 INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
-LIBRARY_PATH := $(abspath ../..)
+LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src:$(abspath ../../${BUILD_DIR}/ggml/src))

 ifeq ($(GGML_CUDA),1)
 	LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
@ -29,8 +29,10 @@ endif
 all: clean whisper examples

 whisper: mkdir
-	@echo Build whisper
-	@${MAKE} -C ../.. libwhisper.a
+	cmake -S ../.. -B ../../${BUILD_DIR} \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DBUILD_SHARED_LIBS=OFF
+	cmake --build ../../${BUILD_DIR} --target whisper

 test: model-small whisper modtidy
 ifeq ($(UNAME_S),Darwin)
--- a/bindings/go/README.md
+++ b/bindings/go/README.md
@ -31,7 +31,7 @@ func main() {
 	if err != nil {
 		panic(err)
 	}
-	if err := context.Process(samples, nil, nil); err != nil {
+	if err := context.Process(samples, nil, nil, nil); err != nil {
 		return err
 	}

--- a/bindings/go/examples/go-whisper/process.go
+++ b/bindings/go/examples/go-whisper/process.go
@ -67,7 +67,7 @@ func Process(model whisper.Model, path string, flags *Flags) error {
 	// Process the data
 	fmt.Fprintf(flags.Output(), "  ...processing %q\n", path)
 	context.ResetTimings()
-	if err := context.Process(data, cb, nil); err != nil {
+	if err := context.Process(data, nil, cb, nil); err != nil {
 		return err
 	}

--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -71,6 +71,10 @@ func (context *context) Language() string {
 	return whisper.Whisper_lang_str(context.params.Language())
 }

+func (context *context) DetectedLanguage() string {
+	return whisper.Whisper_lang_str(context.model.ctx.Whisper_full_lang_id())
+}
+
 // Set translate flag
 func (context *context) SetTranslate(v bool) {
 	context.params.SetTranslate(v)
@ -189,6 +193,7 @@ func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]f
 // Process new sample data and return any errors
 func (context *context) Process(
 	data []float32,
+	callEncoderBegin EncoderBeginCallback,
 	callNewSegment SegmentCallback,
 	callProgress ProgressCallback,
 ) error {
@ -203,7 +208,20 @@ func (context *context) Process(
 	// We don't do parallel processing at the moment
 	processors := 0
 	if processors > 1 {
-		if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
+		if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, callEncoderBegin,
+			func(new int) {
+				if callNewSegment != nil {
+					num_segments := context.model.ctx.Whisper_full_n_segments()
+					s0 := num_segments - new
+					for i := s0; i < num_segments; i++ {
+						callNewSegment(toSegment(context.model.ctx, i))
+					}
+				}
+			}); err != nil {
+			return err
+		}
+	} else if err := context.model.ctx.Whisper_full(context.params, data, callEncoderBegin,
+		func(new int) {
 			if callNewSegment != nil {
 				num_segments := context.model.ctx.Whisper_full_n_segments()
 				s0 := num_segments - new
@ -211,22 +229,11 @@ func (context *context) Process(
 					callNewSegment(toSegment(context.model.ctx, i))
 				}
 			}
-		}); err != nil {
-			return err
-		}
-	} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
-		if callNewSegment != nil {
-			num_segments := context.model.ctx.Whisper_full_n_segments()
-			s0 := num_segments - new
-			for i := s0; i < num_segments; i++ {
-				callNewSegment(toSegment(context.model.ctx, i))
+		}, func(progress int) {
+			if callProgress != nil {
+				callProgress(progress)
 			}
-		}
-	}, func(progress int) {
-		if callProgress != nil {
-			callProgress(progress)
-		}
-	}); err != nil {
+		}); err != nil {
 		return err
 	}

--- a/bindings/go/pkg/whisper/context_test.go
+++ b/bindings/go/pkg/whisper/context_test.go
@ -88,6 +88,37 @@ func TestProcess(t *testing.T) {
 	context, err := model.NewContext()
 	assert.NoError(err)

-	err = context.Process(data, nil, nil)
+	err = context.Process(data, nil, nil, nil)
 	assert.NoError(err)
 }
+
+func TestDetectedLanguage(t *testing.T) {
+	assert := assert.New(t)
+
+	fh, err := os.Open(SamplePath)
+	assert.NoError(err)
+	defer fh.Close()
+
+	// Decode the WAV file - load the full buffer
+	dec := wav.NewDecoder(fh)
+	buf, err := dec.FullPCMBuffer()
+	assert.NoError(err)
+	assert.Equal(uint16(1), dec.NumChans)
+
+	data := buf.AsFloat32Buffer().Data
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	err = context.Process(data, nil, nil, nil)
+	assert.NoError(err)
+
+	expectedLanguage := "en"
+	actualLanguage := context.DetectedLanguage()
+	assert.Equal(expectedLanguage, actualLanguage)
+}
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -16,6 +16,10 @@ type SegmentCallback func(Segment)
 // processing. It is called during the Process function
 type ProgressCallback func(int)

+// EncoderBeginCallback is the callback function for checking if we want to
+// continue processing. It is called during the Process function
+type EncoderBeginCallback func() bool
+
 // Model is the interface to a whisper model. Create a new model with the
 // function whisper.New(string)
 type Model interface {
@ -31,12 +35,13 @@ type Model interface {
 	Languages() []string
 }

-// Context is the speach recognition context.
+// Context is the speech recognition context.
 type Context interface {
 	SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
 	SetTranslate(bool)        // Set translate flag
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language
+	DetectedLanguage() string // Get detected language

 	SetOffset(time.Duration)          // Set offset
 	SetDuration(time.Duration)        // Set duration
@ -58,7 +63,7 @@ type Context interface {
 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
 	// callback function during processing.
-	Process([]float32, SegmentCallback, ProgressCallback) error
+	Process([]float32, EncoderBeginCallback, SegmentCallback, ProgressCallback) error

 	// After process is called, return segments until the end of the stream
 	// is reached, when io.EOF is returned.
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -9,7 +9,7 @@ import (
 // CGO

 /*
-#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
+#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu  -lm -lstdc++ -fopenmp
 #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
 #include <whisper.h>
 #include <stdlib.h>
--- a/bindings/java/build.gradle
+++ b/bindings/java/build.gradle
@ -25,13 +25,13 @@ sourceSets {
 }

 tasks.register('copyLibwhisperDynlib', Copy) {
-    from '../../build'
-    include 'libwhisper.dynlib'
+    from '../../build/src'
+    include 'libwhisper.dylib'
    into 'build/generated/resources/main/darwin'
 }

 tasks.register('copyLibwhisperSo', Copy) {
-    from '../../build'
+    from '../../build/src'
    include 'libwhisper.so'
    into 'build/generated/resources/main/linux-x86-64'
 }
@ -55,7 +55,12 @@ java {
    withJavadocJar()
 }

+sourcesJar() {
+    dependsOn copyLibs
+}
+
 jar {
+    dependsOn copyLibs
    exclude '**/whisper_java.exp', '**/whisper_java.lib'
 }

@ -67,6 +72,9 @@ tasks.withType(Test) {
    useJUnitPlatform()
 }

+test.dependsOn copyLibs
+processResources.dependsOn copyLibs
+
 dependencies {
    implementation "net.java.dev.jna:jna:5.13.0"
    testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
--- a/bindings/java/gradlew
+++ b/bindings/java/gradlew
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperConstants.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperConstants.java
@ -0,0 +1,24 @@
+package io.github.ggerganov.whispercpp;
+
+/**
+ * Presets for alignment heads in DTW token timestamps
+ */
+public class WhisperConstants {
+    // Alignment heads presets
+    public static final int WHISPER_AHEADS_NONE = 0;
+    public static final int WHISPER_AHEADS_TINY_EN = 1;
+    public static final int WHISPER_AHEADS_TINY = 2;
+    public static final int WHISPER_AHEADS_BASE_EN = 3;
+    public static final int WHISPER_AHEADS_BASE = 4;
+    public static final int WHISPER_AHEADS_SMALL_EN = 5;
+    public static final int WHISPER_AHEADS_SMALL = 6;
+    public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
+    public static final int WHISPER_AHEADS_MEDIUM = 8;
+    public static final int WHISPER_AHEADS_LARGE_V1 = 9;
+    public static final int WHISPER_AHEADS_LARGE_V2 = 10;
+    public static final int WHISPER_AHEADS_LARGE_V3 = 11;
+    public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
+    public static final int WHISPER_AHEADS_CUSTOM = 13;
+    public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
+    public static final int WHISPER_AHEADS_COUNT = 15;
+}
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java
@ -1,7 +1,9 @@
 package io.github.ggerganov.whispercpp;

+import com.sun.jna.NativeLong;
 import com.sun.jna.Structure;
 import com.sun.jna.ptr.PointerByReference;
+import com.sun.jna.Pointer;
 import io.github.ggerganov.whispercpp.ggml.GgmlType;
 import io.github.ggerganov.whispercpp.WhisperModel;
 import io.github.ggerganov.whispercpp.params.WhisperContextParams;
@ -9,33 +11,26 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
 import java.util.List;

 public class WhisperContext extends Structure {
-    int t_load_us = 0;
-    int t_start_us = 0;
+    public NativeLong t_load_us;
+    public NativeLong t_start_us;

    /** weight type (FP32 / FP16 / QX) */
-    GgmlType wtype = GgmlType.GGML_TYPE_F16;
+    public GgmlType wtype = GgmlType.GGML_TYPE_F16;
    /** intermediate type (FP32 or FP16) */
-    GgmlType itype = GgmlType.GGML_TYPE_F16;
+    public GgmlType itype = GgmlType.GGML_TYPE_F16;

-//    WhisperModel model;
-    public PointerByReference model;
-//    whisper_vocab vocab;
-//    whisper_state * state = nullptr;
-    public PointerByReference vocab;
-    public PointerByReference state;
+    public WhisperContextParams.ByValue params;
+
+    public Pointer model;
+    public Pointer vocab;
+    public Pointer state;

    /** populated by whisper_init_from_file_with_params() */
-    String path_model;
-    WhisperContextParams params;
+    public Pointer path_model;

-//    public static class ByReference extends WhisperContext implements Structure.ByReference {
-//    }
-//
-//    public static class ByValue extends WhisperContext implements Structure.ByValue {
-//    }
-//
-//    @Override
-//    protected List<String> getFieldOrder() {
-//        return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
-//    }
+    @Override
+    protected List<String> getFieldOrder() {
+        return List.of("t_load_us", "t_start_us", "wtype", "itype",
+                "params", "model", "vocab", "state", "path_model");
+    }
 }
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java
@ -43,11 +43,11 @@ public class WhisperCpp implements AutoCloseable {
     * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
     * @param params - params to use when initialising the context
     */
-    public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
+    public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
        initContextImpl(modelPath, params);
    }

-    private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
+    private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
        if (ctx != null) {
            lib.whisper_free(ctx);
        }
@ -69,15 +69,13 @@ public class WhisperCpp implements AutoCloseable {

    /**
     * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
-     * Because this function allocates memory for the params, the caller must call either:
-     * - call `whisper_free_context_params()`
-     * - `Native.free(Pointer.nativeValue(pointer));`
+     * Returns a ByValue instance to ensure proper parameter passing to native code.
     */
-    public WhisperContextParams getContextDefaultParams() {
-        paramsPointer = lib.whisper_context_default_params_by_ref();
-        WhisperContextParams params = new WhisperContextParams(paramsPointer);
-        params.read();
-        return params;
+    public WhisperContextParams.ByValue getContextDefaultParams() {
+        WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
+            lib.whisper_context_default_params_by_ref());
+        valueParams.read();
+        return valueParams;
    }
    
    /**
@ -88,7 +86,7 @@ public class WhisperCpp implements AutoCloseable {
     *
     * @param strategy - GREEDY
     */
-    public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
+    public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
        Pointer pointer;

        // whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
@ -104,7 +102,7 @@ public class WhisperCpp implements AutoCloseable {
            pointer = beamParamsPointer;
        }

-        WhisperFullParams params = new WhisperFullParams(pointer);
+        WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
        params.read();
        return params;
    }
@ -138,15 +136,21 @@ public class WhisperCpp implements AutoCloseable {
    }

    /**
-     * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
+     * Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text.
     * Not thread safe for same context
     * Uses the specified decoding strategy to obtain the text.
     */
-    public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
+    public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
        if (ctx == null) {
            throw new IllegalStateException("Model not initialised");
        }

+        /*
+        WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
+            lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
+        valueParams.read();
+        */
+
        if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
            throw new IOException("Failed to process audio");
        }
@ -163,12 +167,17 @@ public class WhisperCpp implements AutoCloseable {

        return str.toString().trim();
    }
+
    public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
        if (ctx == null) {
            throw new IllegalStateException("Model not initialised");
        }

-        if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
+        WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
+            lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
+        valueParams.read();
+
+        if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
            throw new IOException("Failed to process audio");
        }

--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@ -38,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library {
     * @param params     Pointer to whisper_context_params
     * @return Whisper context on success, null on failure
     */
-    Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
+    Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);

    /**
     * Allocate (almost) all memory needed for the model by loading from a buffer.
@ -180,12 +180,12 @@ public interface WhisperCppJnaLibrary extends Library {
    /**
     * @return the id of the specified language, returns -1 if not found.
     * Examples:
-     *   "de" -> 2
-     *   "german" -> 2
+     *   "de" -&gt; 2
+     *   "german" -&gt; 2
     */
    int whisper_lang_id(String lang);

-    /** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
+    /** @return the short string of the specified language id (e.g. 2 -&gt; "de"), returns nullptr if not found */
    String whisper_lang_str(int id);

    /**
@ -268,20 +268,21 @@ public interface WhisperCppJnaLibrary extends Library {
    void whisper_free_params(Pointer params);

    /**
-     * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+     * Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text
     * Not thread safe for same context
     * Uses the specified decoding strategy to obtain the text.
     */
-    int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
+    int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);

-    int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
+    public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
+    //int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);

    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
    // Result is stored in the default state of the context
    // Not thread safe if executed in parallel on the same context.
    // It seems this approach can offer some speedup in some cases.
    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
-    int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
+    int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);

    /**
     * Number of generated text segments.
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/callbacks/GgmlAbortCallback.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/callbacks/GgmlAbortCallback.java
@ -0,0 +1,17 @@
+package io.github.ggerganov.whispercpp.callbacks;
+
+import com.sun.jna.Callback;
+
+/**
+ * Callback for aborting GGML computation
+ * Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
+ */
+public interface GgmlAbortCallback extends Callback {
+    /**
+     * Return true to abort the computation, false to continue
+     *
+     * @param data User data passed to the callback
+     * @return true to abort, false to continue
+     */
+    boolean invoke(com.sun.jna.Pointer data);
+}
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAhead.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAhead.java
@ -0,0 +1,30 @@
+package io.github.ggerganov.whispercpp.params;
+import com.sun.jna.*;
+import java.util.Arrays;
+import java.util.List;
+
+public class WhisperAhead extends Structure {
+
+    public int n_text_layer;
+
+    public int n_head;
+
+    public WhisperAhead() {
+        super();
+    }
+
+    public WhisperAhead(int textLayer, int head) {
+        super();
+        this.n_text_layer = textLayer;
+        this.n_head = head;
+    }
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList("n_text_layer", "n_head");
+    }
+
+    public static class ByReference extends WhisperAhead implements Structure.ByReference {}
+
+    public static class ByValue extends WhisperAhead implements Structure.ByValue {}
+}
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAheads.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAheads.java
@ -0,0 +1,41 @@
+package io.github.ggerganov.whispercpp.params;
+import com.sun.jna.*;
+import java.util.Arrays;
+import java.util.List;
+
+public class WhisperAheads extends Structure {
+    public NativeLong n_heads;
+
+    public Pointer heads;
+
+    public WhisperAheads() {
+        super();
+    }
+
+    /**
+     * Create alignment heads from an array of WhisperAhead objects
+     */
+    public void setHeads(WhisperAhead[] aheadsArray) {
+        this.n_heads = new NativeLong(aheadsArray.length);
+
+        int structSize = aheadsArray[0].size();
+        Memory mem = new Memory(structSize * aheadsArray.length);
+
+        for (int i = 0; i < aheadsArray.length; i++) {
+            aheadsArray[i].write();
+            byte[] buffer = aheadsArray[i].getPointer().getByteArray(0, structSize);
+            mem.write(i * structSize, buffer, 0, buffer.length);
+        }
+
+        this.heads = mem;
+    }
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList("n_heads", "heads");
+    }
+
+    public static class ByReference extends WhisperAheads implements Structure.ByReference {}
+
+    public static class ByValue extends WhisperAheads implements Structure.ByValue {}
+}
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java
@ -1,7 +1,5 @@
 package io.github.ggerganov.whispercpp.params;
-
 import com.sun.jna.*;
-
 import java.util.Arrays;
 import java.util.List;

@ -11,21 +9,73 @@ import java.util.List;
 * whisper_context_default_params()
 */
 public class WhisperContextParams extends Structure {
-
    public WhisperContextParams(Pointer p) {
        super(p);
    }

-    /** Use GPU for inference Number (default = true) */
+    public WhisperContextParams() {
+        super();
+    }
+
+    /** Use GPU for inference (default = true) */
    public CBool use_gpu;

-    /** Use GPU for inference Number (default = true) */
+    /** Use flash attention (default = false) */
+    public CBool flash_attn;
+
+    /** CUDA device to use (default = 0) */
+    public int gpu_device;
+
+    /** [EXPERIMENTAL] Enable token-level timestamps with DTW (default = false) */
+    public CBool dtw_token_timestamps;
+
+    /** [EXPERIMENTAL] Alignment heads preset for DTW */
+    public int dtw_aheads_preset;
+
+    /** Number of top layers to use for DTW when using WHISPER_AHEADS_N_TOP_MOST preset */
+    public int dtw_n_top;
+
+    public WhisperAheads.ByValue dtw_aheads;
+
+    /** DTW memory size (internal use) */
+    public NativeLong dtw_mem_size;
+
+    /** Use GPU for inference */
    public void useGpu(boolean enable) {
        use_gpu = enable ? CBool.TRUE : CBool.FALSE;
    }

+    /** Use flash attention */
+    public void useFlashAttn(boolean enable) {
+        flash_attn = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
+    /** Enable DTW token-level timestamps */
+    public void enableDtwTokenTimestamps(boolean enable) {
+        dtw_token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
+    /** Set DTW alignment heads preset */
+    public void setDtwAheadsPreset(int preset) {
+        dtw_aheads_preset = preset;
+    }
+
    @Override
    protected List<String> getFieldOrder() {
-        return Arrays.asList("use_gpu");
+        return Arrays.asList(
+            "use_gpu",
+            "flash_attn",
+            "gpu_device",
+            "dtw_token_timestamps",
+            "dtw_aheads_preset",
+            "dtw_n_top",
+            "dtw_aheads",
+            "dtw_mem_size"
+        );
+    }
+
+    public static class ByValue extends WhisperContextParams implements Structure.ByValue {
+        public ByValue() { super(); }
+        public ByValue(Pointer p) { super(p); }
    }
 }
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@ -5,6 +5,7 @@ import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
 import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
 import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
 import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
+import io.github.ggerganov.whispercpp.callbacks.GgmlAbortCallback;

 import java.util.Arrays;
 import java.util.List;
@ -16,10 +17,12 @@ import java.util.List;
 */
 public class WhisperFullParams extends Structure {

+    public WhisperFullParams() {
+        super();
+    }
+
    public WhisperFullParams(Pointer p) {
        super(p);
-//        super(p, ALIGN_MSVC);
-//        super(p, ALIGN_GNUC);
    }

    /** Sampling strategy for whisper_full() function. */
@ -69,10 +72,10 @@ public class WhisperFullParams extends Structure {
        single_segment = single ? CBool.TRUE : CBool.FALSE;
    }

-    /** Flag to print special tokens (e.g., &lt;SOT>, &lt;EOT>, &lt;BEG>, etc.). (default = false) */
+    /** Flag to print special tokens (e.g., &lt;SOT&gt;, &lt;EOT&gt;, &lt;BEG&gt;, etc.). (default = false) */
    public CBool print_special;

-    /** Flag to print special tokens (e.g., &lt;SOT>, &lt;EOT>, &lt;BEG>, etc.). (default = false) */
+    /** Flag to print special tokens (e.g., &lt;SOT&gt;, &lt;EOT&gt;, &lt;BEG&gt;, etc.). (default = false) */
    public void printSpecial(boolean enable) {
        print_special = enable ? CBool.TRUE : CBool.FALSE;
    }
@ -129,6 +132,14 @@ public class WhisperFullParams extends Structure {
    /** Maximum tokens per segment (0, default = no limit) */
    public int max_tokens;

+    /** [EXPERIMENTAL] Enable debug mode for extra info */
+    public CBool debug_mode;
+
+    /** Enable debug mode */
+    public void enableDebugMode(boolean enable) {
+        debug_mode = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
    /** Overwrite the audio context size (0 = use default). */
    public int audio_ctx;

@ -274,6 +285,16 @@ public class WhisperFullParams extends Structure {
     */
    public Pointer encoder_begin_callback_user_data;

+    /** Callback used to abort GGML computation */
+    public Pointer abort_callback;
+
+    /** User data for the abort_callback */
+    public Pointer abort_callback_user_data;
+
+    public void setAbortCallback(GgmlAbortCallback callback) {
+        abort_callback = CallbackReference.getFunctionPointer(callback);
+    }
+
    /**
     * Callback by each decoder to filter obtained logits.
     * WhisperLogitsFilterCallback
@ -310,17 +331,28 @@ public class WhisperFullParams extends Structure {

    @Override
    protected List<String> getFieldOrder() {
-        return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
-                "no_context", "single_segment", "no_timestamps",
-                "print_special", "print_progress", "print_realtime", "print_timestamps",  "token_timestamps",
-                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
-                "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
-                "suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty",
-                "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
-                "new_segment_callback", "new_segment_callback_user_data",
+        return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
+                "offset_ms", "duration_ms", "translate", "no_context",
+                "no_timestamps", "single_segment", "print_special",
+                "print_progress", "print_realtime", "print_timestamps",
+                "token_timestamps", "thold_pt", "thold_ptsum", "max_len",
+                "split_on_word", "max_tokens", "debug_mode", "audio_ctx", 
+                "tdrz_enable", "suppress_regex", "initial_prompt",
+                "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
+                "suppress_blank", "suppress_nst", "temperature",
+                "max_initial_ts", "length_penalty", "temperature_inc",
+                "entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
+                "beam_search", "new_segment_callback", "new_segment_callback_user_data",
                "progress_callback", "progress_callback_user_data",
                "encoder_begin_callback", "encoder_begin_callback_user_data",
+                "abort_callback", "abort_callback_user_data",
                "logits_filter_callback", "logits_filter_callback_user_data",
                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
    }
+
+    public static class ByValue extends WhisperFullParams implements Structure.ByValue {
+        public ByValue() { super(); }
+        public ByValue(Pointer p) { super(p); }
+    }
+
 }
--- a/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
+++ b/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
@ -76,7 +76,7 @@ class WhisperCppTest {
        float[] floats = new float[b.length / 2];

        //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
-        WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
+        WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
        params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
        params.print_progress = CBool.FALSE;
        //params.initial_prompt = "and so my fellow Americans um, like";
--- a/bindings/javascript/README.md
+++ b/bindings/javascript/README.md
@ -33,6 +33,9 @@ mkdir build-em && cd build-em
 emcmake cmake .. && make -j

 # run test
+node ../tests/test-whisper.js
+
+# For Node.js versions prior to v16.4.0, experimental features need to be enabled:
 node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js

 # publish npm package
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.7.4",
+  "version": "1.7.5",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/ext/cpu.mk
+++ b/bindings/ruby/ext/cpu.mk
@ -1,5 +1,7 @@
 ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
 	ggml/src/ggml-cpu/ggml-cpu.cpp \
+	ggml/src/ggml-cpu/unary-ops.cpp \
+	ggml/src/ggml-cpu/binary-ops.cpp \
 	ggml/include/ggml-backend.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-alloc.h \
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -168,7 +168,9 @@ $OBJ_GGML <<
  'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
  'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
  'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
-  'ggml/src/ggml-cpu/ggml-cpu-traits.o'
+  'ggml/src/ggml-cpu/ggml-cpu-traits.o' <<
+  'ggml/src/ggml-cpu/unary-ops.o' <<
+  'ggml/src/ggml-cpu/binary-ops.o'

 $OBJ_WHISPER <<
  'src/whisper.o' <<
--- a/bindings/ruby/tests/test_callback.rb
+++ b/bindings/ruby/tests/test_callback.rb
@ -25,7 +25,7 @@ class TestCallback < TestBase
        assert start_time >= 0
        assert_kind_of Integer, end_time
        assert end_time > 0
-        assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
+        assert_match(/ask not what your country can do for you, ask what you can do for your country/, text) if i_segment == 0
      end
    }

@ -145,9 +145,9 @@ class TestCallback < TestBase

  def test_abort_on
    do_abort = false
-    aborted_from_callback = false
+    _aborted_from_callback = false
    @params.on_new_segment do |segment|
-      do_abort = true if segment.text.match? /ask/
+      do_abort = true if segment.text.match?(/ask/)
    end
    i = 0
    @params.abort_on do
--- a/bindings/ruby/tests/test_error.rb
+++ b/bindings/ruby/tests/test_error.rb
@ -4,7 +4,7 @@ class TestError < TestBase
  def test_error
    error = Whisper::Error.new(-2)
    assert_equal "failed to compute log mel spectrogram", error.message
-    assert_equal -2, error.code
+    assert_equal(-2, error.code)
  end

  def test_unknown_error
@ -14,7 +14,7 @@ class TestError < TestBase

  def test_non_int_code
    assert_raise TypeError do
-      error = Whisper::Error.new("non int")
+      _error = Whisper::Error.new("non int")
    end
  end
 end
--- a/bindings/ruby/tests/test_params.rb
+++ b/bindings/ruby/tests/test_params.rb
@ -162,7 +162,7 @@ class TestParams < TestBase
  end

  def test_length_penalty
-    assert_equal -1.0, @params.length_penalty
+    assert_equal(-1.0, @params.length_penalty)
    @params.length_penalty = 0.5
    assert_equal 0.5, @params.length_penalty
  end
@ -180,9 +180,9 @@ class TestParams < TestBase
  end

  def test_logprob_thold
-    assert_in_delta -1.0, @params.logprob_thold
+    assert_in_delta(-1.0, @params.logprob_thold)
    @params.logprob_thold = -0.5
-    assert_in_delta -0.5, @params.logprob_thold
+    assert_in_delta(-0.5, @params.logprob_thold)
  end

  def test_no_speech_thold
--- a/bindings/ruby/tests/test_segment.rb
+++ b/bindings/ruby/tests/test_segment.rb
@ -49,13 +49,13 @@ class TestSegment < TestBase
      if index == 0
        seg = segment
        assert_equal 0, segment.start_time
-        assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
+        assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
      end
      index += 1
    end
    whisper.transcribe(AUDIO, params)
    assert_equal 0, seg.start_time
-    assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
  end

  def test_on_new_segment_twice
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@ -16,7 +16,7 @@ class TestWhisper < TestBase
    params.print_timestamps = false

    @whisper.transcribe(AUDIO, params) {|text|
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, text
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
    }
  end

@ -32,7 +32,7 @@ class TestWhisper < TestBase
    def test_full_get_segment
      segment = whisper.full_get_segment(0)
      assert_equal 0, segment.start_time
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
    end

    def test_full_get_segment_t0
@ -59,7 +59,7 @@ class TestWhisper < TestBase
    end

    def test_full_get_segment_text
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0))
    end

    def test_full_get_segment_no_speech_prob
@ -134,14 +134,14 @@ class TestWhisper < TestBase
      @whisper.full(@params, @samples, @samples.length)

      assert_equal 1, @whisper.full_n_segments
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
    end

    def test_full_without_length
      @whisper.full(@params, @samples)

      assert_equal 1, @whisper.full_n_segments
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
    end

    def test_full_enumerator
@ -149,7 +149,7 @@ class TestWhisper < TestBase
      @whisper.full(@params, samples, @samples.length)

      assert_equal 1, @whisper.full_n_segments
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
    end

    def test_full_enumerator_without_length
@ -171,26 +171,28 @@ class TestWhisper < TestBase
      @whisper.full(@params, samples)

      assert_equal 1, @whisper.full_n_segments
-      assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
    end

    def test_full_parallel
-      @whisper.full_parallel(@params, @samples, @samples.length, Etc.nprocessors)
+      nprocessors = 2
+      @whisper.full_parallel(@params, @samples, @samples.length, nprocessors)

-      assert_equal Etc.nprocessors, @whisper.full_n_segments
+      assert_equal nprocessors, @whisper.full_n_segments
      text = @whisper.each_segment.collect(&:text).join
-      assert_match /ask what you can do/i, text
-      assert_match /for your country/i, text
+      assert_match(/ask what you can do/i, text)
+      assert_match(/for your country/i, text)
    end

    def test_full_parallel_with_memory_view
+      nprocessors = 2
      samples = JFKReader.new(AUDIO)
-      @whisper.full_parallel(@params, samples, nil, Etc.nprocessors)
+      @whisper.full_parallel(@params, samples, nil, nprocessors)

-      assert_equal Etc.nprocessors, @whisper.full_n_segments
+      assert_equal nprocessors, @whisper.full_n_segments
      text = @whisper.each_segment.collect(&:text).join
-      assert_match /ask what you can do/i, text
-      assert_match /for your country/i, text
+      assert_match(/ask what you can do/i, text)
+      assert_match(/for your country/i, text)
    end

    def test_full_parallel_without_length_and_n_processors
@ -198,17 +200,18 @@ class TestWhisper < TestBase

      assert_equal 1, @whisper.full_n_segments
      text = @whisper.each_segment.collect(&:text).join
-      assert_match /ask what you can do/i, text
-      assert_match /for your country/i, text
+      assert_match(/ask what you can do/i, text)
+      assert_match(/for your country/i, text)
    end

    def test_full_parallel_without_length
-      @whisper.full_parallel(@params, @samples, nil, Etc.nprocessors)
+      nprocessors = 2
+      @whisper.full_parallel(@params, @samples, nil, nprocessors)

-      assert_equal Etc.nprocessors, @whisper.full_n_segments
+      assert_equal nprocessors, @whisper.full_n_segments
      text = @whisper.each_segment.collect(&:text).join
-      assert_match /ask what you can do/i, text
-      assert_match /for your country/i, text
+      assert_match(/ask what you can do/i, text)
+      assert_match(/for your country/i, text)
    end

    def test_full_parallel_without_n_processors
@ -216,8 +219,8 @@ class TestWhisper < TestBase

      assert_equal 1, @whisper.full_n_segments
      text = @whisper.each_segment.collect(&:text).join
-      assert_match /ask what you can do/i, text
-      assert_match /for your country/i, text
+      assert_match(/ask what you can do/i, text)
+      assert_match(/for your country/i, text)
    end
  end
 end
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@ -108,7 +108,7 @@ setup_framework_structure() {
    fi

    # Copy all required headers (common for all platforms)
-    cp include/whisper.h             ${header_path}
+    cp include/whisper.h           ${header_path}
    cp ggml/include/ggml.h         ${header_path}
    cp ggml/include/ggml-alloc.h   ${header_path}
    cp ggml/include/ggml-backend.h ${header_path}
@ -245,9 +245,16 @@ combine_static_libraries() {
        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
    )
+    if [[ "$platform" == "macos" || "$platform" == "ios" ]]; then
+        echo "Adding libwhisper.coreml library to the build."
+        libs+=(
+            "${base_dir}/${build_dir}/src/${release_dir}/libwhisper.coreml.a"
+        )
+    fi

    # Create temporary directory for processing
    local temp_dir="${base_dir}/${build_dir}/temp"
+    echo "Creating temporary directory: ${temp_dir}"
    mkdir -p "${temp_dir}"

    # Since we have multiple architectures libtool will find object files that do not
@ -259,6 +266,7 @@ combine_static_libraries() {
    local archs=""
    local min_version_flag=""
    local install_name=""
+    local frameworks="-framework Foundation -framework Metal -framework Accelerate"

    case "$platform" in
        "ios")
@ -272,12 +280,14 @@ combine_static_libraries() {
                min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
            fi
            install_name="@rpath/whisper.framework/whisper"
+            frameworks+=" -framework CoreML"
            ;;
        "macos")
            sdk="macosx"
            archs="arm64 x86_64"
            min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
            install_name="@rpath/whisper.framework/Versions/Current/whisper"
+            frameworks+=" -framework CoreML"
            ;;
        "visionos")
            if [[ "$is_simulator" == "true" ]]; then
@ -319,7 +329,7 @@ combine_static_libraries() {
        $arch_flags \
        $min_version_flag \
        -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
+        $frameworks \
        -install_name "$install_name" \
        -o "${base_dir}/${output_lib}"

@ -399,6 +409,8 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DWHISPER_COREML="ON" \
+    -DWHISPER_COREML_ALLOW_FALLBACK="ON" \
    -S .
 cmake --build build-ios-sim --config Release -- -quiet

@ -411,6 +423,8 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DWHISPER_COREML="ON" \
+    -DWHISPER_COREML_ALLOW_FALLBACK="ON" \
    -S .
 cmake --build build-ios-device --config Release -- -quiet

@ -421,6 +435,8 @@ cmake -B build-macos -G Xcode \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
+    -DWHISPER_COREML="ON" \
+    -DWHISPER_COREML_ALLOW_FALLBACK="ON" \
    -S .
 cmake --build build-macos --config Release -- -quiet

@ -432,8 +448,8 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xros \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -S .
 cmake --build build-visionos --config Release -- -quiet

@ -445,8 +461,8 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xrsimulator \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet

--- a/ci/run.sh
+++ b/ci/run.sh
@ -10,6 +10,8 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -324,8 +326,9 @@ ret=0
 for model in "${MODELS[@]}"; do
    test $ret -eq 0 && gg_download_model ${model}
 done
-
-test $ret -eq 0 && gg_run ctest debug
+if [ -z ${GG_BUILD_SYCL}]; then
+    test $ret -eq 0 && gg_run ctest debug
+fi
 test $ret -eq 0 && gg_run ctest release

 test $ret -eq 0 && gg_run bench
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -18,6 +18,7 @@ const whisperParamsMock = {
  translate: true,
  no_timestamps: false,
  audio_ctx: 0,
+  max_len: 0,
 };

 describe("Run whisper.node", () => {
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -128,192 +128,227 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper

 void cb_log_disable(enum ggml_log_level, const char *, void *) {}

-int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
-    if (params.no_prints) {
-        whisper_log_set(cb_log_disable, NULL);
-    }
-
-    if (params.fname_inp.empty() && params.pcmf32.empty()) {
-        fprintf(stderr, "error: no input files or audio buffer specified\n");
-        return 2;
-    }
-
-    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context_params cparams = whisper_context_default_params();
-    cparams.use_gpu = params.use_gpu;
-    cparams.flash_attn = params.flash_attn;
-    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 3;
-    }
-
-    // if params.pcmf32 is provided, set params.fname_inp to "buffer"
-    // this is simpler than further modifications in the code
-    if (!params.pcmf32.empty()) {
-        fprintf(stderr, "info: using audio buffer as input\n");
-        params.fname_inp.clear();
-        params.fname_inp.emplace_back("buffer");
-    }
-
-    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
-        const auto fname_inp = params.fname_inp[f];
-        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
-
-        std::vector<float> pcmf32; // mono-channel F32 PCM
-        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-        // read the input audio file if params.pcmf32 is not provided
-        if (params.pcmf32.empty()) {
-            if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-                fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
-                continue;
-            }
-        } else {
-            pcmf32 = params.pcmf32;
-        }
-
-        // print system information
-        if (!params.no_prints) {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-        }
-
-        // print some info about the processing
-        if (!params.no_prints) {
-            fprintf(stderr, "\n");
-            if (!whisper_is_multilingual(ctx)) {
-                if (params.language != "en" || params.translate) {
-                    params.language = "en";
-                    params.translate = false;
-                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-                }
-            }
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
-                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors,
-                    params.language.c_str(),
-                    params.translate ? "translate" : "transcribe",
-                    params.no_timestamps ? 0 : 1,
-                    params.audio_ctx);
-
-            fprintf(stderr, "\n");
-        }
-
-        // run the inference
-        {
-            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-            wparams.print_realtime   = false;
-            wparams.print_progress   = params.print_progress;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special    = params.print_special;
-            wparams.translate        = params.translate;
-            wparams.language         = params.language.c_str();
-            wparams.n_threads        = params.n_threads;
-            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-            wparams.offset_ms        = params.offset_t_ms;
-            wparams.duration_ms      = params.duration_ms;
-
-            wparams.token_timestamps = params.output_wts || params.max_len > 0;
-            wparams.thold_pt         = params.word_thold;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
-            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-            wparams.audio_ctx        = params.audio_ctx;
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.initial_prompt   = params.prompt.c_str();
-
-            wparams.no_timestamps    = params.no_timestamps;
-
-            whisper_print_user_data user_data = { &params, &pcmf32s };
-
-            // this callback is called on each new segment
-            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &user_data;
-            }
-
-            // example for abort mechanism
-            // in this example, we do not abort the processing, but we could if the flag is set to true
-            // the callback is called before every encoder run - if it returns false, the processing is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return !is_aborted;
-                };
-                wparams.encoder_begin_callback_user_data = &is_aborted;
-            }
-
-            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-                fprintf(stderr, "failed to process audio\n");
-                return 10;
-            }
-        }
-    }
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    result.resize(n_segments);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-        result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
-        result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
-        result[i].emplace_back(text);
-    }
-
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
-
-    return 0;
-}
-
-class Worker : public Napi::AsyncWorker {
+class ProgressWorker : public Napi::AsyncWorker {
 public:
-  Worker(Napi::Function& callback, whisper_params params)
-      : Napi::AsyncWorker(callback), params(params) {}
-
-  void Execute() override {
-    run(params, result);
-  }
-
-  void OnOK() override {
-    Napi::HandleScope scope(Env());
-    Napi::Object res = Napi::Array::New(Env(), result.size());
-    for (uint64_t i = 0; i < result.size(); ++i) {
-      Napi::Object tmp = Napi::Array::New(Env(), 3);
-      for (uint64_t j = 0; j < 3; ++j) {
-        tmp[j] = Napi::String::New(Env(), result[i][j]);
-      }
-      res[i] = tmp;
+    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
+        : Napi::AsyncWorker(callback), params(params), env(env) {
+        // Create thread-safe function
+        if (!progress_callback.IsEmpty()) {
+            tsfn = Napi::ThreadSafeFunction::New(
+                env,
+                progress_callback,
+                "Progress Callback",
+                0,
+                1
+            );
+        }
+    }
+
+    ~ProgressWorker() {
+        if (tsfn) {
+            // Make sure to release the thread-safe function on destruction
+            tsfn.Release();
+        }
+    }
+
+    void Execute() override {
+        // Use custom run function with progress callback support
+        run_with_progress(params, result);
+    }
+
+    void OnOK() override {
+        Napi::HandleScope scope(Env());
+        Napi::Object res = Napi::Array::New(Env(), result.size());
+        for (uint64_t i = 0; i < result.size(); ++i) {
+            Napi::Object tmp = Napi::Array::New(Env(), 3);
+            for (uint64_t j = 0; j < 3; ++j) {
+                tmp[j] = Napi::String::New(Env(), result[i][j]);
+            }
+            res[i] = tmp;
+        }
+        Callback().Call({Env().Null(), res});
+    }
+
+    // Progress callback function - using thread-safe function
+    void OnProgress(int progress) {
+        if (tsfn) {
+            // Use thread-safe function to call JavaScript callback
+            auto callback = [progress](Napi::Env env, Napi::Function jsCallback) {
+                jsCallback.Call({Napi::Number::New(env, progress)});
+            };
+            
+            tsfn.BlockingCall(callback);
+        }
    }
-    Callback().Call({Env().Null(), res});
-  }

 private:
-  whisper_params params;
-  std::vector<std::vector<std::string>> result;
+    whisper_params params;
+    std::vector<std::vector<std::string>> result;
+    Napi::Env env;
+    Napi::ThreadSafeFunction tsfn;
+
+    // Custom run function with progress callback support
+    int run_with_progress(whisper_params &params, std::vector<std::vector<std::string>> &result) {
+        if (params.no_prints) {
+            whisper_log_set(cb_log_disable, NULL);
+        }
+
+        if (params.fname_inp.empty() && params.pcmf32.empty()) {
+            fprintf(stderr, "error: no input files or audio buffer specified\n");
+            return 2;
+        }
+
+        if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
+            fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+            exit(0);
+        }
+
+        // whisper init
+        struct whisper_context_params cparams = whisper_context_default_params();
+        cparams.use_gpu = params.use_gpu;
+        cparams.flash_attn = params.flash_attn;
+        struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+
+        if (ctx == nullptr) {
+            fprintf(stderr, "error: failed to initialize whisper context\n");
+            return 3;
+        }
+
+        // If params.pcmf32 provides, set params.fname_inp as "buffer"
+        if (!params.pcmf32.empty()) {
+            fprintf(stderr, "info: using audio buffer as input\n");
+            params.fname_inp.clear();
+            params.fname_inp.emplace_back("buffer");
+        }
+
+        for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
+            const auto fname_inp = params.fname_inp[f];
+            const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
+
+            std::vector<float> pcmf32; // mono-channel F32 PCM
+            std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+
+            // If params.pcmf32 is empty, read input audio file
+            if (params.pcmf32.empty()) {
+                if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
+                    fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
+                    continue;
+                }
+            } else {
+                pcmf32 = params.pcmf32;
+            }
+
+            // Print system info
+            if (!params.no_prints) {
+                fprintf(stderr, "\n");
+                fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                        params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+            }
+
+            // Print processing info
+            if (!params.no_prints) {
+                fprintf(stderr, "\n");
+                if (!whisper_is_multilingual(ctx)) {
+                    if (params.language != "en" || params.translate) {
+                        params.language = "en";
+                        params.translate = false;
+                        fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                    }
+                }
+                fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
+                        __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
+                        params.n_threads, params.n_processors,
+                        params.language.c_str(),
+                        params.translate ? "translate" : "transcribe",
+                        params.no_timestamps ? 0 : 1,
+                        params.audio_ctx);
+
+                fprintf(stderr, "\n");
+            }
+
+            // Run inference
+            {
+                whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+                wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+
+                wparams.print_realtime   = false;
+                wparams.print_progress   = params.print_progress;
+                wparams.print_timestamps = !params.no_timestamps;
+                wparams.print_special    = params.print_special;
+                wparams.translate        = params.translate;
+                wparams.language         = params.language.c_str();
+                wparams.n_threads        = params.n_threads;
+                wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
+                wparams.offset_ms        = params.offset_t_ms;
+                wparams.duration_ms      = params.duration_ms;
+
+                wparams.token_timestamps = params.output_wts || params.max_len > 0;
+                wparams.thold_pt         = params.word_thold;
+                wparams.entropy_thold    = params.entropy_thold;
+                wparams.logprob_thold    = params.logprob_thold;
+                wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
+                wparams.audio_ctx        = params.audio_ctx;
+
+                wparams.greedy.best_of        = params.best_of;
+                wparams.beam_search.beam_size = params.beam_size;
+
+                wparams.initial_prompt   = params.prompt.c_str();
+
+                wparams.no_timestamps    = params.no_timestamps;
+
+                whisper_print_user_data user_data = { &params, &pcmf32s };
+
+                // This callback is called for each new segment
+                if (!wparams.print_realtime) {
+                    wparams.new_segment_callback           = whisper_print_segment_callback;
+                    wparams.new_segment_callback_user_data = &user_data;
+                }
+
+                // Set progress callback
+                wparams.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
+                    ProgressWorker* worker = static_cast<ProgressWorker*>(user_data);
+                    worker->OnProgress(progress);
+                };
+                wparams.progress_callback_user_data = this;
+
+                // Abort mechanism example
+                {
+                    static bool is_aborted = false; // Note: this should be atomic to avoid data races
+
+                    wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                        bool is_aborted = *(bool*)user_data;
+                        return !is_aborted;
+                    };
+                    wparams.encoder_begin_callback_user_data = &is_aborted;
+                }
+
+                if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+                    fprintf(stderr, "failed to process audio\n");
+                    return 10;
+                }
+            }
+    }
+
+        const int n_segments = whisper_full_n_segments(ctx);
+        result.resize(n_segments);
+        for (int i = 0; i < n_segments; ++i) {
+            const char * text = whisper_full_get_segment_text(ctx, i);
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+            result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
+            result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
+            result[i].emplace_back(text);
+        }
+
+        whisper_print_timings(ctx);
+        whisper_free(ctx);
+
+        return 0;
+    }
 };

-
-
 Napi::Value whisper(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  if (info.Length() <= 0 || !info[0].IsObject()) {
@ -332,6 +367,23 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
  int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
  bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
  int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
+  
+  // support prompt
+  std::string prompt = "";
+  if (whisper_params.Has("prompt") && whisper_params.Get("prompt").IsString()) {
+    prompt = whisper_params.Get("prompt").As<Napi::String>();
+  }
+  
+  // Add support for print_progress
+  bool print_progress = false;
+  if (whisper_params.Has("print_progress")) {
+    print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
+  }
+  // Add support for progress_callback
+  Napi::Function progress_callback;
+  if (whisper_params.Has("progress_callback") && whisper_params.Get("progress_callback").IsFunction()) {
+    progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
+  }

  Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
  std::vector<float> pcmf32_vec;
@ -355,9 +407,12 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
  params.pcmf32 = pcmf32_vec;
  params.comma_in_time = comma_in_time;
  params.max_len = max_len;
+  params.print_progress = print_progress;
+  params.prompt = prompt;

  Napi::Function callback = info[1].As<Napi::Function>();
-  Worker* worker = new Worker(callback, params);
+  // Create a new Worker class with progress callback support
+  ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
  worker->Queue();
  return env.Undefined();
 }
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -19,6 +19,9 @@ const whisperParams = {
  no_timestamps: false,
  audio_ctx: 0,
  max_len: 0,
+  progress_callback: (progress) => {
+      console.log(`progress: ${progress}%`);
+    }
 };

 const arguments = process.argv.slice(2);
--- a/examples/bench.wasm/README.md
+++ b/examples/bench.wasm/README.md
@ -2,7 +2,7 @@

 Benchmark the performance of whisper.cpp in the browser using WebAssembly

-Link: https://whisper.ggerganov.com/bench/
+Link: https://ggerganov.github.io/whisper.cpp/bench.wasm

 Terminal version: [examples/bench](/examples/bench)

@ -15,7 +15,17 @@ cd whisper.cpp
 mkdir build-em && cd build-em
 emcmake cmake ..
 make -j
+```
+The example can then be started by running a local HTTP server:
+```console
+python3 examples/server.py
+```
+And then opening a browser to the following URL:
+http://localhost:8000/bench.wasm

+To run the example in a different server, you need to copy the following files
+to the server's HTTP path:
+```
 # copy the produced page to your HTTP path
 cp bin/bench.wasm/*       /path/to/html/
 cp bin/libbench.worker.js /path/to/html/
--- a/examples/bench.wasm/index-tmpl.html
+++ b/examples/bench.wasm/index-tmpl.html
@ -24,6 +24,8 @@
                overflow-x: scroll;
            }
        </style>
+        <script src="../coi-serviceworker.js"></script>
+        <link rel="icon" href="data:,">
    </head>
    <body>
        <div id="main-container">
@ -36,11 +38,10 @@
            <br><br>

            <b>More examples:</b>
-                <a href="https://whisper.ggerganov.com/">main</a> |
-                <a href="https://whisper.ggerganov.com/bench">bench</a> |
-                <a href="https://whisper.ggerganov.com/stream">stream</a> |
-                <a href="https://whisper.ggerganov.com/command">command</a> |
-                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+                <a href="../">main</a> |
+                <a href="../bench.wasm/">bench</a> |
+                <a href="../stream.wasm">stream</a> |
+                <a href="../command.wasm/">command</a> |

            <br><br>

--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@ -13,7 +13,9 @@
 #include <cstring>

 #if defined(_WIN32)
+#ifndef NOMINMAX
 #define NOMINMAX
+#endif
 #include <windows.h>
 #endif

--- a/examples/coi-serviceworker.js
+++ b/examples/coi-serviceworker.js
@ -0,0 +1,146 @@
+/*! coi-serviceworker v0.1.7 - Guido Zuidhof and contributors, licensed under MIT */
+let coepCredentialless = false;
+if (typeof window === 'undefined') {
+    self.addEventListener("install", () => self.skipWaiting());
+    self.addEventListener("activate", (event) => event.waitUntil(self.clients.claim()));
+
+    self.addEventListener("message", (ev) => {
+        if (!ev.data) {
+            return;
+        } else if (ev.data.type === "deregister") {
+            self.registration
+                .unregister()
+                .then(() => {
+                    return self.clients.matchAll();
+                })
+                .then(clients => {
+                    clients.forEach((client) => client.navigate(client.url));
+                });
+        } else if (ev.data.type === "coepCredentialless") {
+            coepCredentialless = ev.data.value;
+        }
+    });
+
+    self.addEventListener("fetch", function (event) {
+        const r = event.request;
+        if (r.cache === "only-if-cached" && r.mode !== "same-origin") {
+            return;
+        }
+
+        const request = (coepCredentialless && r.mode === "no-cors")
+            ? new Request(r, {
+                credentials: "omit",
+            })
+            : r;
+        event.respondWith(
+            fetch(request)
+                .then((response) => {
+                    if (response.status === 0) {
+                        return response;
+                    }
+
+                    const newHeaders = new Headers(response.headers);
+                    newHeaders.set("Cross-Origin-Embedder-Policy",
+                        coepCredentialless ? "credentialless" : "require-corp"
+                    );
+                    if (!coepCredentialless) {
+                        newHeaders.set("Cross-Origin-Resource-Policy", "cross-origin");
+                    }
+                    newHeaders.set("Cross-Origin-Opener-Policy", "same-origin");
+
+                    return new Response(response.body, {
+                        status: response.status,
+                        statusText: response.statusText,
+                        headers: newHeaders,
+                    });
+                })
+                .catch((e) => console.error(e))
+        );
+    });
+
+} else {
+    (() => {
+        const reloadedBySelf = window.sessionStorage.getItem("coiReloadedBySelf");
+        window.sessionStorage.removeItem("coiReloadedBySelf");
+        const coepDegrading = (reloadedBySelf == "coepdegrade");
+
+        // You can customize the behavior of this script through a global `coi` variable.
+        const coi = {
+            shouldRegister: () => !reloadedBySelf,
+            shouldDeregister: () => false,
+            coepCredentialless: () => true,
+            coepDegrade: () => true,
+            doReload: () => window.location.reload(),
+            quiet: false,
+            ...window.coi
+        };
+
+        const n = navigator;
+        const controlling = n.serviceWorker && n.serviceWorker.controller;
+
+        // Record the failure if the page is served by serviceWorker.
+        if (controlling && !window.crossOriginIsolated) {
+            window.sessionStorage.setItem("coiCoepHasFailed", "true");
+        }
+        const coepHasFailed = window.sessionStorage.getItem("coiCoepHasFailed");
+
+        if (controlling) {
+            // Reload only on the first failure.
+            const reloadToDegrade = coi.coepDegrade() && !(
+                coepDegrading || window.crossOriginIsolated
+            );
+            n.serviceWorker.controller.postMessage({
+                type: "coepCredentialless",
+                value: (reloadToDegrade || coepHasFailed && coi.coepDegrade())
+                    ? false
+                    : coi.coepCredentialless(),
+            });
+            if (reloadToDegrade) {
+                !coi.quiet && console.log("Reloading page to degrade COEP.");
+                window.sessionStorage.setItem("coiReloadedBySelf", "coepdegrade");
+                coi.doReload("coepdegrade");
+            }
+
+            if (coi.shouldDeregister()) {
+                n.serviceWorker.controller.postMessage({ type: "deregister" });
+            }
+        }
+
+        // If we're already coi: do nothing. Perhaps it's due to this script doing its job, or COOP/COEP are
+        // already set from the origin server. Also if the browser has no notion of crossOriginIsolated, just give up here.
+        if (window.crossOriginIsolated !== false || !coi.shouldRegister()) return;
+
+        if (!window.isSecureContext) {
+            !coi.quiet && console.log("COOP/COEP Service Worker not registered, a secure context is required.");
+            return;
+        }
+
+        // In some environments (e.g. Firefox private mode) this won't be available
+        if (!n.serviceWorker) {
+            !coi.quiet && console.error("COOP/COEP Service Worker not registered, perhaps due to private mode.");
+            return;
+        }
+
+        n.serviceWorker.register(window.document.currentScript.src).then(
+            (registration) => {
+                !coi.quiet && console.log("COOP/COEP Service Worker registered", registration.scope);
+
+                registration.addEventListener("updatefound", () => {
+                    !coi.quiet && console.log("Reloading page to make use of updated COOP/COEP Service Worker.");
+                    window.sessionStorage.setItem("coiReloadedBySelf", "updatefound");
+                    coi.doReload();
+                });
+
+                // If the registration is active, but it's not controlling the page
+                if (registration.active && !n.serviceWorker.controller) {
+                    !coi.quiet && console.log("Reloading page to make use of COOP/COEP Service Worker.");
+                    window.sessionStorage.setItem("coiReloadedBySelf", "notcontrolling");
+                    coi.doReload();
+                }
+            },
+            (err) => {
+                !coi.quiet && console.error("COOP/COEP Service Worker failed to register:", err);
+            }
+        );
+    })();
+}
--- a/examples/command.wasm/README.md
+++ b/examples/command.wasm/README.md
@ -3,7 +3,7 @@
 This is a basic Voice Assistant example that accepts voice commands from the microphone.
 It runs in fully in the browser via WebAseembly.

-Online demo: https://whisper.ggerganov.com/command/
+Online demo: https://ggerganov.github.io/whisper.cpp/command.wasm

 Terminal version: [examples/command](/examples/command)

@ -15,9 +15,18 @@ git clone https://github.com/ggerganov/whisper.cpp
 cd whisper.cpp
 mkdir build-em && cd build-em
 emcmake cmake ..
-make -j
+make -j libcommand
+```
+The example can then be started by running a local HTTP server:
+```console
+python3 examples/server.py
+```
+And then opening a browser to the following URL:
+http://localhost:8000/command.wasm/

-# copy the produced page to your HTTP path
+To run the example in a different server, you need to copy the following files
+to the server's HTTP path:
+```
 cp bin/command.wasm/*       /path/to/html/
 cp bin/libcommand.worker.js /path/to/html/
 ```
--- a/examples/command.wasm/index-tmpl.html
+++ b/examples/command.wasm/index-tmpl.html
@ -24,6 +24,8 @@
                overflow-x: scroll;
            }
        </style>
+        <script src="../coi-serviceworker.js"></script>
+        <link rel="icon" href="data:,">
    </head>
    <body>
        <div id="main-container">
@ -36,11 +38,10 @@
            <br><br>

            <b>More examples:</b>
-                <a href="https://whisper.ggerganov.com/">main</a> |
-                <a href="https://whisper.ggerganov.com/bench">bench</a> |
-                <a href="https://whisper.ggerganov.com/stream">stream</a> |
-                <a href="https://whisper.ggerganov.com/command">command</a> |
-                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+                <a href="../">main</a> |
+                <a href="../bench.wasm/">bench</a> |
+                <a href="../stream.wasm">stream</a> |
+                <a href="../command.wasm/">command</a> |

            <br><br>

--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -247,17 +247,6 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
    return result;
 }

-std::string convert_to_utf8(const std::wstring & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.to_bytes(input);
-}
-
-
-std::wstring convert_to_wstring(const std::string & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.from_bytes(input);
-}
-
 void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);
--- a/examples/deprecation-warning/CMakeLists.txt
+++ b/examples/deprecation-warning/CMakeLists.txt
@ -1,4 +1,6 @@
 add_executable(main ./deprecation-warning.cpp)
 add_executable(bench ./deprecation-warning.cpp)
-add_executable(stream ./deprecation-warning.cpp)
-add_executable(command ./deprecation-warning.cpp)
+if (WHISPER_SDL2)
+    add_executable(stream ./deprecation-warning.cpp)
+    add_executable(command ./deprecation-warning.cpp)
+endif()
--- a/examples/server.py
+++ b/examples/server.py
@ -0,0 +1,39 @@
+import http.server
+import socketserver
+import os
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).parent.absolute()
+DIRECTORY = os.path.join(SCRIPT_DIR, "../build-em/bin")
+DIRECTORY = os.path.abspath(DIRECTORY)
+
+class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, directory=DIRECTORY, **kwargs)
+
+    def do_GET(self):
+        # If requesting a worker file from any subdirectory
+        if '.worker.js' in self.path:
+            worker_file = os.path.basename(self.path)
+            worker_path = os.path.join(DIRECTORY, worker_file)
+
+            if os.path.exists(worker_path):
+                self.path = '/' + worker_file
+
+        return super().do_GET()
+
+    def end_headers(self):
+        # Add required headers for SharedArrayBuffer
+        self.send_header("Cross-Origin-Opener-Policy", "same-origin")
+        self.send_header("Cross-Origin-Embedder-Policy", "require-corp")
+        self.send_header("Access-Control-Allow-Origin", "*");
+        super().end_headers()
+
+PORT = 8000
+
+with socketserver.TCPServer(("", PORT), CustomHTTPRequestHandler) as httpd:
+    print(f"Serving directory '{DIRECTORY}' at http://localhost:{PORT}")
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        print("\nServer stopped.")
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1024,6 +1024,11 @@ int main(int argc, char ** argv) {
        // check if the model is in the file system
    });

+    svr.Get(sparams.request_path + "/health", [&](const Request &, Response &res){
+        const std::string health_response = "{\"status\":\"ok\"}";
+        res.set_content(health_response, "application/json");
+    });
+
    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
        const char fmt[] = "500 Internal Server Error\n%s";
        char buf[BUFSIZ];
--- a/examples/stream.wasm/README.md
+++ b/examples/stream.wasm/README.md
@ -13,7 +13,17 @@ cd whisper.cpp
 mkdir build-em && cd build-em
 emcmake cmake ..
 make -j
+```
+The example can then be started by running a local HTTP server:
+```console
+python3 examples/server.py
+```
+And then opening a browser to the following URL:
+http://localhost:8000/stream.wasm

+To run the example in a different server, you need to copy the following files
+to the server's HTTP path:
+```
 # copy the produced page to your HTTP path
 cp bin/stream.wasm/*       /path/to/html/
 cp bin/libstream.worker.js /path/to/html/
--- a/examples/stream.wasm/index-tmpl.html
+++ b/examples/stream.wasm/index-tmpl.html
@ -24,6 +24,8 @@
                overflow-x: scroll;
            }
        </style>
+        <script src="../coi-serviceworker.js"></script>
+        <link rel="icon" href="data:,">
    </head>
    <body>
        <div id="main-container">
@ -36,11 +38,10 @@
            <br><br>

            <b>More examples:</b>
-                <a href="https://whisper.ggerganov.com/">main</a> |
-                <a href="https://whisper.ggerganov.com/bench">bench</a> |
-                <a href="https://whisper.ggerganov.com/stream">stream</a> |
-                <a href="https://whisper.ggerganov.com/command">command</a> |
-                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+                <a href="../">main</a> |
+                <a href="../bench.wasm/">bench</a> |
+                <a href="../stream.wasm">stream</a> |
+                <a href="../command.wasm/">command</a> |

            <br><br>

--- a/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.10)

 project(whisper.cpp)

-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)

 set(SOURCE_FILES
    ${WHISPER_LIB_DIR}/ggml/src/ggml.c
-    ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/unary-ops.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/binary-ops.cpp
    ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp
    ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp
    ${WHISPER_LIB_DIR}/src/whisper.cpp
    ${CMAKE_SOURCE_DIR}/jni.c
    )
@ -25,6 +33,7 @@ function(build_library target_name)
    )

    target_link_libraries(${target_name} ${LOG_LIB} android)
+    target_compile_definitions(${target_name} PUBLIC GGML_USE_CPU)

    if (${target_name} STREQUAL "whisper_v8fp16_va")
        target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
@ -57,3 +66,4 @@ include_directories(${WHISPER_LIB_DIR}/src)
 include_directories(${WHISPER_LIB_DIR}/include)
 include_directories(${WHISPER_LIB_DIR}/ggml/include)
 include_directories(${WHISPER_LIB_DIR}/ggml/src)
+include_directories(${WHISPER_LIB_DIR}/ggml/src/ggml-cpu)
--- a/examples/whisper.android.java/build.gradle
+++ b/examples/whisper.android.java/build.gradle
@ -16,9 +16,10 @@ allprojects {
  repositories {
    google()
    jcenter()
+    maven { url "https://maven.aliyun.com/repository/gradle-plugin" }
  }
 }

 task clean(type: Delete) {
  delete rootProject.buildDir
-}
+}
--- a/examples/whisper.android.java/gradlew
+++ b/examples/whisper.android.java/gradlew
--- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
@ -32,6 +32,8 @@ if (NOT GGML_HOME)
        ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
        ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/unary-ops.cpp
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/binary-ops.cpp
        )
 endif()

@ -44,6 +46,8 @@ function(build_library target_name)
        ${SOURCE_FILES}
    )

+    target_compile_definitions(${target_name} PUBLIC GGML_USE_CPU)
+
    if (${target_name} STREQUAL "whisper_v8fp16_va")
        target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
        set(GGML_COMPILE_OPTIONS                      -march=armv8.2-a+fp16)
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -11,39 +11,25 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b

 ## Usage

+This example uses the whisper.xcframework which needs to be built first using the following command:
 ```bash
-git clone https://github.com/ggerganov/whisper.cpp
-open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
+./build-xcframework.sh
+```

-# if you don't want to convert a Core ML model, you can skip this step by create dummy model
+A model is also required to be downloaded and can be done using the following command:
+```bash
+./models/download-ggml-model.sh base.en
+```
+
+If you don't want to convert a Core ML model, you can skip this step by creating dummy model:
+```bash
 mkdir models/ggml-base.en-encoder.mlmodelc
 ```

-Make sure to build the project in `Release`:
-
-<img width="947" alt="image" src="https://user-images.githubusercontent.com/1991296/197382607-9e1e6d1b-79fa-496f-9d16-b71dc1535701.png">
-
-Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag for `ggml.c` in Build Phases.
-This can significantly improve the performance of the transcription:
-
-<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
-
 ## Core ML

-If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flag for `whisper.cpp` in Build Phases:
-
-<img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">
-
-Then follow the [`Core ML support` section of readme](../../README.md#core-ml-support) for convert the model.
-
-In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
-
-## Metal
-
-You can also enable Metal to make the inference run on the GPU of your device. This might or might not be more efficient
-compared to Core ML depending on the model and device that you use.
-
-To enable Metal, just add `-DGGML_USE_METAL` instead off the `-DWHISPER_USE_COREML` flag and you are ready.
-This will make both the Encoder and the Decoder run on the GPU.
-
-If you want to run the Encoder with Core ML and the Decoder with Metal then simply add both `-DWHISPER_USE_COREML -DGGML_USE_METAL` flags. That's all!
+Follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the model.
+That is all the needs to be done to use the Core ML model in the app. The converted model is a
+resource in the project and will be used if it is available. Note that the Core ML model is only
+used for the encoder, the decoder which is in the ggml model is still required so both need to
+be available.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -7,7 +7,6 @@
 	objects = {

 /* Begin PBXBuildFile section */
-		1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
 		18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
 		18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7D29052BDF00BD2A04 /* SceneDelegate.m */; };
 		18627C8129052BDF00BD2A04 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8029052BDF00BD2A04 /* ViewController.m */; };
@ -15,23 +14,12 @@
 		18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
 		18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
 		18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
-		18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; };
-		18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; };
 		18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
-		18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
-		18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
-		18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
-		18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
-		18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
-		18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */; settings = {COMPILER_FLAGS = "-x c++"; }; };
-		18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
-		18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
-		433188B82D3A187C00E3FE79 /* gguf.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 433188B72D3A187C00E3FE79 /* gguf.cpp */; };
-		437B63E22D36280C002A49EC /* ggml-cpu-traits.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 437B63E12D36280C002A49EC /* ggml-cpu-traits.cpp */; };
 		7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
 		7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
-		7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
 		7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */ = {isa = PBXBuildFile; fileRef = 7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */; };
+		DDE3609F2D87EA8C004EA223 /* whisper.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DDE3609E2D87EA8C004EA223 /* whisper.xcframework */; };
+		DDE360A02D87EA8C004EA223 /* whisper.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DDE3609E2D87EA8C004EA223 /* whisper.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 /* End PBXBuildFile section */

 /* Begin PBXCopyFilesBuildPhase section */
@ -45,11 +33,20 @@
 			name = "Copy Files";
 			runOnlyForDeploymentPostprocessing = 0;
 		};
+		DDE360A12D87EA8C004EA223 /* Embed Frameworks */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 10;
+			files = (
+				DDE360A02D87EA8C004EA223 /* whisper.xcframework in Embed Frameworks */,
+			);
+			name = "Embed Frameworks";
+			runOnlyForDeploymentPostprocessing = 0;
+		};
 /* End PBXCopyFilesBuildPhase section */

 /* Begin PBXFileReference section */
-		184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
-		184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
 		18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
 		18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -62,34 +59,7 @@
 		18627C8829052BE000BD2A04 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
 		18627C8A29052BE000BD2A04 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		18627C8B29052BE000BD2A04 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
-		18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../include/whisper.h; sourceTree = "<group>"; };
-		18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../src/whisper.cpp; sourceTree = "<group>"; };
-		18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml/src/ggml.c; sourceTree = "<group>"; };
-		18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml/include/ggml.h; sourceTree = "<group>"; };
 		18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
-		18A275FF2C2A9563001C8D37 /* ggml-common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-common.h"; path = "../../../ggml/src/ggml-common.h"; sourceTree = "<group>"; };
-		18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
-		18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
-		18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
-		18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
-		18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
-		18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
-		18B07DCB2D70411100B3B87C /* ggml-cpp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpp.h"; path = "../../../ggml/include/ggml-cpp.h"; sourceTree = "<group>"; };
-		18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
-		18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
-		18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
-		18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
-		18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
-		18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
-		18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp"; sourceTree = "<group>"; };
-		18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
-		18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
-		18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
-		18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = "<group>"; };
-		433188B72D3A187C00E3FE79 /* gguf.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = gguf.cpp; path = ../../../ggml/src/gguf.cpp; sourceTree = "<group>"; };
-		433188B92D3A18A400E3FE79 /* gguf.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = gguf.h; path = ../../../ggml/include/gguf.h; sourceTree = "<group>"; };
-		437B63E02D36280C002A49EC /* ggml-cpu-traits.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-traits.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-traits.h"; sourceTree = "<group>"; };
-		437B63E12D36280C002A49EC /* ggml-cpu-traits.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu-traits.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-traits.cpp"; sourceTree = "<group>"; };
 		7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
 		7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
 		7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
@ -97,6 +67,7 @@
 		7FE342492A0C3FA20015A058 /* whisper-encoder-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder-impl.h"; sourceTree = "<group>"; };
 		7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-decoder-impl.m"; sourceTree = "<group>"; };
 		7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = "ggml-base.en-encoder.mlmodelc"; path = "../../../models/ggml-base.en-encoder.mlmodelc"; sourceTree = "<group>"; };
+		DDE3609E2D87EA8C004EA223 /* whisper.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = whisper.xcframework; path = "../../build-apple/whisper.xcframework"; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@ -104,6 +75,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				DDE3609F2D87EA8C004EA223 /* whisper.xcframework in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@ -114,6 +86,7 @@
 			isa = PBXGroup;
 			children = (
 				18627C7829052BDF00BD2A04 /* whisper.objc */,
+				DDE3609D2D87EA8C004EA223 /* Frameworks */,
 				18627C7729052BDF00BD2A04 /* Products */,
 			);
 			sourceTree = "<group>";
@ -129,38 +102,9 @@
 		18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 			isa = PBXGroup;
 			children = (
-				18B07DCB2D70411100B3B87C /* ggml-cpp.h */,
-				433188B92D3A18A400E3FE79 /* gguf.h */,
-				433188B72D3A187C00E3FE79 /* gguf.cpp */,
-				18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
-				18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
-				18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */,
-				18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
-				437B63E02D36280C002A49EC /* ggml-cpu-traits.h */,
-				437B63E12D36280C002A49EC /* ggml-cpu-traits.cpp */,
-				18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
-				18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
-				18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */,
-				18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */,
-				18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */,
-				18E864AA2CE73C580094B8B3 /* ggml-cpu.h */,
-				18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */,
-				18A275FF2C2A9563001C8D37 /* ggml-common.h */,
-				18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
-				18ABE1572AF556340044A204 /* ggml-backend.cpp */,
-				18ABE1552AF556340044A204 /* ggml-backend.h */,
-				18ABE1582AF556340044A204 /* ggml-impl.h */,
-				18ABE1592AF556340044A204 /* ggml-quants.c */,
-				18ABE1542AF556340044A204 /* ggml-quants.h */,
-				184447182AB211A2007D6BFE /* ggml-alloc.c */,
-				184447192AB211A2007D6BFE /* ggml-alloc.h */,
 				7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */,
 				7FE342442A0C3FA20015A058 /* coreml */,
 				18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */,
-				18627C9729052C6600BD2A04 /* ggml.h */,
-				18627C9529052C5800BD2A04 /* ggml.c */,
-				18627C9329052C4900BD2A04 /* whisper.cpp */,
-				18627C9229052C2B00BD2A04 /* whisper.h */,
 				18627C7929052BDF00BD2A04 /* AppDelegate.h */,
 				18627C7A29052BDF00BD2A04 /* AppDelegate.m */,
 				18627C7C29052BDF00BD2A04 /* SceneDelegate.h */,
@ -190,6 +134,14 @@
 			path = ../../../src/coreml;
 			sourceTree = "<group>";
 		};
+		DDE3609D2D87EA8C004EA223 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				DDE3609E2D87EA8C004EA223 /* whisper.xcframework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */

 /* Begin PBXNativeTarget section */
@ -201,6 +153,7 @@
 				18627C7329052BDF00BD2A04 /* Frameworks */,
 				18627C7429052BDF00BD2A04 /* Resources */,
 				184447202AB21B25007D6BFE /* Copy Files */,
+				DDE360A12D87EA8C004EA223 /* Embed Frameworks */,
 			);
 			buildRules = (
 			);
@ -264,24 +217,10 @@
 			buildActionMask = 2147483647;
 			files = (
 				18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
-				18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
 				7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
-				18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
-				437B63E22D36280C002A49EC /* ggml-cpu-traits.cpp in Sources */,
-				18627C9629052C5800BD2A04 /* ggml.c in Sources */,
 				18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
-				7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
-				18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
-				18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
-				1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
-				18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.cpp in Sources */,
-				18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
-				18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
-				18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
 				18627C8C29052BE000BD2A04 /* main.m in Sources */,
 				18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
-				433188B82D3A187C00E3FE79 /* gguf.cpp in Sources */,
-				18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */,
 				7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@ -359,7 +298,7 @@
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = ../../../ggml/src/;
+				HEADER_SEARCH_PATHS = "";
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
 				MTL_FAST_MATH = YES;
@ -413,7 +352,7 @@
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = ../../../ggml/src/;
+				HEADER_SEARCH_PATHS = "";
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
@ -437,7 +376,7 @@
 				DEVELOPMENT_TEAM = P8JZH34X63;
 				GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
 				GENERATE_INFOPLIST_FILE = YES;
-				HEADER_SEARCH_PATHS = ../../../ggml/src/;
+				HEADER_SEARCH_PATHS = "";
 				INFOPLIST_FILE = whisper.objc/Info.plist;
 				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
 				INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
@ -450,10 +389,12 @@
 				);
 				MARKETING_VERSION = 1.0;
 				MTL_HEADER_SEARCH_PATHS = "";
+				OTHER_CFLAGS = "-DGGML_USE_CPU=ON";
 				PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
 				TARGETED_DEVICE_FAMILY = "1,2";
+				WARNING_CFLAGS = "-Wno-quoted-include-in-framework-header";
 			};
 			name = Debug;
 		};
@ -468,7 +409,7 @@
 				DEVELOPMENT_TEAM = P8JZH34X63;
 				GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
 				GENERATE_INFOPLIST_FILE = YES;
-				HEADER_SEARCH_PATHS = ../../../ggml/src/;
+				HEADER_SEARCH_PATHS = "";
 				INFOPLIST_FILE = whisper.objc/Info.plist;
 				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
 				INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
@ -481,10 +422,12 @@
 				);
 				MARKETING_VERSION = 1.0;
 				MTL_HEADER_SEARCH_PATHS = "";
+				OTHER_CFLAGS = "-DGGML_USE_CPU=ON";
 				PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
 				TARGETED_DEVICE_FAMILY = "1,2";
+				WARNING_CFLAGS = "-Wno-quoted-include-in-framework-header";
 			};
 			name = Release;
 		};
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -6,8 +6,8 @@
 //

 #import "ViewController.h"
+#import <whisper/whisper.h>

-#import "whisper.h"

 #define NUM_BYTES_PER_BUFFER 16*1024

@ -83,6 +83,19 @@ void AudioInputCallback(void * inUserData,
        stateInp.n_samples = 0;
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
+        // Set up audio session
+        NSError *error = nil;
+
+        [[AVAudioSession sharedInstance] setCategory:AVAudioSessionCategoryRecord error:&error];
+        if (error) {
+            NSLog(@"Error setting audio session category: %@", error);
+        }
+
+        [[AVAudioSession sharedInstance] setActive:YES error:&error];
+        if (error) {
+            NSLog(@"Error activating audio session: %@", error);
+        }
+
    }

    stateInp.isTranscribing = false;
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -33,6 +33,21 @@ sudo xcode-select -switch /Applications/Xcode.app/Contents/Developer

 **Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.

+### Core ML support
+1. Follow all the steps in the `Usage` section, including adding the ggml model file.  
+The ggml model file is required as the Core ML model is only used for the encoder. The
+decoder which is in the ggml model is still required.
+2. Follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the
+model.
+3. Add the Core ML model (`models/ggml-base.en-encoder.mlmodelc/`) to `whisper.swiftui.demo/Resources/models` **via Xcode**.
+
+When the example starts running you should now see that it is using the Core ML model:
+```console
+whisper_init_state: loading Core ML model from '/Library/Developer/CoreSimulator/Devices/25E8C27D-0253-4281-AF17-C3F2A4D1D8F4/data/Containers/Bundle/Application/3ADA7D59-7B9C-43B4-A7E1-A87183FC546A/whisper.swiftui.app/models/ggml-base.en-encoder.mlmodelc'
+whisper_init_state: first run on a device may take a while ...
+whisper_init_state: Core ML model loaded
+```
+
 [^1]: I recommend the tiny, base or small models for running on an iOS device.

 [^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -32,8 +32,9 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE_STRICT=0 \
-    -s INITIAL_MEMORY=2000MB \
-    -s TOTAL_MEMORY=2000MB \
+    -s INITIAL_MEMORY=512MB \
+    -s MAXIMUM_MEMORY=2000MB \
+    -s ALLOW_MEMORY_GROWTH=1 \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/whisper.wasm/README.md
+++ b/examples/whisper.wasm/README.md
@ -22,7 +22,7 @@ audio is limited to 120 seconds.

 ## Live demo

-Link: https://whisper.ggerganov.com
+Link: https://ggerganov.github.io/whisper.cpp/

 ![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png)

@ -35,7 +35,17 @@ cd whisper.cpp
 mkdir build-em && cd build-em
 emcmake cmake ..
 make -j
+```
+The example can then be started by running a local HTTP server:
+```console
+python3 examples/server.py
+```
+And then opening a browser to the following URL:
+http://localhost:8000/whisper.wasm

+To run the example in a different server, you need to copy the following files
+to the server's HTTP path:
+```
 # copy the produced page to your HTTP path
 cp bin/whisper.wasm/*    /path/to/html/
 cp bin/libmain.worker.js /path/to/html/
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -24,6 +24,8 @@
                overflow-x: scroll;
            }
        </style>
+        <script src="coi-serviceworker.js"></script>
+        <link rel="icon" href="data:,">
    </head>
    <body>
        <div id="main-container">
@ -47,11 +49,9 @@
                </ul>

            <b>More examples:</b>
-                <a href="https://whisper.ggerganov.com/">main</a> |
-                <a href="https://whisper.ggerganov.com/bench">bench</a> |
-                <a href="https://whisper.ggerganov.com/stream">stream</a> |
-                <a href="https://whisper.ggerganov.com/command">command</a> |
-                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+                <a href="bench.wasm/">bench</a> |
+                <a href="stream.wasm">stream</a> |
+                <a href="command.wasm/">command</a> |

            <hr>

@ -614,7 +614,7 @@
            var nthreads = 8;

            function changeThreads(value) {
-                nthreads = value;
+                nthreads = parseInt(value, 10);
                document.getElementById('threads-value').innerHTML = nthreads;
            }

--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -100,6 +100,10 @@ else()
    set(INS_ENB ON)
 endif()

+message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB             : ${INS_ENB}")
+
 option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
@ -123,10 +127,12 @@ endif()
 option(GGML_LASX             "ggml: enable lasx"             ON)
 option(GGML_LSX              "ggml: enable lsx"              ON)
 option(GGML_RVV              "ggml: enable rvv"              ON)
+option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
 option(GGML_VXE              "ggml: enable vxe"              ON)

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")


 if (WIN32)
@ -186,6 +192,7 @@ option(GGML_OPENMP                          "ggml: use OpenMP"
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
+option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@ -195,6 +202,8 @@ option(GGML_OPENCL                          "ggml: use OpenCL"
 option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
 option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
 option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
+set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
+                                            "gmml: OpenCL API version to target")

 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
--- a/ggml/cmake/common.cmake
+++ b/ggml/cmake/common.cmake
@ -0,0 +1,26 @@
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@ -5,7 +5,7 @@

 set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
 set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
-set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")

 find_package(Threads REQUIRED)

--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@ -17,7 +17,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c

 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+                                                    const char * cache_dir,
+                                                    size_t free_mem, size_t total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -454,6 +454,7 @@ extern "C" {
        GGML_OP_RMS_NORM,
        GGML_OP_RMS_NORM_BACK,
        GGML_OP_GROUP_NORM,
+        GGML_OP_L2_NORM,

        GGML_OP_MUL_MAT,
        GGML_OP_MUL_MAT_ID,
@ -502,6 +503,7 @@ extern "C" {
        GGML_OP_ADD_REL_POS,
        GGML_OP_RWKV_WKV6,
        GGML_OP_GATED_LINEAR_ATTN,
+        GGML_OP_RWKV_WKV7,

        GGML_OP_UNARY,

@ -1095,6 +1097,18 @@ extern "C" {
            int                   n_groups,
            float                 eps);

+    // l2 normalize along rows
+    // used in rwkv v7
+    GGML_API struct ggml_tensor * ggml_l2_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
    // a - x
    // b - dy
    GGML_API struct ggml_tensor * ggml_rms_norm_back(
@ -1777,11 +1791,11 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [n_embd, n_batch,     n_head,    1]
-    // k:    [n_embd, n_kv,        n_head_kv, 1]
-    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    1]
+    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
+    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@ -1890,6 +1904,16 @@ extern "C" {
            struct ggml_tensor  * state,
            float scale);

+    GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * r,
+            struct ggml_tensor  * w,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * state);
+
    // custom operators

    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -1,4 +1,5 @@
 include(CheckCXXCompilerFlag)
+include("../cmake/common.cmake")

 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

@ -24,33 +25,6 @@ if (NOT MSVC)
    endif()
 endif()

-function(ggml_get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
 if (GGML_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        list(APPEND C_FLAGS   -Werror)
@ -91,7 +65,7 @@ if (GGML_LTO)
    endif()
 endif()

-if (GGML_CCACHE)
+if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
    find_program(GGML_CCACHE_FOUND ccache)
    find_program(GGML_SCCACHE_FOUND sccache)

@ -102,7 +76,11 @@ if (GGML_CCACHE)
            set(GGML_CCACHE_VARIANT sccache)
        endif()
        # TODO: should not be set globally
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
+        else ()
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        endif ()
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
    else()
@ -351,6 +329,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
    target_link_libraries(ggml-base PRIVATE dl)
 endif()

+if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
+    target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
+endif()
+
 if (BUILD_SHARED_LIBS)
    foreach (target ggml-base ggml)
        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@ -76,7 +76,14 @@ namespace fs = std::filesystem;
 static std::string path_str(const fs::path & path) {
    std::string u8path;
    try {
+#if defined(__cpp_lib_char8_t)
+        // C++20 and later: u8string() returns std::u8string
+        std::u8string u8str = path.u8string();
+        u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
+#else
+        // C++17: u8string() returns std::string
        u8path = path.u8string();
+#endif
    } catch (...) {
    }
    return u8path;
@ -490,7 +497,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        search_paths.push_back(get_executable_path());
        search_paths.push_back(fs::current_path());
    } else {
-        search_paths.push_back(user_search_path);
+        search_paths.push_back(fs::u8path(user_search_path));
    }

    int best_score = 0;
@ -504,9 +511,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
        for (const auto & entry : dir_it) {
            if (entry.is_regular_file()) {
-                auto filename = entry.path().filename().native();
-                auto ext = entry.path().extension().native();
-                if (filename.find(file_prefix) == 0 && ext == file_extension) {
+                auto filename = entry.path().filename();
+                auto ext = entry.path().extension();
+                if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
                    dl_handle_ptr handle { dl_load_library(entry) };
                    if (!handle && !silent) {
                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
@ -537,7 +544,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        // try to load the base backend
        for (const auto & search_path : search_paths) {
            fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
-            fs::path path = search_path.native() + filename.native();
+            fs::path path = search_path / filename;
            if (fs::exists(path)) {
                return get_reg().load_backend(path, silent);
            }
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
        ${CANN_INSTALL_DIR}/acllib/include
    )

-    add_subdirectory(kernels)
    list(APPEND CANN_LIBRARIES
        ascendcl
        nnopbase
        opapi
        acl_op_compiler
-        ascendc_kernels
    )

    file(GLOB GGML_SOURCES_CANN "*.cpp")
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@ -30,6 +30,7 @@
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_embedding.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
@ -58,7 +59,6 @@
 #include <vector>

 #include "ggml-impl.h"
-#include "kernels/ascendc_kernels.h"

 #define GGML_COMMON_DECL_C

@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
    ACL_CHECK(aclDestroyIntArray(repeats));
 }

+/**
+ * @brief Casts the elements of a tensor to a specified data type using the CANN backend.
+ *
+ * @details This function performs a type conversion on the elements of the input tensor `acl_src`
+ *          and stores the results in the destination tensor `acl_dst`. The conversion type is
+ *          determined based on the `dst` tensor's data type.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be cast.
+ * @param acl_dst The destination tensor that will store the casted elements.
+ * @param dst The ggml tensor specifying the target data type.
+ */
+static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                    aclTensor* acl_dst, ggml_tensor* dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
+                                        ggml_cann_type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    GGML_ASSERT(ggml_can_repeat(src, dst));
@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 }

 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+    ggml_tensor* src0 = dst->src[0];

-    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-    ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src->extra = src_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-
-    if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
-        ggml_are_same_shape(src, dst)) {
-        cann_copy(ctx, acl_src, acl_dst);
-        ACL_CHECK(aclDestroyTensor(acl_src));
-        ACL_CHECK(aclDestroyTensor(acl_dst));
-        return;
-    }
-    // TODO: simplify
-    if (src->type == GGML_TYPE_F16) {
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f16_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-
-                    aclrtlaunch_ascendc_dup_by_rows_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else if (src->type == GGML_TYPE_F32) {
-        // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
-        //          && nb0 == type_size)
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f32_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            } else {
-                // TODO: dst not contiguous
-                GGML_ABORT("fatal error");
-            }
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else {
-        if (ggml_are_same_shape(src, dst)) {
+    if (ggml_are_same_shape(src0, dst)) {
+        if (dst->type == src0->type) {
            cann_copy(ctx, acl_src, acl_dst);
-            ACL_CHECK(aclDestroyTensor(acl_src));
-            ACL_CHECK(aclDestroyTensor(acl_dst));
-            return;
+        } else {
+            aclnn_cast(ctx, acl_src, acl_dst, dst);
+        }
+    } else {
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+            if (dst->type == src0->type) {
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src0->data, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                return;
+            } else {
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(),
+                    ggml_nelements(dst) * ggml_type_size(dst->type));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = ggml_type_size(dst->type);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                    GGML_MAX_DIMS);
+
+                aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src_trans_buffer, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+                return;
+            }
+        } else if (ggml_is_contiguous(dst)) {
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = ggml_type_size(dst->type);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+            }
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                GGML_MAX_DIMS);
+
+            aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+
+            size_t cpy_size = ggml_nbytes(dst);
+            ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
+                                       cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                       ctx.stream()));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+            return;
+        } else {
+            GGML_ABORT("Unsupport dst is not tontiguous.");
        }
-        GGML_ABORT("fatal error");
    }
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
 }

 #ifdef __cplusplus
@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
 }

-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+/**
+ * @brief Performs embedding operation on a 4D tensor using the CANN backend.
+ *
+ * This function extracts slices from the source tensor (`src_buffer`),
+ * index tensor (`index`), and destination tensor (`dst`), and performs an
+ * embedding operation on them. The embedding operation is applied by iterating
+ * over the last two dimensions of the source tensor, creating the necessary
+ * tensors for the source, index, and output, and executing the embedding operation.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer holding the data for the source tensor.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param index The index tensor used in the embedding operation.
+ * @param dst The destination tensor where the result will be stored.
+ */
+static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
+                            int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
+                            ggml_tensor* dst) {
+    for (int64_t i = 0; i < src_ne[3]; i++) {
+        for (int64_t j = 0; j < src_ne[2]; j++) {
+            // src
+            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
+            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
+            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
+                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_src_ne, acl_src_nb, 2);

-    ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src0->extra = src0_extra_allocator.get();
-    src1->extra = src1_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
+            // index
+            int64_t acl_index_ne[1] = {index->ne[0]};
+            size_t acl_index_nb[1] = {index->nb[0]};
+            aclTensor* acl_index = ggml_cann_create_tensor(
+                (char*)index->data + i * index->nb[2] + j * index->nb[1],
+                ggml_cann_type_mapping(index->type), ggml_element_size(index),
+                acl_index_ne, acl_index_nb, 1);
+
+            // out
+            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
+            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
+            aclTensor* acl_out = ggml_cann_create_tensor(
+                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_out_ne, acl_out_nb, 2);
+
+            uint64_t workspaceSize = 0;
+            aclOpExecutor* executor;
+            void* workspaceAddr = nullptr;
+
+            ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
+                acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
+
+            if (workspaceSize > 0) {
+                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
+                                                         workspaceSize);
+                workspaceAddr = workspace_allocator.get();
+            }
+
+            ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+            ACL_CHECK(aclDestroyTensor(acl_src_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_index));
+            ACL_CHECK(aclDestroyTensor(acl_out));
+        }
+    }
+}
+
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];  // src
+    ggml_tensor* src1 = dst->src[1];  // index

    switch (src0->type) {
        case GGML_TYPE_F32: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f32 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
-                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
-            }
-#endif
-            aclrtlaunch_ascendc_get_row_f32(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
+                                   dst);
            break;
        }
        case GGML_TYPE_F16: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f16 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len =
-                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
-                    ggml_type_size(
-                        GGML_TYPE_F32);  // out is also f32, even input is f16
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
-#endif
-            aclrtlaunch_ascendc_get_row_f16(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
+                src0->ne, src_trans_nb, GGML_MAX_DIMS);
+            aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
+            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
+                                   src_trans_nb, src1, dst);
+            ACL_CHECK(aclDestroyTensor(acl_src0));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
            break;
        }
-        case GGML_TYPE_Q4_0:
-            aclrtlaunch_ascendc_get_row_q4_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
-            break;
-        case GGML_TYPE_Q8_0:
-            aclrtlaunch_ascendc_get_row_q8_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+        case GGML_TYPE_Q8_0: {
+            // add 1 dim for bcast mul.
+            size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
+                dequant_nb[GGML_MAX_DIMS + 1];
+            int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
+                *dequant_ne;
+            int64_t scale_offset = 0;
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            weight_ne[0] = QK8_0;
+            weight_ne[1] = src0->ne[0] / QK8_0;
+            weight_nb[0] = sizeof(int8_t);
+            weight_nb[1] = weight_nb[0] * weight_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                weight_ne[i] = src0->ne[i - 1];
+                weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,1]
+            scale_ne[0] = 1;
+            scale_ne[1] = src0->ne[0] / QK8_0;
+            scale_nb[0] = sizeof(uint16_t);
+            scale_nb[1] = scale_nb[0] * scale_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                scale_ne[i] = src0->ne[i - 1];
+                scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            dequant_ne = weight_ne;
+            dequant_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+            }
+
+            scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+            ggml_cann_pool_alloc dequant_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+
+            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
+                GGML_MAX_DIMS + 1);
+            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
+                GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+            aclTensor* dequant_tensor = ggml_cann_create_tensor(
+                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
+                dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+
+            aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
+            dequant_nb[0] = sizeof(float_t);
+            dequant_ne = src0->ne;
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+            }
+
+            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
+                                   dequant_ne, dequant_nb, src1, dst);
+
+            ACL_CHECK(aclDestroyTensor(dequant_tensor));
            break;
+        }
        default:
-            GGML_ABORT("fatal error");
+            GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
            break;
    }
 }
@ -2790,11 +2805,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
                output_ne_offset);
+            int64_t antiquantGroupSize = 0;
+            if (src0->ne[0] > QK8_0) {
+                antiquantGroupSize = QK8_0;
+            }

            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
+                nullptr, nullptr, nullptr, antiquantGroupSize,
+                acl_output_tensor, &workspaceSize, &executor));
            if (workspaceAddr == nullptr) {
                workspaceAddr = workspace_allocator.alloc(workspaceSize);
            }
@ -2833,7 +2852,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
                    acl_output_tensor, &workspaceSize, &executor));
                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                    workspaceAddr, workspaceSize, executor, ctx.stream()));
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
                case GGML_TYPE_Q8_0:
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
-                    if (op->src[0]->ne[0] <= QK8_0) {
-                        return false;
-                    }
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
                case GGML_TYPE_Q4_0:
@ -1709,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            switch (op->src[0]->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q8_0:
                    return true;
                default:
@ -1717,16 +1711,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        } break;
        case GGML_OP_CPY: {
-            switch (op->type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q4_0:
-                    return true;
-                default:
-                    return false;
+            ggml_tensor *src = op->src[0];
+            if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
+                  (src->type != GGML_TYPE_F32 &&
+                    src->type != GGML_TYPE_F16)) {
+                // only support F32 and F16.
+                return false;
            }
-        }
+
+            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
+                // unsupport dst is not contiguous.
+                return false;
+            }
+
+            return true;
+        } break;
        case GGML_OP_CONT: {
            // TODO: support GGML_TYPE_BF16
            switch (op->src[0]->type) {
@ -1767,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
            return true;
        }
+        case GGML_OP_DUP:
        case GGML_OP_IM2COL:
        case GGML_OP_CONCAT:
-        case GGML_OP_DUP:
        case GGML_OP_REPEAT:
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
    ggml_half d;           // delta
@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b

 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0

 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block

 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half s; // d * sum(qs[i])
@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@ -23,6 +23,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        ggml-cpu/amx/mmq.cpp
        ggml-cpu/amx/mmq.h
        ggml-cpu/ggml-cpu-impl.h
+        ggml-cpu/common.h
+        ggml-cpu/binary-ops.h
+        ggml-cpu/binary-ops.cpp
+        ggml-cpu/unary-ops.h
+        ggml-cpu/unary-ops.cpp
        )

    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
@ -287,17 +292,31 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                endif()
            endif()
        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
        message(STATUS "PowerPC detected")
-        execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
-        if (${POWER_M} MATCHES "POWER10")
-            list(APPEND ARCH_FLAGS -mcpu=power10)
-        elseif (${POWER_M} MATCHES "POWER9")
-            list(APPEND ARCH_FLAGS -mcpu=power9)
-        elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-            list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+        if (GGML_NATIVE)
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+                file(READ "/proc/cpuinfo" POWER10_M)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
+            endif()
+
+            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
+
+            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+            elseif (EXTRACTED_NUMBER EQUAL 9)
+                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+            else()
+                list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
+            endif()
        else()
-            list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
+            if (GGML_CPU_POWERPC_CPUTYPE)
+                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
+            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
        message(STATUS "loongarch64 detected")
@ -312,7 +331,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
        message(STATUS "RISC-V detected")
        if (GGML_RVV)
-            list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            if (GGML_RV_ZFH)
+                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
+            else()
+                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
        message(STATUS "s390x detected")
@ -351,9 +374,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.3.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "060bd2dc64642b091f461cc8dd7426d9")
+        set(KLEIDIAI_ARCHIVE_MD5  "ea22e1aefb800e9bc8c74d91633cc58e")

        if (POLICY CMP0135)
            cmake_policy(SET CMP0135 NEW)
--- a/ggml/src/ggml-cpu/binary-ops.cpp
+++ b/ggml/src/ggml-cpu/binary-ops.cpp
@ -0,0 +1,158 @@
+#include "binary-ops.h"
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+
+using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
+#endif
+
+static inline float op_add(float a, float b) {
+    return a + b;
+}
+
+static inline float op_sub(float a, float b) {
+    return a - b;
+}
+
+static inline float op_mul(float a, float b) {
+    return a * b;
+}
+
+static inline float op_div(float a, float b) {
+    return a / b;
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        int i10 = i % ne10;
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+#ifdef GGML_USE_ACCELERATE
+    vDSP_fn_t vDSP_op = nullptr;
+    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (op == op_add) {
+            vDSP_op = vDSP_vadd;
+        } else if (op == op_sub) {
+            vDSP_op = vDSP_vsub;
+        } else if (op == op_mul) {
+            vDSP_op = vDSP_vmul;
+        } else if (op == op_div) {
+            vDSP_op = vDSP_vdiv;
+        }
+    }
+#endif
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const int64_t i13 = i03 % ne13;
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t nr0 = ne00 / ne10;
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
+                    if (vDSP_op != nullptr) {
+                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+                        continue;
+                    }
+                }
+#endif
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        } else {
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
+        }
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)>
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_binary_op<op, float, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_BF16) {
+        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F16) {
+        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+    } else {
+        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_add>(params, dst);
+}
+
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_sub>(params, dst);
+}
+
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_mul>(params, dst);
+}
+
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_div>(params, dst);
+}
--- a/ggml/src/ggml-cpu/binary-ops.h
+++ b/ggml/src/ggml-cpu/binary-ops.h
@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/common.h
+++ b/ggml/src/ggml-cpu/common.h
@ -0,0 +1,72 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+static inline ggml_fp16_t f32_to_f16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
+static inline float f16_to_f32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+static inline ggml_bf16_t f32_to_bf16(float x) {
+    return GGML_FP32_TO_BF16(x);
+}
+
+static inline float bf16_to_f32(ggml_bf16_t x) {
+    return GGML_BF16_TO_FP32(x);
+}
+
+static inline float f32_to_f32(float x) {
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+template <class T>
+struct type_conversion_table;
+
+template <>
+struct type_conversion_table<ggml_fp16_t> {
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+
+template <>
+struct type_conversion_table<float> {
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+template <>
+struct type_conversion_table<ggml_bf16_t> {
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    const int64_t nr  = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    return {ir0, ir1};
+}
+
+#endif
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
        },
        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
+            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
-            /* .require_aligned_m_idx = */ true,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
--- a/ggml/src/ggml-cpu/kleidiai/kernels.h
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.h
@ -40,7 +40,6 @@ struct lhs_packing_info {
    size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
    void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
                      size_t lhs_stride, void* lhs_packed);
-    bool require_aligned_m_idx;
 };

 struct rhs_packing_info {
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
            size_t sr = kernel->get_sr();

            // Calculate number of columns to be processed per thread
-            const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
-            const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
+            const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
            const size_t m_start = ith * num_m_per_thread;
            size_t m_to_process = num_m_per_thread;
            if ((m_start + m_to_process) > m) {
@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
            if(m_start < m) {
                // Transform LHS
                const size_t src_stride        = src1->nb[1];
-                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
+                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
                const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
                void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);

-                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
+                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
            }

            ggml_barrier(params->threadpool);
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@ -55,6 +55,7 @@

 #include <atomic>
 #include <array>
+#include <type_traits>

 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
@ -1092,13 +1093,403 @@ class tinyBLAS_Q0_PPC {
       }
    }

-    template<typename VA, typename VB>
-    void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+    template<typename VA, typename VB, int size>
+    void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, VA* vec, std::array<int, size>& comparray) {
        int64_t i, j;
        TA *aoffset = NULL;
        VA *vecOffset = NULL;
        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+        VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+        VB t1, t2, t3, t4, t5, t6, t7, t8;
+        const vector signed char lowMask = vec_splats((signed char)0xF);
+        const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+        const vector signed char v8 = vec_splats((signed char)0x8);
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+        vector signed int vsum = {0};
+        vector signed int vsum2 = {0};
+
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+
+                i = (cols >> 2);
+                if (i > 0) {
+                    do {
+                        c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
+                        c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
+                        c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
+                        c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
+                        c5[1] = reinterpret_cast<VB>(vec_xl(0, aoffset5->qs));
+                        c6[1] = reinterpret_cast<VB>(vec_xl(0, aoffset6->qs));
+                        c7[1] = reinterpret_cast<VB>(vec_xl(0, aoffset7->qs));
+                        c8[1] = reinterpret_cast<VB>(vec_xl(0, aoffset8->qs));
+
+                        c1[0] = vec_and(c1[1], lowMask);
+                        c1[1] = vec_sr(c1[1], v4);
+                        c1[0] = vec_sub(c1[0], v8);
+                        c1[1] = vec_sub(c1[1], v8);
+                        vsum = vec_sum4s(c1[0], vsum);
+                        vsum2 = vec_sum4s(c1[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c2[0] = vec_and(c2[1], lowMask);
+                        c2[1] = vec_sr(c2[1], v4);
+                        c2[0] = vec_sub(c2[0], v8);
+                        c2[1] = vec_sub(c2[1], v8);
+                        vsum = vec_sum4s(c2[0], vsum);
+                        vsum2 = vec_sum4s(c2[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c3[0] = vec_and(c3[1], lowMask);
+                        c3[1] = vec_sr(c3[1], v4);
+                        c3[0] = vec_sub(c3[0], v8);
+                        c3[1] = vec_sub(c3[1], v8);
+                        vsum = vec_sum4s(c3[0], vsum);
+                        vsum2 = vec_sum4s(c3[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c4[0] = vec_and(c4[1], lowMask);
+                        c4[1] = vec_sr(c4[1], v4);
+                        c4[0] = vec_sub(c4[0], v8);
+                        c4[1] = vec_sub(c4[1], v8);
+                        vsum = vec_sum4s(c4[0], vsum);
+                        vsum2 = vec_sum4s(c4[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c5[0] = vec_and(c5[1], lowMask);
+                        c5[1] = vec_sr(c5[1], v4);
+                        c5[0] = vec_sub(c5[0], v8);
+                        c5[1] = vec_sub(c5[1], v8);
+                        vsum = vec_sum4s(c5[0], vsum);
+                        vsum2 = vec_sum4s(c5[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[4] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c6[0] = vec_and(c6[1], lowMask);
+                        c6[1] = vec_sr(c6[1], v4);
+                        c6[0] = vec_sub(c6[0], v8);
+                        c6[1] = vec_sub(c6[1], v8);
+                        vsum = vec_sum4s(c6[0], vsum);
+                        vsum2 = vec_sum4s(c6[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[5] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c7[0] = vec_and(c7[1], lowMask);
+                        c7[1] = vec_sr(c7[1], v4);
+                        c7[0] = vec_sub(c7[0], v8);
+                        c7[1] = vec_sub(c7[1], v8);
+                        vsum = vec_sum4s(c7[0], vsum);
+                        vsum2 = vec_sum4s(c7[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[6] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c8[0] = vec_and(c8[1], lowMask);
+                        c8[1] = vec_sr(c8[1], v4);
+                        c8[0] = vec_sub(c8[0], v8);
+                        c8[1] = vec_sub(c8[1], v8);
+                        vsum = vec_sum4s(c8[0], vsum);
+                        vsum2 = vec_sum4s(c8[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[7] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        t1 = vec_perm(c1[0], c2[0], swiz1);
+                        t2 = vec_perm(c1[0], c2[0], swiz2);
+                        t3 = vec_perm(c3[0], c4[0], swiz1);
+                        t4 = vec_perm(c3[0], c4[0], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset);
+                        vec_xst(t6, 0, vecOffset+16);
+                        vec_xst(t7, 0, vecOffset+32);
+                        vec_xst(t8, 0, vecOffset+48);
+
+                        t1 = vec_perm(c1[1], c2[1], swiz1);
+                        t2 = vec_perm(c1[1], c2[1], swiz2);
+                        t3 = vec_perm(c3[1], c4[1], swiz1);
+                        t4 = vec_perm(c3[1], c4[1], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset+64);
+                        vec_xst(t6, 0, vecOffset+80);
+                        vec_xst(t7, 0, vecOffset+96);
+                        vec_xst(t8, 0, vecOffset+112);
+
+                        t1 = vec_perm(c5[0], c6[0], swiz1);
+                        t2 = vec_perm(c5[0], c6[0], swiz2);
+                        t3 = vec_perm(c7[0], c8[0], swiz1);
+                        t4 = vec_perm(c7[0], c8[0], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset+128);
+                        vec_xst(t6, 0, vecOffset+144);
+                        vec_xst(t7, 0, vecOffset+160);
+                        vec_xst(t8, 0, vecOffset+176);
+
+                        t1 = vec_perm(c5[1], c6[1], swiz1);
+                        t2 = vec_perm(c5[1], c6[1], swiz2);
+                        t3 = vec_perm(c7[1], c8[1], swiz1);
+                        t4 = vec_perm(c7[1], c8[1], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset+192);
+                        vec_xst(t6, 0, vecOffset+208);
+                        vec_xst(t7, 0, vecOffset+224);
+                        vec_xst(t8, 0, vecOffset+240);
+
+                        aoffset1 += lda;
+                        aoffset2 += lda;
+                        aoffset3 += lda;
+                        aoffset4 += lda;
+                        aoffset5 += lda;
+                        aoffset6 += lda;
+                        aoffset7 += lda;
+                        aoffset8 += lda;
+                        vecOffset += 256;
+                        i--;
+                    } while (i > 0);
+                }
+                j--;
+            } while (j > 0);
+        }
+
+        if (rows & 4) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+
+            i = (cols >> 2);
+            if (i > 0) {
+                do {
+                    c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
+                    c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
+                    c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
+                    c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
+
+                    c1[0] = vec_and(c1[1], lowMask);
+                    c1[1] = vec_sr(c1[1], v4);
+                    c1[0] = vec_sub(c1[0], v8);
+                    c1[1] = vec_sub(c1[1], v8);
+                    vsum = vec_sum4s(c1[0], vsum);
+                    vsum2 = vec_sum4s(c1[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c2[0] = vec_and(c2[1], lowMask);
+                    c2[1] = vec_sr(c2[1], v4);
+                    c2[0] = vec_sub(c2[0], v8);
+                    c2[1] = vec_sub(c2[1], v8);
+                    vsum = vec_sum4s(c2[0], vsum);
+                    vsum2 = vec_sum4s(c2[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c3[0] = vec_and(c3[1], lowMask);
+                    c3[1] = vec_sr(c3[1], v4);
+                    c3[0] = vec_sub(c3[0], v8);
+                    c3[1] = vec_sub(c3[1], v8);
+                    vsum = vec_sum4s(c3[0], vsum);
+                    vsum2 = vec_sum4s(c3[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c4[0] = vec_and(c4[1], lowMask);
+                    c4[1] = vec_sr(c4[1], v4);
+                    c4[0] = vec_sub(c4[0], v8);
+                    c4[1] = vec_sub(c4[1], v8);
+                    vsum = vec_sum4s(c4[0], vsum);
+                    vsum2 = vec_sum4s(c4[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats( 0);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    aoffset4 += lda;
+                    vecOffset += 128;
+                    i--;
+                } while (i > 0);
+            }
+        }
+
+        if (rows & 3) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            i = (cols >> 2);
+            if (i > 0) {
+                do {
+                    switch(rows) {
+                        case 3: c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
+                        case 2: c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
+                        case 1: c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
+                            break;
+                    }
+                    c1[0] = vec_and(c1[1], lowMask);
+                    c1[1] = vec_sr(c1[1], v4);
+                    c1[0] = vec_sub(c1[0], v8);
+                    c1[1] = vec_sub(c1[1], v8);
+                    vsum = vec_sum4s(c1[0], vsum);
+                    vsum2 = vec_sum4s(c1[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c2[0] = vec_and(c2[1], lowMask);
+                    c2[1] = vec_sr(c2[1], v4);
+                    c2[0] = vec_sub(c2[0], v8);
+                    c2[1] = vec_sub(c2[1], v8);
+                    vsum = vec_sum4s(c2[0], vsum);
+                    vsum2 = vec_sum4s(c2[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c3[0] = vec_and(c3[1], lowMask);
+                    c3[1] = vec_sr(c3[1], v4);
+                    c3[0] = vec_sub(c3[0], v8);
+                    c3[1] = vec_sub(c3[1], v8);
+                    vsum = vec_sum4s(c3[0], vsum);
+                    vsum2 = vec_sum4s(c3[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c4[0] = vec_and(c4[1], lowMask);
+                    c4[1] = vec_sr(c4[1], v4);
+                    c4[0] = vec_sub(c4[0], v8);
+                    c4[1] = vec_sub(c4[1], v8);
+                    vsum = vec_sum4s(c4[0], vsum);
+                    vsum2 = vec_sum4s(c4[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    vecOffset += 128;
+                    i--;
+                } while(i > 0);
+            }
+        }
+    }
+
+    template<typename VA, typename VB>
+    void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+        int64_t i, j;
+        TB *aoffset = NULL;
+        VA *vecOffset = NULL;
+        TB *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TB *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
        VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
        VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
@ -1111,24 +1502,24 @@ class tinyBLAS_Q0_PPC {
        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};

-        aoffset = const_cast<TA*>(a);
+        aoffset = const_cast<TB*>(a);
        vecOffset = vec;
        j = (rows >> 3);
        if (j > 0) {
            do {
-            aoffset1 = aoffset;
-            aoffset2 = aoffset1 + lda;
-            aoffset3 = aoffset2 + lda;
-            aoffset4 = aoffset3 + lda;
-            aoffset5 = aoffset4 + lda;
-            aoffset6 = aoffset5 + lda;
-            aoffset7 = aoffset6 + lda;
-            aoffset8 = aoffset7 + lda;
-            aoffset += 8 * lda;
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;

-            i = (cols >> 3);
-            if (i > 0) {
-               do {
+                i = (cols >> 3);
+                if (i > 0) {
+                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
@ -1156,10 +1547,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset);
                    vec_xst(t6, 0, vecOffset+16);
@ -1175,10 +1566,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+64);
                    vec_xst(t6, 0, vecOffset+80);
@ -1194,10 +1585,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+128);
                    vec_xst(t6, 0, vecOffset+144);
@ -1213,10 +1604,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+192);
                    vec_xst(t6, 0, vecOffset+208);
@ -1240,11 +1631,11 @@ class tinyBLAS_Q0_PPC {
    }

    if (rows & 4) {
-            aoffset1 = aoffset;
-            aoffset2 = aoffset1 + lda;
-            aoffset3 = aoffset2 + lda;
-            aoffset4 = aoffset3 + lda;
-            aoffset += 4 * lda;
+        aoffset1 = aoffset;
+        aoffset2 = aoffset1 + lda;
+        aoffset3 = aoffset2 + lda;
+        aoffset4 = aoffset3 + lda;
+        aoffset += 4 * lda;

        i = (cols >> 3);
            if (i > 0) {
@ -1311,7 +1702,7 @@ class tinyBLAS_Q0_PPC {
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            i = (cols >> 3);
-        if (i > 0) {
+            if (i > 0) {
                do {
                    switch(rows) {
                        case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
@ -1527,13 +1918,18 @@ class tinyBLAS_Q0_PPC {
    void KERNEL_4x8(int64_t ii, int64_t jj) {
        vec_t vec_A[8], vec_B[16] = {0};
        acc_t acc_0, acc_1;
-        std::array<int, 4> comparray;
+        std::array<int, 4> comparray {};
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
-            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
+            if (std::is_same_v<TA, block_q4_0>) {
+               packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
+            } else {
+               packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
+            }
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@ -1545,15 +1941,17 @@ class tinyBLAS_Q0_PPC {
                    *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
-            auto aoffset = A+(ii*lda)+l;
-            for (int i = 0; i < 4; i++) {
-                comparray[i] = 0;
-                int ca = 0;
-                const int8_t *at = aoffset->qs;
-                for (int j = 0; j < 32; j++)
-                    ca += (int)*at++;
-                comparray[i] = ca;
-                aoffset += lda;
+            if (!isAblock_q4) {
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < 4; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    auto *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
            }
            compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
@ -1565,13 +1963,18 @@ class tinyBLAS_Q0_PPC {
    void KERNEL_8x4(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[8] = {0};
        acc_t acc_0, acc_1;
-        std::array<int, 8> comparray;
+        std::array<int, 8> comparray {};
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
-            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            if (std::is_same_v<TA, block_q4_0>) {
+               packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
+            } else {
+               packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            }
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@ -1582,15 +1985,17 @@ class tinyBLAS_Q0_PPC {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                }
            }
-            auto aoffset = A+(ii*lda)+l;
-            for (int i = 0; i < 8; i++) {
-                comparray[i] = 0;
-                int ca = 0;
-                const int8_t *at = aoffset->qs;
-                for (int j = 0; j < 32; j++)
-                    ca += (int)*at++;
-                comparray[i] = ca;
-                aoffset += lda;
+            if (!isAblock_q4) {
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < 8; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    auto *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
@ -1602,15 +2007,20 @@ class tinyBLAS_Q0_PPC {
    void KERNEL_8x8(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[16] = {0};
        acc_t acc_0, acc_1, acc_2, acc_3;
-        std::array<int, 8> comparray;
+        std::array<int, 8> comparray {};
        vector float fin_res[16] = {0};
        vector float vs[16] = {0};
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            __builtin_mma_xxsetaccz(&acc_2);
            __builtin_mma_xxsetaccz(&acc_3);
-            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            if (std::is_same_v<TA, block_q4_0>) {
+               packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
+            } else {
+               packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            }
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@ -1624,15 +2034,17 @@ class tinyBLAS_Q0_PPC {
                    *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
-            auto aoffset = A+(ii*lda)+l;
-            for (int i = 0; i < 8; i++) {
-                comparray[i] = 0;
-                int ca = 0;
-                const int8_t *at = aoffset->qs;
-                for (int j = 0; j < 32; j++)
-                    ca += (int)*at++;
-                comparray[i] = ca;
-                aoffset += lda;
+            if (!isAblock_q4) {
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < 8; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    auto *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
@ -1653,16 +2065,17 @@ class tinyBLAS_Q0_PPC {
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
-        vec_t vec_A[8], vec_B[8] = {0};
+        vec_t vec_A[8] = {0}, vec_B[8] = {0};
        vector signed int vec_C[4];
        acc_t acc_0;
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;

        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
-            std::array<int, RM> comparray;
+            std::array<int, 4> comparray{};
            vector float res[4] = {0};
            vector float fin_res[4] = {0};
            vector float vs[4] = {0};
@ -1673,7 +2086,11 @@ class tinyBLAS_Q0_PPC {
                __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_mma_xxsetaccz(&acc_0);
-                packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
+                if (isAblock_q4) {
+                   packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
+                } else {
+                   packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
+                }
                packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
                for(int x = 0; x < 8; x+=4) {
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@ -1687,17 +2104,18 @@ class tinyBLAS_Q0_PPC {
                    }
                }
                __builtin_mma_disassemble_acc(vec_C, &acc_0);
-                auto aoffset = A+(ii*lda)+l;
-                for (int i = 0; i < RM; i++) {
-                    comparray[i] = 0;
-                    int ca = 0;
-                    const int8_t *at = aoffset->qs;
-                    for (int j = 0; j < 32; j++)
-                        ca += (int)*at++;
-                    comparray[i] = ca;
-                    aoffset += lda;
+                if (!isAblock_q4) {
+                    auto aoffset = A+(ii*lda)+l;
+                    for (int i = 0; i < RM; i++) {
+                        comparray[i] = 0;
+                        int ca = 0;
+                        auto *at = aoffset->qs;
+                        for (int j = 0; j < 32; j++)
+                            ca += (int)*at++;
+                        comparray[i] = ca;
+                        aoffset += lda;
+                    }
                }
-
                for (int i = 0; i < RM; i++) {
                    CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
                    res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
@ -2013,6 +2431,7 @@ class tinyBLAS_PPC {
            }
        }
    }
+
    void KERNEL_4x4(int64_t ii, int64_t jj) {
        vec_t vec_A[4], vec_B[4], vec_C[4];
        acc_t acc_0;
@ -2259,15 +2678,27 @@ class tinyBLAS_PPC {
            vec_t vec_C[4];
            acc_t acc_0;
            __builtin_mma_xxsetaccz(&acc_0);
-            vec_t vec_A[4], vec_B[4];
+            vec_t vec_A[4] {0}, vec_B[4] = {0};
            for (int l=0; l<k; l+=4) {
-                if (RN >= 4 && RM == 1) {
+                /* 'GEMV Forwarding' concept is used in first two conditional loops.
+                 * when one of the matrix has a single row/column, the elements are
+                 * broadcasted, instead of using packing routine to prepack the
+                 * matrix elements.
+                 */
+                if (RM == 1) {
                    TA* a = const_cast<TA*>(A+(ii)*lda+l);
-                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
+                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
                    vec_A[0] = (vec_t)vec_xl(0,a);
                    vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
                    vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
                    vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
+                } else if (RN == 1) {
+                    packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+                    TB* b = const_cast<TB*>(B+(jj)*ldb+l);
+                    vec_B[0] = (vec_t)vec_xl(0,b);
+                    vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
+                    vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
+                    vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
                } else {
                    packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
@ -2371,8 +2802,10 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
    assert(params->ith < params->nth);

    // only enable sgemm for prompt processing
+#if !defined(__MMA__)
    if (n < 2)
        return false;
+#endif

    if (Ctype != GGML_TYPE_F32)
        return false;
@ -2503,8 +2936,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
-
 #elif defined(__MMA__)
+    //TO-DO: Remove this condition once gemv forwarding is enabled.
        if (n < 8 && n != 4)
           return false;
        if (m < 8 && m != 4)
@ -2516,7 +2949,6 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
-
 #else
        return false;
 #endif
@ -2541,6 +2973,19 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
+#elif defined(__MMA__)
+    //TO-DO: Remove this condition once gemv forwarding is enabled.
+        if (n < 8 && n != 4)
+           return false;
+        if (m < 8 && m != 4)
+           return false;
+        tinyBLAS_Q0_PPC<block_q4_0, block_q8_0, float> tb{
+            k, (const block_q4_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
 #else
        return false;
 #endif
--- a/ggml/src/ggml-cpu/unary-ops.cpp
+++ b/ggml/src/ggml-cpu/unary-ops.cpp
@ -0,0 +1,186 @@
+#include "unary-ops.h"
+
+static inline float op_abs(float x) {
+    return fabsf(x);
+}
+
+static inline float op_sgn(float x) {
+    return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f);
+}
+
+static inline float op_neg(float x) {
+    return -x;
+}
+
+static inline float op_step(float x) {
+    return (x > 0.f) ? 1.f : 0.f;
+}
+
+static inline float op_tanh(float x) {
+    return tanhf(x);
+}
+
+static inline float op_elu(float x) {
+    return (x > 0.f) ? x : expm1f(x);
+}
+
+static inline float op_relu(float x) {
+    return (x > 0.f) ? x : 0.f;
+}
+
+static inline float op_sigmoid(float x) {
+    return 1.f / (1.f + expf(-x));
+}
+
+static inline float op_hardsigmoid(float x) {
+    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
+}
+
+static inline float op_exp(float x) {
+    return expf(x);
+}
+
+static inline float op_hardswish(float x) {
+    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
+}
+
+static inline float op_sqr(float x) {
+    return x * x;
+}
+
+static inline float op_sqrt(float x) {
+    return sqrtf(x);
+}
+
+static inline float op_sin(float x) {
+    return sinf(x);
+}
+
+static inline float op_cos(float x) {
+    return cosf(x);
+}
+
+static inline float op_log(float x) {
+    return logf(x);
+}
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+        vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float)>
+static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_abs>(params, dst);
+}
+
+void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sgn>(params, dst);
+}
+
+void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_neg>(params, dst);
+}
+
+void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_step>(params, dst);
+}
+
+void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_tanh>(params, dst);
+}
+
+void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_elu>(params, dst);
+}
+
+void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_relu>(params, dst);
+}
+
+void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sigmoid>(params, dst);
+}
+
+void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_hardsigmoid>(params, dst);
+}
+
+void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_exp>(params, dst);
+}
+
+void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_hardswish>(params, dst);
+}
+
+void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sqr>(params, dst);
+}
+
+void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sqrt>(params, dst);
+}
+
+void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sin>(params, dst);
+}
+
+void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_cos>(params, dst);
+}
+
+void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_log>(params, dst);
+}
--- a/ggml/src/ggml-cpu/unary-ops.h
+++ b/ggml/src/ggml-cpu/unary-ops.h
@ -0,0 +1,28 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@ -41,15 +41,18 @@
 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define GGML_CUDA_CC_PASCAL       600
-#define GGML_CUDA_CC_DP4A         610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define GGML_CUDA_CC_VOLTA        700
-#define GGML_CUDA_CC_TURING       750
-#define GGML_CUDA_CC_AMPERE       800
-#define GGML_CUDA_CC_ADA_LOVELACE 890
-#define GGML_CUDA_CC_OFFSET_AMD   0x1000000
+#define GGML_CUDA_CC_PASCAL          600
+#define GGML_CUDA_CC_DP4A            610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA           700
+#define GGML_CUDA_CC_TURING          750
+#define GGML_CUDA_CC_AMPERE          800
+#define GGML_CUDA_CC_ADA_LOVELACE    890
+#define GGML_CUDA_CC_OFFSET_AMD      0x1000000
+#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
+#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)

-// GCN/CNDA, wave size is 64
+// AMD
+// GCN/CDNA, wave size is 64
 #define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
@ -57,21 +60,32 @@
 #define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300

-// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

 #define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
 #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
 #define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)

-#define GGML_CUDA_CC_QY1        210
-#define GGML_CUDA_CC_QY2        220
+// Moore Threads
+#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
+
+#define GGML_CUDA_CC_QY1  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG   (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+
+#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
@ -197,9 +211,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
 #define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@ -209,21 +223,21 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)

 static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
 }

 static bool fast_fp16_available(const int cc) {
-    return fp16_available(cc) && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }

 // Any FP16 tensor core instructions are available for ggml code.
@ -231,20 +245,20 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
    return false;
 #else
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

 static bool cp_async_available(const int cc) {
@ -274,6 +288,10 @@ static __device__ void no_device_code(
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }

 #ifdef __CUDA_ARCH__
@ -395,11 +413,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half

 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
+#elif defined(RDNA3) || defined(RDNA4)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
+#elif defined(RDNA1) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
@ -678,7 +696,7 @@ struct ggml_tensor_extra_gpu {
 };


-#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
 #define USE_CUDA_GRAPH
 #endif

--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;

-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;

-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
--- a/ggml/src/ggml-cuda/conv-transpose-1d.cu
+++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu
@ -34,6 +34,10 @@ static  __global__ void conv_transpose_1d_kernel(
        }
    }
    dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }

 static void conv_transpose_1d_f32_f32_cuda(
@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
    const int p0 = 0;//opts[3];
    const int d0 = 1;//opts[4];

-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size = ggml_nelements(src1);
    const int64_t output_size = ggml_nelements(dst);

    conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
        return;
    }

-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;

    y[i] = x[i];
 }
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@ -52,12 +52,11 @@ typedef half (*vec_dot_KQ_f16_t)(
 typedef float (*vec_dot_KQ_f32_t)(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);

-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;
@ -93,12 +92,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
    return sum;
 }

-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;
@ -138,12 +136,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
    return sum;
 }

-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;
@ -186,12 +183,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
    return sum;
 }

-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;
@ -238,12 +234,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
    return sum;
 }

-template <typename T, int D>
+template <typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;
@ -272,12 +267,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
    return sum;
 }

-template <typename T, int D>
+template <typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {

    const half2 * K_h2 = (const half2 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);

@ -321,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(

    float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
        vals[l] = scale * x[4*threadIdx.x + l];
    }

    float amax = fabsf(vals[0]);
    float sum  = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
        amax = fmaxf(amax, fabsf(vals[l]));
        sum += vals[l];
    }
@ -344,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(

    if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
            q8[l] = roundf(vals[l] / d);
        }
    }
@ -480,25 +474,25 @@ static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ v
    return x[i];
 }

-template <int D>
+template <int D, int warp_size = WARP_SIZE>
 constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
-    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D> :
-        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D> :
-        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D> :
-        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D> :
-        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D> :
-        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D> :
+    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
+        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
+        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
+        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
+        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
+        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
        nullptr;
 }

-template <int D>
+template <int D, int warp_size = WARP_SIZE>
 constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
-    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D> :
-        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D> :
-        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D> :
-        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D> :
-        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D> :
-        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D> :
+    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
+        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
+        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
+        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
+        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
+        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
        nullptr;
 }

@ -612,50 +606,50 @@ static __global__ void flash_attn_stream_k_fixup(
    *dst = dst_val / rowsum;
 }

-template<int D, int parallel_blocks> // D == head size
+template<int D> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
        const float  * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
-        float * __restrict__ dst) {
-    VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
-    VKQ_meta  += parallel_blocks   * gridDim.y*blockIdx.x;
-    dst       +=                 D * gridDim.y*blockIdx.x;
+        float * __restrict__ dst,
+        const int parallel_blocks) {
+    VKQ_parts += parallel_blocks*D * gridDim.z*blockIdx.x;
+    VKQ_meta  += parallel_blocks   * gridDim.z*blockIdx.x;
+    dst       +=                 D * gridDim.z*blockIdx.x;

    const int tid = threadIdx.x;
    __builtin_assume(tid < D);

-    __shared__ float2 meta[parallel_blocks];
+    extern __shared__ float2 meta[];
    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
+        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
    }

    __syncthreads();

    float kqmax = meta[0].x;
-#pragma unroll
    for (int l = 1; l < parallel_blocks; ++l) {
        kqmax = max(kqmax, meta[l].x);
    }

    float VKQ_numerator   = 0.0f;
    float VKQ_denominator = 0.0f;
-#pragma unroll
    for (int l = 0; l < parallel_blocks; ++l) {
        const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
        *((uint32_t *) &KQ_max_scale) &= ftz_mask;

-        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
+        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.z*D + blockIdx.z*D + tid];
        VKQ_denominator += KQ_max_scale * meta[l].y;
    }

-    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
+    dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }

+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
    if (D == 64) {
        fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
@ -677,11 +671,10 @@ static void on_no_fattn_vec_case(const int D) {
    }
 }

-// parallel_blocks == 0 is stream-k decomposition
-template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride>
+template <int D, int ncols1, int ncols2, int KQ_stride>
 void launch_fattn(
-    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
-    const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V
+    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
+    const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
 ) {
    constexpr int ncols = ncols1 * ncols2;

@ -704,8 +697,6 @@ void launch_fattn(

    GGML_ASSERT(Q->ne[3] == 1);

-    const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
-
    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();
    const int id  = ggml_cuda_get_device();
@ -755,12 +746,14 @@ void launch_fattn(
        nb23 = nb23*bs*sizeof(half)/ts;
    }

+    int parallel_blocks = 1;
+
    const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
    const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];

    const dim3 block_dim(warp_size, nwarps, 1);
    dim3 blocks_num;
-    if (parallel_blocks == 0) {
+    if (stream_k) {
        // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
        const int max_blocks = 2*nsm;
        const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
@ -776,9 +769,43 @@ void launch_fattn(

        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
    } else {
-        blocks_num.x = parallel_blocks*ntiles_x;
-        blocks_num.y = Q->ne[2];
-        blocks_num.z = Q->ne[3];
+        GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
+        const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+
+        int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
+        CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+
+        // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
+        parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
+
+        // parallel_blocks must not be larger than what the tensor size allows:
+        parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
+
+        // If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects.
+        // Test whether parallel_blocks can be set to a higher value for better efficiency.
+        const int blocks_per_wave = nsm * max_blocks_per_sm;
+        int nwaves_best = 0;
+        int efficiency_percent_best = 0;
+        for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KQ; ++parallel_blocks_test) {
+            const int nblocks_total = ntiles_total * parallel_blocks_test;
+            const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave;
+            const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
+
+            // Stop trying configurations with more waves if we already have good efficiency to avoid excessive overhead.
+            if (efficiency_percent_best >= 90 && nwaves > nwaves_best) {
+                break;
+            }
+
+            if (efficiency_percent > efficiency_percent_best) {
+                nwaves_best = nwaves;
+                efficiency_percent_best = efficiency_percent;
+                parallel_blocks = parallel_blocks_test;
+            }
+        }
+
+        blocks_num.x = ntiles_x;
+        blocks_num.y = parallel_blocks;
+        blocks_num.z = Q->ne[2]*Q->ne[3];

        if (parallel_blocks > 1) {
            dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
@ -805,13 +832,12 @@ void launch_fattn(
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    GGML_ASSERT(block_dim.x % warp_size == 0);
-    GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size);
    fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
        (const char *) Q->data,
        K_data,
        V_data,
        mask ? ((const char *) mask->data) : nullptr,
-        (parallel_blocks) > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
+        !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
@ -823,7 +849,7 @@ void launch_fattn(
    );
    CUDA_CHECK(cudaGetLastError());

-    if constexpr (parallel_blocks == 0) {
+    if (stream_k) {
        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
            const dim3 block_dim_combine(D, 1, 1);
            const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
@ -832,13 +858,14 @@ void launch_fattn(
                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
        }
-    } else if constexpr (parallel_blocks > 1) {
+    } else if (parallel_blocks > 1) {
        const dim3 block_dim_combine(D, 1, 1);
-        const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
+        const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
+        const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);

-        flash_attn_combine_results<D, parallel_blocks>
-            <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
+        flash_attn_combine_results<D>
+            <<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
+            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
    }
    CUDA_CHECK(cudaGetLastError());
 }
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE

 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        __syncthreads();
    }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
        (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
         ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@ -970,7 +995,8 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
        fattn_kernel = flash_attn_ext_f16<D, ncols1, ncols2, nwarps, KQ_per_iter, ntiles, use_logit_softcap>;
    }

-    launch_fattn<D, ncols1, ncols2, 0, KQ_per_iter>(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, true, true);
+    launch_fattn<D, ncols1, ncols2, KQ_per_iter>
+        (ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, FATTN_KQ_STRIDE, true, true, true);
 }


@ -984,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
    extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,   8);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,   8)

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  16);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  16)

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  32);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  32)

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  64);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  64)

 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@ -4,7 +4,7 @@

 #define FATTN_KQ_STRIDE_TILE_F16 64

-template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
+template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@ -58,18 +58,17 @@ static __global__ void flash_attn_tile_ext_f16(

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
-    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.z              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.z / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *)  mask + ne11*ic0;

    const int stride_KV2 = nb11 / sizeof(half2);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@ -105,8 +104,7 @@ static __global__ void flash_attn_tile_ext_f16(

    __syncthreads();

-    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        half kqmax_new[ncols/nwarps];
@ -271,24 +269,36 @@ static __global__ void flash_attn_tile_ext_f16(
            const int i0 = i00 + 2*threadIdx.x;

            half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
-            if (parallel_blocks == 1) {
+            if (gridDim.y == 1) {
                dst_val /= __half2half2(kqsum_j);
            }
-            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] =  __low2float(dst_val);
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
+            const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] =  __low2float(dst_val);
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = __high2float(dst_val);
        }

-        if (parallel_blocks != 1 && threadIdx.x == 0) {
-            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+        if (gridDim.y != 1 && threadIdx.x == 0) {
+            dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
 #else
-   NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }

-template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
+template <int cols_per_block, bool use_logit_softcap>
 void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
@ -296,15 +306,17 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    D             = 64;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
        } break;
        case 128: {
            constexpr int    D             = 128;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
        } break;
        default: {
            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
@ -324,37 +336,22 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten

    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
-        constexpr int parallel_blocks = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    if (Q->ne[1] <= 32) {
-        constexpr int cols_per_block = 32;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    constexpr int cols_per_block = 32;
-    constexpr int parallel_blocks = 1;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    }
 }
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@ -4,7 +4,7 @@

 #define FATTN_KQ_STRIDE_TILE_F32 32

-template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
+template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@ -58,18 +58,17 @@ static __global__ void flash_attn_tile_ext_f32(

    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
-    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.z              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.z / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *)  mask + ne11*ic0;

    const int stride_KV2 = nb11 / sizeof(half2);

-    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");

@ -103,8 +102,7 @@ static __global__ void flash_attn_tile_ext_f32(

    __syncthreads();

-    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new[ncols/nwarps];
@ -269,25 +267,37 @@ static __global__ void flash_attn_tile_ext_f32(
            const int i0 = i00 + 2*threadIdx.x;

            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
-            if (parallel_blocks == 1) {
+            if (gridDim.y == 1) {
                dst_val.x /= kqsum_j;
                dst_val.y /= kqsum_j;
            }
-            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
+            const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] = dst_val.x;
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = dst_val.y;
        }

-        if (parallel_blocks != 1 && threadIdx.x == 0) {
-            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+        if (gridDim.y != 1 && threadIdx.x == 0) {
+            dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

-template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
+template <int cols_per_block, bool use_logit_softcap>
 void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
@ -295,15 +305,17 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    D             = 64;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
        } break;
        case 128: {
            constexpr int    D             = 128;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
        } break;
        default: {
            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
@ -320,37 +332,22 @@ void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_ten

    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
-        constexpr int parallel_blocks = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    if (Q->ne[1] <= 32) {
-        constexpr int cols_per_block = 32;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    constexpr int cols_per_block = 32;
-    constexpr int parallel_blocks = 1;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    }
 }
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"

-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
+template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@ -55,17 +55,16 @@ static __global__ void flash_attn_vec_ext_f16(
    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
    constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.y              + nb01*ic0;
-    K += nb12*(blockIdx.y / gqa_ratio);
-    V += nb22*(blockIdx.y / gqa_ratio);
+    Q += nb02* blockIdx.z              + nb01*ic0;
+    K += nb12*(blockIdx.z / gqa_ratio);
+    V += nb22*(blockIdx.z / gqa_ratio);

    const half * maskh = (const half   *)  mask + ne11*ic0;

-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@ -172,8 +171,7 @@ static __global__ void flash_attn_vec_ext_f16(

    half2 VKQ[ncols] = {{0.0f, 0.0f}};

-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
@ -283,29 +281,41 @@ static __global__ void flash_attn_vec_ext_f16(
        kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);

        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
-        if (parallel_blocks == 1) {
+        if (gridDim.y == 1) {
            dst_val /= kqsum[j_VKQ];
        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+        dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val;
    }

-    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
+    if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
+        dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
-   NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }

-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
+template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    constexpr size_t nbytes_shared = 0;
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, need_f16_K, need_f16_V);
+    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
 }

 template <int D, ggml_type type_K, ggml_type type_V>
@ -325,65 +335,48 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

    if (Q->ne[1] == 1) {
-        constexpr int cols_per_block  = 1;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] == 2) {
-        constexpr int cols_per_block  = 2;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 2;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] <= 4) {
-        constexpr int cols_per_block  = 4;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

-    if (Q->ne[1] <= 8) {
-        constexpr int cols_per_block  = 8;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    constexpr int cols_per_block  = 8;
-    constexpr int parallel_blocks = 1;
+    constexpr int cols_per_block = 8;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    }
 }

--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"

-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
+template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@ -55,16 +55,15 @@ static __global__ void flash_attn_vec_ext_f32(
    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
    constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.y              + nb01*ic0;
-    K += nb12*(blockIdx.y / gqa_ratio);
-    V += nb22*(blockIdx.y / gqa_ratio); // K and V have same shape
+    Q += nb02* blockIdx.z              + nb01*ic0;
+    K += nb12*(blockIdx.z / gqa_ratio);
+    V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
    const half * maskh = (const half   *)  mask + ne11*ic0;

-    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
@ -167,8 +166,7 @@ static __global__ void flash_attn_vec_ext_f32(

    float VKQ[ncols] = {0.0f};

-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new_arr[ncols];
@ -268,29 +266,39 @@ static __global__ void flash_attn_vec_ext_f32(
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);

        float dst_val = VKQ[j_VKQ];
-        if (parallel_blocks == 1) {
+        if (gridDim.y == 1) {
            dst_val /= kqsum[j_VKQ];
        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+        dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val;
    }

-    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
+    if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
+        dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
+template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    constexpr size_t nbytes_shared = 0;
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, need_f16_K, need_f16_V);
+    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
 }

 template <int D, ggml_type type_K, ggml_type type_V>
@ -307,65 +315,48 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

    if (Q->ne[1] == 1) {
-        constexpr int cols_per_block  = 1;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] == 2) {
-        constexpr int cols_per_block  = 2;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 2;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] <= 4) {
-        constexpr int cols_per_block  = 4;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

-    if (Q->ne[1] <= 8) {
-        constexpr int cols_per_block  = 8;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    constexpr int cols_per_block  = 8;
-    constexpr int parallel_blocks = 1;
+    constexpr int cols_per_block = 8;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    }
 }

--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@ -18,7 +18,7 @@ namespace wmma = rocwmma;
 #endif // FP16_MMA_AVAILABLE

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
-template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
+template<int D, int ncols, int nwarps, int VKQ_stride, typename KQ_acc_t, bool use_logit_softcap>
 __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
@ -67,8 +67,7 @@ static __global__ void flash_attn_ext_f16(

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

-    const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
-    const int ip  =        blockIdx.x % parallel_blocks;  // Index in group of blocks running for the same column in parallel.
+    const int ic0 = ncols*blockIdx.x; // Index of the first Q/QKV column to work on.

    static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
    static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
@ -91,16 +90,16 @@ static __global__ void flash_attn_ext_f16(
    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f   = (const float *) (Q + nb02* blockIdx.y              + nb01*ic0);
-    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.y / gqa_ratio));
-    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const float * Q_f   = (const float *) (Q + nb02* blockIdx.z              + nb01*ic0);
+    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.z / gqa_ratio));
+    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
    const half  * maskh = (const half  *)  mask + (nb31/sizeof(half))* ic0;
    const half2 * mask2 = (const half2 *)  mask + (nb31/sizeof(half))*(ic0/2);

    const int stride_Q  = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    const half2 slope2 = make_half2(slopef, slopef);

@ -176,7 +175,7 @@ static __global__ void flash_attn_ext_f16(
    __syncthreads();

    // Iterate over ne11 == previous tokens:
-    for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) {
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE) {
        // Calculate tile of KQ:
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
@ -395,7 +394,7 @@ static __global__ void flash_attn_ext_f16(
        if (ic0 + j_VKQ >= ne01) {
            return;
        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;

        float KQ_rowsum_j;
        if (std::is_same<KQ_acc_t, float>::value) {
@ -411,13 +410,13 @@ static __global__ void flash_attn_ext_f16(
                break;
            }
            float dst_val = VKQ[j_VKQ*D_padded + i];
-            if (parallel_blocks == 1) {
+            if (gridDim.y == 1) {
                dst_val /= KQ_rowsum_j;
            }
-            dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val;
+            dst[j_dst*gridDim.z*D + blockIdx.z*D + i] = dst_val;
        }

-        if (parallel_blocks == 1 || threadIdx.x != 0) {
+        if (gridDim.y == 1 || threadIdx.x != 0) {
            continue;
        }

@ -428,10 +427,20 @@ static __global__ void flash_attn_ext_f16(
            dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
        }
        dst_meta_val.y = KQ_rowsum_j;
-        dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val;
+        dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val;
    }
 #else
-   NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
 }

@ -462,59 +471,26 @@ static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
 template <int D, int cols_per_block, typename KQ_acc_t>
 void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
-    const ggml_tensor * Q   = dst->src[0];

    constexpr int nwarps = 4;

    constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
-    const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
-    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;

    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (4*blocks_num_pb1 < 2*nsm) {
-        constexpr int parallel_blocks = 4;
-        fattn_kernel_t fattn_kernel;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        } else {
-            constexpr bool use_logit_softcap = true;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        }
-        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
-        return;
-    }
-    if (2*blocks_num_pb1 < 2*nsm) {
-        constexpr int parallel_blocks = 2;
-        fattn_kernel_t fattn_kernel;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        } else {
-            constexpr bool use_logit_softcap = true;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        }
-        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
-        return;
-    }
-    constexpr int parallel_blocks = 1;
    fattn_kernel_t fattn_kernel;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        fattn_kernel = flash_attn_ext_f16<
-            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
    } else {
        constexpr bool use_logit_softcap = true;
        fattn_kernel = flash_attn_ext_f16<
-            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
    }
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
 }

 void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@ -253,7 +253,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);

-    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
+    if (GGML_CUDA_CC_IS_AMD(cc)) {
 #if defined(GGML_HIP_ROCWMMA_FATTN)
        if (fp16_mma_available(cc)) {
            ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
@ -281,13 +281,13 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst

    if (!fp16_mma_available(cc)) {
        if (prec == GGML_PREC_DEFAULT) {
-            if (Q->ne[1] <= 8) {
+            if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
                ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
            } else {
                ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
            }
        } else {
-            if (Q->ne[1] <= 8) {
+            if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
                ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
            } else {
                ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
@ -296,17 +296,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
        return;
    }

-    const int gqa_ratio = Q->ne[2] / K->ne[2];
-    const bool mma_fast_for_bs1 = fp16_mma_available(cc) && gqa_ratio % 2 == 0 &&
-        K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16 && mask;
-    if (Q->ne[1] == 1 && Q->ne[0] % (2*warp_size) == 0 && !mma_fast_for_bs1) {
+    const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
+    const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
+    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
+    const bool can_use_vector_kernel = (Q->ne[0] % (2*warp_size) == 0) && (prec == GGML_PREC_DEFAULT || Q->ne[0] <= 128);
+    if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
        if (prec == GGML_PREC_DEFAULT) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
-            return;
-        } else if(Q->ne[0] <= 128) {
+        } else {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
-            return;
        }
+        return;
    }

    // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -31,12 +31,14 @@
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
+#include "ggml-cuda/ssm-conv.cuh"
+#include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
-#include "ggml-cuda/wkv6.cuh"
+#include "ggml-cuda/wkv.cuh"
 #include "ggml-cuda/gla.cuh"
 #include "ggml.h"

@ -262,9 +264,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
                      id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
                      device_vmm ? "yes" : "no", prop.warpSize);
 #elif defined(GGML_USE_MUSA)
-        // TODO: refine the .cc to reflect MUSA's actual CC capabilities
+        // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
+        info.devices[id].warp_size = 32;
        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
+        info.devices[id].cc += prop.minor * 0x10;
        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #else
@ -1186,11 +1190,11 @@ static void ggml_cuda_op_mul_mat_cublas(
    // ldc == nrows of the matrix that cuBLAS writes into
    int64_t ldc = id == ctx.device ? ne0 : row_diff;

-    const int compute_capability = ggml_cuda_info().devices[id].cc;
+    const int cc = ggml_cuda_info().devices[id].cc;

    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;

-    if (compute_capability >= GGML_CUDA_CC_VOLTA && use_fp16) {
+    if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
        if (src0->type != GGML_TYPE_F16) {
@ -1214,7 +1218,7 @@ static void ggml_cuda_op_mul_mat_cublas(

        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));

-        if (GGML_CUDA_CC_IS_CDNA(compute_capability)) {
+        if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
            const float alpha = 1.0f;
            const float beta = 0.0f;
            CUBLAS_CHECK(
@ -1757,7 +1761,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
        beta  = &beta_f32;
    }

-    if (GGML_CUDA_CC_IS_CDNA(ggml_cuda_info().devices[ctx.device].cc)) {
+    int id = ggml_cuda_get_device();
+    const int cc = ggml_cuda_info().devices[id].cc;
+    if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
        cu_compute_type = CUBLAS_COMPUTE_32F;
        alpha = &alpha_f32;
        beta  = &beta_f32;
@ -1834,7 +1840,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
    }
 #endif

-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) {
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
    }
@ -2196,6 +2202,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_GROUP_NORM:
            ggml_cuda_op_group_norm(ctx, dst);
            break;
+        case GGML_OP_L2_NORM:
+            ggml_cuda_op_l2_norm(ctx, dst);
+            break;
        case GGML_OP_CONCAT:
            ggml_cuda_op_concat(ctx, dst);
            break;
@ -2289,6 +2298,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SUM_ROWS:
            ggml_cuda_op_sum_rows(ctx, dst);
            break;
+        case GGML_OP_SSM_CONV:
+            ggml_cuda_op_ssm_conv(ctx, dst);
+            break;
+        case GGML_OP_SSM_SCAN:
+            ggml_cuda_op_ssm_scan(ctx, dst);
+            break;
        case GGML_OP_ARGSORT:
            ggml_cuda_op_argsort(ctx, dst);
            break;
@ -2304,6 +2319,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_GATED_LINEAR_ATTN:
            ggml_cuda_op_gated_linear_attn(ctx, dst);
            break;
+        case GGML_OP_RWKV_WKV7:
+            ggml_cuda_op_rwkv_wkv7(ctx, dst);
+            break;
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
            ggml_cuda_cross_entropy_loss_back(ctx, dst);
            break;
@ -2610,13 +2628,15 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,

 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

+#if CUDART_VERSION >= 12000
    cudaGraphExecUpdateResultInfo result_info;
-#ifdef __HIP_PLATFORM_AMD__
-    hipGraphNode_t errorNode;
-    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
-#else
    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
-#endif
+#else
+    cudaGraphNode_t errorNode;
+    cudaGraphExecUpdateResult result_info;
+    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+#endif // CUDART_VERSION >= 12000
+
    if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
@ -3159,6 +3179,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            break;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
+        case GGML_OP_L2_NORM:
            return true;
        case GGML_OP_RMS_NORM_BACK:
            return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
@ -3180,6 +3201,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_LOG:
+        case GGML_OP_SSM_SCAN:
+        case GGML_OP_SSM_CONV:
            return true;
        case GGML_OP_CONT:
            return op->src[0]->type != GGML_TYPE_BF16;
@ -3213,11 +3236,22 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_GATED_LINEAR_ATTN:
+        case GGML_OP_RWKV_WKV7:
            return true;
        case GGML_OP_FLASH_ATTN_EXT: {
 #ifndef FLASH_ATTN_AVAILABLE
            return false;
 #endif // FLASH_ATTN_AVAILABLE
+            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+                // different head sizes of K and V are not supported yet
+                return false;
+            }
+            if (op->src[0]->ne[0] == 192) {
+                return false;
+            }
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
            if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
                return false;
            }
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@ -26,6 +26,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
    asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
        : "=r"(ret) : "r"(x));
 #else
+    GGML_UNUSED(x);
    NO_DEVICE_CODE;
 #endif // defined(NEW_MMA_AVAILABLE)
    return ret;
@ -178,6 +179,7 @@ namespace ggml_cuda_mma {
            : "l"(xs));
 #else
        load_generic(xs0, stride);
+        GGML_UNUSED(t);
 #endif // NEW_MMA_AVAILABLE
    }

--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@ -27,8 +27,8 @@ void ggml_cuda_op_mul_mat_q(
    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
-        cc < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
+    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
+        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

    switch (src0->type) {
@ -145,9 +145,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
    return true;
 #endif //GGML_CUDA_FORCE_MMQ

-    if (cc < GGML_CUDA_CC_OFFSET_AMD) {
+    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

-    return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@ -90,7 +90,7 @@ struct tile_x_sizes {

 static int get_mmq_x_max_host(const int cc) {
    return new_mma_available(cc) ? 128 :
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ?
+        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
 #ifdef GGML_CUDA_FORCE_MMQ
            128                     : 64;
 #else
@ -123,8 +123,8 @@ static constexpr __device__ int get_mmq_x_max_device() {
 }

 static int get_mmq_y_host(const int cc) {
-    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
-        (ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? 128 : 64);
+    return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
+        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
 }

 static constexpr __device__ int get_mmq_y_device() {
@ -945,7 +945,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
        }
    }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@ -1024,7 +1024,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
    }

 #pragma unroll
-    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+    for (int k01 = 0; k01 < WARP_SIZE/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
        const int k0 = k00 + k01;

 #pragma unroll
@ -1035,19 +1035,34 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

-                if (k01 < WARP_SIZE/2) {
-                    constexpr int ns = 2;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                } else {
-                    constexpr int ns = 1;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                }
+                constexpr int ns = 2;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+            }
+        }
+    }
+
+    // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
+    // As a workaround 2 separate loops are used instead.
+#pragma unroll
+    for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                constexpr int ns = 1;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
            }
        }
    }
@ -1176,7 +1191,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
        }
    }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@ -1253,7 +1268,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        const float d = bxi->d;

 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
            x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
        }
 #else
@ -1376,7 +1391,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);

 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
        }
    }
@ -1517,7 +1532,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);

 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
        }
    }
@ -1810,7 +1825,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
        }
    }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@ -2570,6 +2585,8 @@ static __device__ void mul_mat_q_process_tile(
    } else {
        write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
    }
+
+    GGML_UNUSED(ne00); GGML_UNUSED(ne10);
 }


@ -2577,9 +2594,9 @@ static __device__ void mul_mat_q_process_tile(

 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
    __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    __launch_bounds__(WARP_SIZE*nwarps, 1)
@ -2695,7 +2712,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
        const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;

        // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
-        if (it != blockIdx.x || jt != blockIdx.y) {
+        if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
            continue;
        }

@ -2772,14 +2789,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a

    const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
    static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
    if (!shmem_limit_raised[id]) {
        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>,  cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
        shmem_limit_raised[id] = true;
    }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)

    const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
    const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
@ -2825,14 +2842,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 template <ggml_type type>
 void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int id    = ggml_cuda_get_device();
-    const int nsm   = ggml_cuda_info().devices[id].nsm;
    const int cc    = ggml_cuda_info().devices[id].cc;
    const int smpbo = ggml_cuda_info().devices[id].smpbo;

    const int mmq_x_max = get_mmq_x_max_host(cc);
    const int mmq_y = get_mmq_y_host(cc);
    const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
+    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;

    int mmq_x_best  = 0;
    int nparts_best = INT_MAX;
--- a/Show More
+++ b/Show More