mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-24 17:15:19 +00:00
Compare commits
38 Commits
v1.7.4-pre
...
v1.7.4
Author | SHA1 | Date | |
---|---|---|---|
8a9ad7844d | |||
eb874b3a3c | |||
eb78e3a3f1 | |||
ece3ff88f6 | |||
9366544991 | |||
95583942ed | |||
2e93cb6a2f | |||
de5cd60d1c | |||
3fcba3e58b | |||
cea5f1c52f | |||
2112462db4 | |||
fc84ecd445 | |||
8de1e99907 | |||
499af9294a | |||
bcf937c216 | |||
b8d90953d7 | |||
60a422147b | |||
3387415bad | |||
536ca3ec89 | |||
a4bb983190 | |||
39c205f555 | |||
6d502f33dc | |||
5ea27d089d | |||
1462d92588 | |||
7ba1a41f47 | |||
5ea088636f | |||
f32ddb3b1c | |||
79b75ece03 | |||
6348d73e55 | |||
fb36a1538a | |||
c81b8b910b | |||
85b60f31d0 | |||
227b5ffa36 | |||
36a64a253f | |||
c84b83c370 | |||
5136fd92c2 | |||
7d55637f0b | |||
0994506054 |
257
.github/workflows/build.yml
vendored
257
.github/workflows/build.yml
vendored
@ -1,5 +1,15 @@
|
||||
name: CI
|
||||
on: [push, pull_request]
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
ubuntu_image: "ubuntu:22.04"
|
||||
@ -12,7 +22,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
arch: [linux/amd64, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@ -32,6 +42,58 @@ jobs:
|
||||
cmake -B build
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
ubuntu-latest-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/arm64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential libsdl2-dev cmake
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
ubuntu-latest-arm-v7:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/arm/v7]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential libsdl2-dev cmake
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
macOS-latest:
|
||||
runs-on: macOS-latest
|
||||
|
||||
@ -74,7 +136,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
arch: [linux/amd64, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@ -95,6 +157,62 @@ jobs:
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-gcc-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/arm64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-gcc-arm-v7:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/arm/v7]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-clang:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@ -430,72 +548,73 @@ jobs:
|
||||
name: whisper-blas-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
# TODO: fix and re-enable
|
||||
# windows-cublas:
|
||||
# runs-on: windows-2019
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
# arch: [x64]
|
||||
# cublas: [ON]
|
||||
# sdl2: [ON]
|
||||
# cuda-toolkit: [12.2.0, 11.8.0]
|
||||
# include:
|
||||
# - arch: x64
|
||||
# s2arc: x64
|
||||
# - sdl2: ON
|
||||
# s2ver: 2.28.5
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v2
|
||||
#
|
||||
# - name: Install CUDA Toolkit
|
||||
# id: cuda-toolkit
|
||||
# uses: Jimver/cuda-toolkit@v0.2.15
|
||||
# with:
|
||||
# cuda: '${{ matrix.cuda-toolkit }}'
|
||||
#
|
||||
# - name: Fetch SDL2 and set SDL2_DIR
|
||||
# if: matrix.sdl2 == 'ON'
|
||||
# run: |
|
||||
# C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
|
||||
# 7z x sdl2.zip
|
||||
# echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
|
||||
#
|
||||
# - name: Configure
|
||||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# -DGGML_CUDA=${{ matrix.cublas }}
|
||||
# -DWHISPER_SDL2=${{ matrix.sdl2 }}
|
||||
#
|
||||
# - name: Build ${{ matrix.cuda-toolkit }}
|
||||
# run: |
|
||||
# cd ./build
|
||||
# cmake --build . --config ${{ matrix.build }}
|
||||
#
|
||||
# - name: Copy CUDA DLLs
|
||||
# run: >
|
||||
# Copy-Item -PassThru
|
||||
# -Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
|
||||
# -Include cudart64_*,cublas64_*,cublasLt64_*
|
||||
# -Destination build/bin/${{ matrix.build }}
|
||||
#
|
||||
# - name: Copy SDL2.dll
|
||||
# if: matrix.sdl2 == 'ON'
|
||||
# run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
#
|
||||
# - name: Upload binaries
|
||||
# if: matrix.sdl2 == 'ON'
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
windows-cublas:
|
||||
runs-on: windows-2019
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
arch: [x64]
|
||||
cublas: [ON]
|
||||
sdl2: [ON]
|
||||
cuda-toolkit: [12.2.0, 11.8.0]
|
||||
include:
|
||||
- arch: x64
|
||||
sdl2: ON
|
||||
sdl2_ver: 2.28.5
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v2
|
||||
|
||||
- name: Install CUDA Toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: '${{ matrix.cuda-toolkit }}'
|
||||
|
||||
- name: Install 7-Zip
|
||||
run: choco install 7zip -y
|
||||
|
||||
- name: Fetch SDL2 and set SDL2_DIR
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: |
|
||||
Invoke-WebRequest -Uri https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.sdl2_ver }}/SDL2-devel-${{ matrix.sdl2_ver }}-VC.zip -OutFile sdl2.zip
|
||||
7z x sdl2.zip
|
||||
echo "SDL2_DIR=${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
echo "${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" > SDL2_PATH.txt
|
||||
|
||||
- name: Configure CMake
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }} ^
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
|
||||
-DGGML_CUDA=${{ matrix.cublas }} ^
|
||||
-DCMAKE_CUDA_ARCHITECTURES=all ^
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }} ^
|
||||
-DSDL2_DIR="%SDL2_DIR%"
|
||||
|
||||
- name: Build Project
|
||||
shell: cmd
|
||||
run: |
|
||||
cd ./build
|
||||
cmake --build . --config ${{ matrix.build }}
|
||||
|
||||
- name: Copy CUDA DLLs
|
||||
run: |
|
||||
Get-ChildItem "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/" -Filter "*.dll" |
|
||||
Copy-Item -Destination "build/bin/${{ matrix.build }}"
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.arch }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
emscripten:
|
||||
runs-on: ubuntu-latest
|
||||
|
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
@ -17,7 +17,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
|
||||
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
|
||||
#TODO: the cuda image keeps failing - disable for now
|
||||
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
|
||||
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
|
||||
|
@ -1,6 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
||||
project("whisper.cpp" C CXX)
|
||||
project("whisper.cpp" VERSION 1.7.3)
|
||||
project("whisper.cpp" VERSION 1.7.4)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(SOVERSION 1)
|
||||
|
2
Makefile
2
Makefile
@ -64,6 +64,6 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 larg
|
||||
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "" ; \
|
||||
./build/bin/main -m models/ggml-$@.bin -f $$f ; \
|
||||
./build/bin/whisper-cli -m models/ggml-$@.bin -f $$f ; \
|
||||
echo "" ; \
|
||||
done
|
||||
|
@ -7,7 +7,7 @@
|
||||
[](https://conan.io/center/whisper-cpp)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.7.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.3) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
@ -293,7 +293,7 @@ This can result in significant speedup in encoder performance. Here are the inst
|
||||
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
|
||||
cached for the next run.
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
@ -381,9 +381,9 @@ The [stream](examples/stream) tool samples the audio every half a second and run
|
||||
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake -B build -DWHISPER_SDL2=ON
|
||||
cmake --build build --config Release
|
||||
./build/bin/stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
|
||||
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
|
||||
```
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.7.3",
|
||||
"version": "1.7.4",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
4
bindings/ruby/.gitignore
vendored
4
bindings/ruby/.gitignore
vendored
@ -1,5 +1,3 @@
|
||||
LICENSE
|
||||
pkg/
|
||||
lib/whisper.so
|
||||
lib/whisper.bundle
|
||||
lib/whisper.dll
|
||||
lib/whisper.*
|
||||
|
@ -60,10 +60,10 @@ You also can use shorthand for pre-converted models:
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
```
|
||||
|
||||
You can see the list of prepared model names by `Whisper::Model.preconverted_models.keys`:
|
||||
You can see the list of prepared model names by `Whisper::Model.pre_converted_models.keys`:
|
||||
|
||||
```ruby
|
||||
puts Whisper::Model.preconverted_models.keys
|
||||
puts Whisper::Model.pre_converted_models.keys
|
||||
# tiny
|
||||
# tiny.en
|
||||
# tiny-q5_1
|
||||
@ -87,8 +87,9 @@ whisper = Whisper::Context.new("path/to/your/model.bin")
|
||||
Or, you can download model files:
|
||||
|
||||
```ruby
|
||||
model_uri = Whisper::Model::URI.new("http://example.net/uri/of/your/model.bin")
|
||||
whisper = Whisper::Context.new(model_uri)
|
||||
whisper = Whisper::Context.new("https://example.net/uri/of/your/model.bin")
|
||||
# Or
|
||||
whisper = Whisper::Context.new(URI("https://example.net/uri/of/your/model.bin"))
|
||||
```
|
||||
|
||||
See [models][] page for details.
|
||||
@ -222,6 +223,17 @@ end
|
||||
|
||||
The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
|
||||
|
||||
Development
|
||||
-----------
|
||||
|
||||
% git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
% cd whisper.cpp/bindings/ruby
|
||||
% rake test
|
||||
|
||||
First call of `rake test` builds an extension and downloads a model for testing. After that, you add tests in `tests` directory and modify `ext/ruby_whisper.cpp`.
|
||||
|
||||
If something seems wrong on build, running `rake clean` solves some cases.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
@ -49,6 +49,7 @@ static ID id_length;
|
||||
static ID id_next;
|
||||
static ID id_new;
|
||||
static ID id_to_path;
|
||||
static ID id_URI;
|
||||
static ID id_pre_converted_models;
|
||||
|
||||
static bool is_log_callback_finalized = false;
|
||||
@ -283,6 +284,17 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
|
||||
if (!NIL_P(pre_converted_model)) {
|
||||
whisper_model_file_path = pre_converted_model;
|
||||
}
|
||||
if (TYPE(whisper_model_file_path) == T_STRING) {
|
||||
const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
|
||||
if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
|
||||
}
|
||||
}
|
||||
if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
|
||||
}
|
||||
if (rb_respond_to(whisper_model_file_path, id_to_path)) {
|
||||
whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
|
||||
}
|
||||
@ -837,7 +849,7 @@ static VALUE ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment) {
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* full_get_segment_no_speech_prob -> Float
|
||||
* full_get_segment_no_speech_prob(segment_index) -> Float
|
||||
*/
|
||||
static VALUE ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment) {
|
||||
ruby_whisper *rw;
|
||||
@ -1755,7 +1767,7 @@ static VALUE ruby_whisper_c_model_type(VALUE self) {
|
||||
|
||||
static VALUE ruby_whisper_error_initialize(VALUE self, VALUE code) {
|
||||
const int c_code = NUM2INT(code);
|
||||
char *raw_message;
|
||||
const char *raw_message;
|
||||
switch (c_code) {
|
||||
case -2:
|
||||
raw_message = "failed to compute log mel spectrogram";
|
||||
@ -1802,6 +1814,7 @@ void Init_whisper() {
|
||||
id_next = rb_intern("next");
|
||||
id_new = rb_intern("new");
|
||||
id_to_path = rb_intern("to_path");
|
||||
id_URI = rb_intern("URI");
|
||||
id_pre_converted_models = rb_intern("pre_converted_models");
|
||||
|
||||
mWhisper = rb_define_module("Whisper");
|
||||
@ -1941,6 +1954,8 @@ void Init_whisper() {
|
||||
rb_define_method(cModel, "n_mels", ruby_whisper_c_model_n_mels, 0);
|
||||
rb_define_method(cModel, "ftype", ruby_whisper_c_model_ftype, 0);
|
||||
rb_define_method(cModel, "type", ruby_whisper_c_model_type, 0);
|
||||
|
||||
rb_require("whisper/model/uri");
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
#ifndef __RUBY_WHISPER_H
|
||||
#define __RUBY_WHISPER_H
|
||||
#ifndef RUBY_WHISPER_H
|
||||
#define RUBY_WHISPER_H
|
||||
|
||||
#include "whisper.h"
|
||||
|
||||
|
@ -1,2 +0,0 @@
|
||||
require "whisper.so"
|
||||
require "whisper/model/uri"
|
@ -1,163 +1,163 @@
|
||||
require "whisper.so"
|
||||
require "uri"
|
||||
require "net/http"
|
||||
require "time"
|
||||
require "pathname"
|
||||
require "io/console/size"
|
||||
|
||||
class Whisper::Model
|
||||
class URI
|
||||
def initialize(uri)
|
||||
@uri = URI(uri)
|
||||
end
|
||||
module Whisper
|
||||
class Model
|
||||
class URI
|
||||
def initialize(uri)
|
||||
@uri = URI(uri)
|
||||
end
|
||||
|
||||
def to_path
|
||||
cache
|
||||
cache_path.to_path
|
||||
end
|
||||
def to_path
|
||||
cache
|
||||
cache_path.to_path
|
||||
end
|
||||
|
||||
def clear_cache
|
||||
path = cache_path
|
||||
path.delete if path.exist?
|
||||
end
|
||||
def clear_cache
|
||||
path = cache_path
|
||||
path.delete if path.exist?
|
||||
end
|
||||
|
||||
private
|
||||
private
|
||||
|
||||
def cache_path
|
||||
base_cache_dir/@uri.host/@uri.path[1..]
|
||||
end
|
||||
def cache_path
|
||||
base_cache_dir/@uri.host/@uri.path[1..]
|
||||
end
|
||||
|
||||
def base_cache_dir
|
||||
base = case RUBY_PLATFORM
|
||||
when /mswin|mingw/
|
||||
ENV.key?("LOCALAPPDATA") ? Pathname(ENV["LOCALAPPDATA"]) : Pathname(Dir.home)/"AppData/Local"
|
||||
when /darwin/
|
||||
Pathname(Dir.home)/"Library/Caches"
|
||||
else
|
||||
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
|
||||
end
|
||||
base/"whisper.cpp"
|
||||
end
|
||||
def base_cache_dir
|
||||
base = case RUBY_PLATFORM
|
||||
when /mswin|mingw/
|
||||
ENV.key?("LOCALAPPDATA") ? Pathname(ENV["LOCALAPPDATA"]) : Pathname(Dir.home)/"AppData/Local"
|
||||
when /darwin/
|
||||
Pathname(Dir.home)/"Library/Caches"
|
||||
else
|
||||
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
|
||||
end
|
||||
base/"whisper.cpp"
|
||||
end
|
||||
|
||||
def cache
|
||||
path = cache_path
|
||||
headers = {}
|
||||
headers["if-modified-since"] = path.mtime.httpdate if path.exist?
|
||||
request @uri, headers
|
||||
path
|
||||
end
|
||||
def cache
|
||||
path = cache_path
|
||||
headers = {}
|
||||
headers["if-modified-since"] = path.mtime.httpdate if path.exist?
|
||||
request @uri, headers
|
||||
path
|
||||
end
|
||||
|
||||
def request(uri, headers)
|
||||
Net::HTTP.start uri.host, uri.port, use_ssl: uri.scheme == "https" do |http|
|
||||
request = Net::HTTP::Get.new(uri, headers)
|
||||
http.request request do |response|
|
||||
case response
|
||||
when Net::HTTPNotModified
|
||||
def request(uri, headers)
|
||||
Net::HTTP.start uri.host, uri.port, use_ssl: uri.scheme == "https" do |http|
|
||||
request = Net::HTTP::Get.new(uri, headers)
|
||||
http.request request do |response|
|
||||
case response
|
||||
when Net::HTTPNotModified
|
||||
# noop
|
||||
when Net::HTTPOK
|
||||
download response
|
||||
when Net::HTTPRedirection
|
||||
request URI(response["location"]), headers
|
||||
else
|
||||
return if headers.key?("if-modified-since") # Use cache file
|
||||
when Net::HTTPOK
|
||||
download response
|
||||
when Net::HTTPRedirection
|
||||
request URI(response["location"]), headers
|
||||
else
|
||||
return if headers.key?("if-modified-since") # Use cache file
|
||||
|
||||
raise "#{response.code} #{response.message}\n#{response.body}"
|
||||
raise "#{response.code} #{response.message}\n#{response.body}"
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def download(response)
|
||||
path = cache_path
|
||||
path.dirname.mkpath unless path.dirname.exist?
|
||||
downloading_path = Pathname("#{path}.downloading")
|
||||
size = response.content_length
|
||||
downloading_path.open "wb" do |file|
|
||||
downloaded = 0
|
||||
response.read_body do |chunk|
|
||||
file << chunk
|
||||
downloaded += chunk.bytesize
|
||||
show_progress downloaded, size
|
||||
def download(response)
|
||||
path = cache_path
|
||||
path.dirname.mkpath unless path.dirname.exist?
|
||||
downloading_path = Pathname("#{path}.downloading")
|
||||
size = response.content_length
|
||||
downloading_path.open "wb" do |file|
|
||||
downloaded = 0
|
||||
response.read_body do |chunk|
|
||||
file << chunk
|
||||
downloaded += chunk.bytesize
|
||||
show_progress downloaded, size
|
||||
end
|
||||
$stderr.puts
|
||||
end
|
||||
$stderr.puts
|
||||
end
|
||||
downloading_path.rename path
|
||||
end
|
||||
|
||||
def show_progress(current, size)
|
||||
progress_rate_available = size && $stderr.tty?
|
||||
|
||||
unless @prev
|
||||
@prev = Time.now
|
||||
$stderr.puts "Downloading #{@uri} to #{cache_path}"
|
||||
downloading_path.rename path
|
||||
end
|
||||
|
||||
now = Time.now
|
||||
def show_progress(current, size)
|
||||
progress_rate_available = size && $stderr.tty?
|
||||
|
||||
if progress_rate_available
|
||||
return if now - @prev < 1 && current < size
|
||||
unless @prev
|
||||
@prev = Time.now
|
||||
$stderr.puts "Downloading #{@uri} to #{cache_path}"
|
||||
end
|
||||
|
||||
progress_width = 20
|
||||
progress = current.to_f / size
|
||||
arrow_length = progress * progress_width
|
||||
arrow = "=" * (arrow_length - 1) + ">" + " " * (progress_width - arrow_length)
|
||||
line = "[#{arrow}] (#{format_bytesize(current)} / #{format_bytesize(size)})"
|
||||
padding = ' ' * ($stderr.winsize[1] - line.size)
|
||||
$stderr.print "\r#{line}#{padding}"
|
||||
else
|
||||
return if now - @prev < 1
|
||||
now = Time.now
|
||||
|
||||
$stderr.print "."
|
||||
if progress_rate_available
|
||||
return if now - @prev < 1 && current < size
|
||||
|
||||
progress_width = 20
|
||||
progress = current.to_f / size
|
||||
arrow_length = progress * progress_width
|
||||
arrow = "=" * (arrow_length - 1) + ">" + " " * (progress_width - arrow_length)
|
||||
line = "[#{arrow}] (#{format_bytesize(current)} / #{format_bytesize(size)})"
|
||||
padding = ' ' * ($stderr.winsize[1] - line.size)
|
||||
$stderr.print "\r#{line}#{padding}"
|
||||
else
|
||||
return if now - @prev < 1
|
||||
|
||||
$stderr.print "."
|
||||
end
|
||||
@prev = now
|
||||
end
|
||||
|
||||
def format_bytesize(bytesize)
|
||||
return "0.0 B" if bytesize.zero?
|
||||
|
||||
units = %w[B KiB MiB GiB TiB]
|
||||
exp = (Math.log(bytesize) / Math.log(1024)).to_i
|
||||
format("%.1f %s", bytesize.to_f / 1024 ** exp, units[exp])
|
||||
end
|
||||
@prev = now
|
||||
end
|
||||
|
||||
def format_bytesize(bytesize)
|
||||
return "0.0 B" if bytesize.zero?
|
||||
@pre_converted_models = %w[
|
||||
tiny
|
||||
tiny.en
|
||||
tiny-q5_1
|
||||
tiny.en-q5_1
|
||||
tiny-q8_0
|
||||
base
|
||||
base.en
|
||||
base-q5_1
|
||||
base.en-q5_1
|
||||
base-q8_0
|
||||
small
|
||||
small.en
|
||||
small.en-tdrz
|
||||
small-q5_1
|
||||
small.en-q5_1
|
||||
small-q8_0
|
||||
medium
|
||||
medium.en
|
||||
medium-q5_0
|
||||
medium.en-q5_0
|
||||
medium-q8_0
|
||||
large-v1
|
||||
large-v2
|
||||
large-v2-q5_0
|
||||
large-v2-q8_0
|
||||
large-v3
|
||||
large-v3-q5_0
|
||||
large-v3-turbo
|
||||
large-v3-turbo-q5_0
|
||||
large-v3-turbo-q8_0
|
||||
].each_with_object({}) {|name, models|
|
||||
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
|
||||
}
|
||||
|
||||
units = %w[B KiB MiB GiB TiB]
|
||||
exp = (Math.log(bytesize) / Math.log(1024)).to_i
|
||||
format("%.1f %s", bytesize.to_f / 1024 ** exp, units[exp])
|
||||
class << self
|
||||
attr_reader :pre_converted_models
|
||||
end
|
||||
end
|
||||
|
||||
@pre_converted_models = {}
|
||||
%w[
|
||||
tiny
|
||||
tiny.en
|
||||
tiny-q5_1
|
||||
tiny.en-q5_1
|
||||
tiny-q8_0
|
||||
base
|
||||
base.en
|
||||
base-q5_1
|
||||
base.en-q5_1
|
||||
base-q8_0
|
||||
small
|
||||
small.en
|
||||
small.en-tdrz
|
||||
small-q5_1
|
||||
small.en-q5_1
|
||||
small-q8_0
|
||||
medium
|
||||
medium.en
|
||||
medium-q5_0
|
||||
medium.en-q5_0
|
||||
medium-q8_0
|
||||
large-v1
|
||||
large-v2
|
||||
large-v2-q5_0
|
||||
large-v2-q8_0
|
||||
large-v3
|
||||
large-v3-q5_0
|
||||
large-v3-turbo
|
||||
large-v3-turbo-q5_0
|
||||
large-v3-turbo-q8_0
|
||||
].each do |name|
|
||||
@pre_converted_models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
|
||||
end
|
||||
|
||||
class << self
|
||||
attr_reader :pre_converted_models
|
||||
end
|
||||
end
|
||||
|
153
bindings/ruby/sig/whisper.rbs
Normal file
153
bindings/ruby/sig/whisper.rbs
Normal file
@ -0,0 +1,153 @@
|
||||
module Whisper
|
||||
interface _Samples
|
||||
def length: () -> Integer
|
||||
def each: { (Float) -> void } -> void
|
||||
end
|
||||
|
||||
type log_callback = ^(Integer level, String message, Object user_data) -> void
|
||||
type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
|
||||
type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
|
||||
type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish
|
||||
|
||||
LOG_LEVEL_NONE: Integer
|
||||
LOG_LEVEL_INFO: Integer
|
||||
LOG_LEVEL_WARN: Integer
|
||||
LOG_LEVEL_ERROR: Integer
|
||||
LOG_LEVEL_DEBUG: Integer
|
||||
LOG_LEVEL_CONT: Integer
|
||||
|
||||
def self.lang_max_id: () -> Integer
|
||||
def self.lang_id: (string name) -> Integer
|
||||
def self.lang_str: (Integer id) -> String
|
||||
def self.lang_str_full: (Integer id) -> String
|
||||
def self.log_set=: (log_callback) -> log_callback
|
||||
def self.finalize_log_callback: (void) -> void # Second argument of ObjectSpace.define_finalizer
|
||||
|
||||
class Context
|
||||
def initialize: (string | _ToPath | ::URI::HTTP ) -> void
|
||||
def transcribe: (string, Params) -> void
|
||||
| (string, Params) { (String) -> void } -> void
|
||||
def model_n_vocab: () -> Integer
|
||||
def model_n_audio_ctx: () -> Integer
|
||||
def model_n_audio_state: () -> Integer
|
||||
def model_n_text_head: () -> Integer
|
||||
def model_n_text_layer: () -> Integer
|
||||
def model_n_mels: () -> Integer
|
||||
def model_ftype: () -> Integer
|
||||
def model_type: () -> String
|
||||
def full_n_segments: () -> Integer
|
||||
def full_lang_id: () -> Integer
|
||||
def full_get_segment_t0: (Integer) -> Integer
|
||||
def full_get_segment_t1: (Integer) -> Integer
|
||||
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
|
||||
def full_get_segment_text: (Integer) -> String
|
||||
def full_get_segment_no_speech_prob: (Integer) -> Float
|
||||
def full: (Params, Array[Float], ?Integer) -> void
|
||||
| (Params, _Samples, ?Integer) -> void
|
||||
def full_parallel: (Params, Array[Float], ?Integer) -> void
|
||||
| (Params, _Samples, ?Integer) -> void
|
||||
| (Params, _Samples, ?Integer?, Integer) -> void
|
||||
def each_segment: { (Segment) -> void } -> void
|
||||
| () -> Enumerator[Segment]
|
||||
def model: () -> Model
|
||||
end
|
||||
|
||||
class Params
|
||||
def initialize: () -> void
|
||||
def language=: (String) -> String # TODO: Enumerate lang names
|
||||
def language: () -> String
|
||||
def translate=: (boolish) -> boolish
|
||||
def translate: () -> (true | false)
|
||||
def no_context=: (boolish) -> boolish
|
||||
def no_context: () -> (true | false)
|
||||
def single_segment=: (boolish) -> boolish
|
||||
def single_segment: () -> (true | false)
|
||||
def print_special=: (boolish) -> boolish
|
||||
def print_special: () -> (true | false)
|
||||
def print_progress=: (boolish) -> boolish
|
||||
def print_progress: () -> (true | false)
|
||||
def print_realtime=: (boolish) -> boolish
|
||||
def print_realtime: () -> (true | false)
|
||||
def print_timestamps=: (boolish) -> boolish
|
||||
def print_timestamps: () -> (true | false)
|
||||
def suppress_blank=: (boolish) -> boolish
|
||||
def suppress_blank: () -> (true | false)
|
||||
def suppress_nst=: (boolish) -> boolish
|
||||
def suppress_nst: () -> (true | false)
|
||||
def token_timestamps=: (boolish) -> boolish
|
||||
def token_timestamps: () -> (true | false)
|
||||
def split_on_word=: (boolish) -> boolish
|
||||
def split_on_word: () -> (true | false)
|
||||
def initial_prompt=: (_ToS) -> _ToS
|
||||
def initial_prompt: () -> String
|
||||
def diarize=: (boolish) -> boolish
|
||||
def diarize: () -> (true | false)
|
||||
def offset=: (Integer) -> Integer
|
||||
def offset: () -> Integer
|
||||
def duration=: (Integer) -> Integer
|
||||
def duration: () -> Integer
|
||||
def max_text_tokens=: (Integer) -> Integer
|
||||
def max_text_tokens: () -> Integer
|
||||
def temperature=: (Float) -> Float
|
||||
def temperature: () -> Float
|
||||
def max_initial_ts=: (Float) -> Float
|
||||
def max_initial_ts: () -> Float
|
||||
def length_penalty=: (Float) -> Float
|
||||
def length_penalty: () -> Float
|
||||
def temperature_inc=: (Float) -> Float
|
||||
def temperature_inc: () -> Float
|
||||
def entropy_thold=: (Float) -> Float
|
||||
def entropy_thold: () -> Float
|
||||
def logprob_thold=: (Float) -> Float
|
||||
def logprob_thold: () -> Float
|
||||
def no_speech_thold=: (Float) -> Float
|
||||
def no_speech_thold: () -> Float
|
||||
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
|
||||
def new_segment_callback_user_data=: (Object) -> Object
|
||||
def progress_callback=: (progress_callback) -> progress_callback
|
||||
def progress_callback_user_data=: (Object) -> Object
|
||||
def abort_callback=: (abort_callback) -> abort_callback
|
||||
def abort_callback_user_data=: (Object) -> Object
|
||||
def on_new_segment: { (Segment) -> void } -> void
|
||||
def on_progress: { (Integer) -> void } -> void
|
||||
def abort_on: { (Object) -> boolish } -> void
|
||||
end
|
||||
|
||||
class Model
|
||||
def self.pre_converted_models: () -> Hash[String, Model::URI]
|
||||
def initialize: () -> void
|
||||
def n_vocab: () -> Integer
|
||||
def n_audio_ctx: () -> Integer
|
||||
def n_audio_state: () -> Integer
|
||||
def n_audio_head: () -> Integer
|
||||
def n_audio_layer: () -> Integer
|
||||
def n_text_ctx: () -> Integer
|
||||
def n_text_state: () -> Integer
|
||||
def n_text_head: () -> Integer
|
||||
def n_text_layer: () -> Integer
|
||||
def n_mels: () -> Integer
|
||||
def ftype: () -> Integer
|
||||
def type: () -> String
|
||||
|
||||
class URI
|
||||
def initialize: (string | ::URI::HTTP) -> void
|
||||
def to_path: -> String
|
||||
def clear_cache: -> void
|
||||
end
|
||||
end
|
||||
|
||||
class Segment
|
||||
def initialize: () -> void
|
||||
def start_time: () -> Integer
|
||||
def end_time: () -> Integer
|
||||
def speaker_next_turn?: () -> (true | false)
|
||||
def text: () -> String
|
||||
def no_speech_prob: () -> Float
|
||||
end
|
||||
|
||||
class Error < StandardError
|
||||
attr_reader code: Integer
|
||||
|
||||
def initialize: (Integer) -> void
|
||||
end
|
||||
end
|
@ -68,4 +68,42 @@ class TestModel < TestBase
|
||||
assert_path_exist path
|
||||
assert_equal 147964211, File.size(path)
|
||||
end
|
||||
|
||||
def test_uri_string
|
||||
path = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin"
|
||||
whisper = Whisper::Context.new(path)
|
||||
model = whisper.model
|
||||
|
||||
assert_equal 51864, model.n_vocab
|
||||
assert_equal 1500, model.n_audio_ctx
|
||||
assert_equal 512, model.n_audio_state
|
||||
assert_equal 8, model.n_audio_head
|
||||
assert_equal 6, model.n_audio_layer
|
||||
assert_equal 448, model.n_text_ctx
|
||||
assert_equal 512, model.n_text_state
|
||||
assert_equal 8, model.n_text_head
|
||||
assert_equal 6, model.n_text_layer
|
||||
assert_equal 80, model.n_mels
|
||||
assert_equal 1, model.ftype
|
||||
assert_equal "base", model.type
|
||||
end
|
||||
|
||||
def test_uri
|
||||
path = URI("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin")
|
||||
whisper = Whisper::Context.new(path)
|
||||
model = whisper.model
|
||||
|
||||
assert_equal 51864, model.n_vocab
|
||||
assert_equal 1500, model.n_audio_ctx
|
||||
assert_equal 512, model.n_audio_state
|
||||
assert_equal 8, model.n_audio_head
|
||||
assert_equal 6, model.n_audio_layer
|
||||
assert_equal 448, model.n_text_ctx
|
||||
assert_equal 512, model.n_text_state
|
||||
assert_equal 8, model.n_text_head
|
||||
assert_equal 6, model.n_text_layer
|
||||
assert_equal 80, model.n_mels
|
||||
assert_equal 1, model.ftype
|
||||
assert_equal "base", model.type
|
||||
end
|
||||
end
|
||||
|
@ -43,6 +43,7 @@ struct whisper_params {
|
||||
float word_thold = 0.01f;
|
||||
float entropy_thold = 2.40f;
|
||||
float logprob_thold = -1.00f;
|
||||
float no_speech_thold = 0.6f;
|
||||
float grammar_penalty = 100.0f;
|
||||
float temperature = 0.0f;
|
||||
float temperature_inc = 0.2f;
|
||||
@ -70,6 +71,7 @@ struct whisper_params {
|
||||
bool log_score = false;
|
||||
bool use_gpu = true;
|
||||
bool flash_attn = false;
|
||||
bool suppress_nst = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string prompt;
|
||||
@ -104,6 +106,11 @@ static char * whisper_param_turn_lowercase(char * in){
|
||||
return in;
|
||||
}
|
||||
|
||||
static char * requires_value_error(const std::string & arg) {
|
||||
fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
@ -122,21 +129,23 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
||||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
||||
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
||||
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
|
||||
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
|
||||
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||
@ -148,30 +157,31 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
||||
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
|
||||
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
||||
else if (arg == "-fp" || arg == "--font-path") { params.font_path = ARGV_NEXT; }
|
||||
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
||||
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
|
||||
else if (arg == "-ojf" || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
|
||||
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
||||
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(ARGV_NEXT); }
|
||||
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
||||
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(argv[++i]); }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
|
||||
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
|
||||
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
||||
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
||||
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
|
||||
else if ( arg == "--prompt") { params.prompt = ARGV_NEXT; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); }
|
||||
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = ARGV_NEXT; }
|
||||
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = ARGV_NEXT; }
|
||||
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
|
||||
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
||||
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
|
||||
else if ( arg == "--suppress-regex") { params.suppress_regex = argv[++i]; }
|
||||
else if ( arg == "--grammar") { params.grammar = argv[++i]; }
|
||||
else if ( arg == "--grammar-rule") { params.grammar_rule = argv[++i]; }
|
||||
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
|
||||
else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
|
||||
else if ( arg == "--suppress-regex") { params.suppress_regex = ARGV_NEXT; }
|
||||
else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; }
|
||||
else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; }
|
||||
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -202,6 +212,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
||||
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
||||
fprintf(stderr, " -nth N, --no-speech-thold N [%-7.2f] no speech threshold\n", params.no_speech_thold);
|
||||
fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
|
||||
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
|
||||
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
||||
@ -234,6 +245,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
||||
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
|
||||
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
||||
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
||||
fprintf(stderr, " -sns, --suppress-nst [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false");
|
||||
fprintf(stderr, " --suppress-regex REGEX [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
|
||||
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
|
||||
fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str());
|
||||
@ -1121,9 +1133,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
wparams.no_speech_thold = params.no_speech_thold;
|
||||
|
||||
wparams.no_timestamps = params.no_timestamps;
|
||||
|
||||
wparams.suppress_nst = params.suppress_nst;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
|
||||
|
||||
const auto & grammar_parsed = params.grammar_parsed;
|
||||
|
@ -24,6 +24,10 @@ int main(int argc, char** argv) {
|
||||
replacement_filename = "whisper-cli";
|
||||
}
|
||||
|
||||
if (filename == "main.exe") {
|
||||
replacement_filename = "whisper-cli.exe";
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
|
||||
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
|
||||
|
@ -27,7 +27,7 @@
|
||||
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
|
||||
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
|
||||
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */; };
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */; };
|
||||
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
|
||||
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
|
||||
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
|
||||
@ -88,7 +88,7 @@
|
||||
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.c"; sourceTree = "<group>"; };
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
|
||||
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
|
||||
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
|
||||
@ -134,7 +134,7 @@
|
||||
children = (
|
||||
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
|
||||
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */,
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */,
|
||||
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
|
||||
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
|
||||
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
|
||||
@ -278,7 +278,7 @@
|
||||
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
|
||||
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
|
||||
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */,
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.cpp in Sources */,
|
||||
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
|
||||
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
|
||||
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
|
||||
|
@ -74,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT)
|
||||
endif()
|
||||
|
||||
# general
|
||||
option(GGML_STATIC "ggml: static link libraries" OFF)
|
||||
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
|
||||
option(GGML_LTO "ggml: enable link time optimization" OFF)
|
||||
option(GGML_CCACHE "ggml: use ccache if available" ON)
|
||||
option(GGML_STATIC "ggml: static link libraries" OFF)
|
||||
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
|
||||
option(GGML_LTO "ggml: enable link time optimization" OFF)
|
||||
option(GGML_CCACHE "ggml: use ccache if available" ON)
|
||||
|
||||
# debug
|
||||
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
|
||||
@ -120,8 +120,9 @@ endif()
|
||||
option(GGML_LASX "ggml: enable lasx" ON)
|
||||
option(GGML_LSX "ggml: enable lsx" ON)
|
||||
option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_SVE "ggml: enable SVE" OFF)
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
|
||||
|
||||
if (WIN32)
|
||||
@ -251,26 +252,6 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
|
||||
install(TARGETS ggml-base LIBRARY)
|
||||
|
||||
# FIXME: this should be done in the backend cmake files
|
||||
if (GGML_METAL)
|
||||
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
|
||||
install(
|
||||
FILES src/ggml-metal/ggml-metal.metal
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
GROUP_READ
|
||||
WORLD_READ
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
if (NOT GGML_METAL_EMBED_LIBRARY)
|
||||
install(
|
||||
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_STANDALONE)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
||||
|
@ -1564,17 +1564,6 @@ extern "C" {
|
||||
int d1, // dilation dimension 1
|
||||
bool is_2D);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@ -1592,6 +1581,23 @@ extern "C" {
|
||||
int s, // stride
|
||||
int d); // dilation
|
||||
|
||||
// depthwise
|
||||
// TODO: this is very likely wrong for some cases! - needs more testing
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride
|
||||
int p0, // padding
|
||||
int d0); // dilation
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride
|
||||
int d0); // dilation
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@ -1611,7 +1617,6 @@ extern "C" {
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
|
||||
// kernel size is a->ne[0] x a->ne[1]
|
||||
// stride is equal to kernel size
|
||||
// padding is zero
|
||||
@ -1638,6 +1643,18 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// depthwise
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
|
||||
# write the shared library to the output directory
|
||||
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
||||
add_dependencies(ggml ${backend})
|
||||
else()
|
||||
add_library(${backend} ${ARGN})
|
||||
target_link_libraries(ggml PUBLIC ${backend})
|
||||
@ -289,9 +290,9 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AVX-VNNI or AMX
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
else ()
|
||||
|
@ -66,6 +66,26 @@
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
static std::wstring utf8_to_utf16(const std::string & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.from_bytes(str);
|
||||
}
|
||||
|
||||
static std::string utf16_to_utf8(const std::wstring & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.to_bytes(str);
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
|
||||
return handle;
|
||||
}
|
||||
|
||||
static dl_handle * dl_load_library(const std::string & path) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return dl_load_library(converter.from_bytes(path));
|
||||
}
|
||||
|
||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
@ -114,8 +129,8 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
static void * dl_load_library(const std::string & path) {
|
||||
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
static void * dl_load_library(const std::wstring & path) {
|
||||
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
@ -202,11 +217,11 @@ struct ggml_backend_registry {
|
||||
devices.push_back(device);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
||||
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@ -214,7 +229,7 @@ struct ggml_backend_registry {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn && score_fn() == 0) {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@ -222,7 +237,7 @@ struct ggml_backend_registry {
|
||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||
if (!backend_init_fn) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@ -231,16 +246,16 @@ struct ggml_backend_registry {
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||
|
||||
register_backend(reg, std::move(handle));
|
||||
|
||||
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
return get_reg().load_backend(path, false);
|
||||
return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
static std::string get_executable_path() {
|
||||
static std::wstring get_executable_path() {
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
@ -401,13 +416,17 @@ static std::string get_executable_path() {
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + "/";
|
||||
#elif defined(__linux__)
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||
std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
while (true) {
|
||||
// get executable path
|
||||
# if defined(__linux__)
|
||||
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
|
||||
# elif defined(__FreeBSD__)
|
||||
ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
|
||||
# endif
|
||||
if (len == -1) {
|
||||
break;
|
||||
}
|
||||
@ -423,57 +442,63 @@ static std::string get_executable_path() {
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
|
||||
return base_path + "/";
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
#elif defined(_WIN32)
|
||||
std::vector<char> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
|
||||
std::vector<wchar_t> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||
if (len == 0) {
|
||||
return "";
|
||||
return {};
|
||||
}
|
||||
std::string base_path(path.data(), len);
|
||||
std::wstring base_path(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('\\');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + "\\";
|
||||
return base_path + L"\\";
|
||||
#else
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string backend_filename_prefix() {
|
||||
static std::wstring backend_filename_prefix() {
|
||||
#ifdef _WIN32
|
||||
return "ggml-";
|
||||
return L"ggml-";
|
||||
#else
|
||||
return "libggml-";
|
||||
return L"libggml-";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string backend_filename_suffix() {
|
||||
static std::wstring backend_filename_suffix() {
|
||||
#ifdef _WIN32
|
||||
return ".dll";
|
||||
return L".dll";
|
||||
#else
|
||||
return ".so";
|
||||
return L".so";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring path_separator() {
|
||||
#ifdef _WIN32
|
||||
return L"\\";
|
||||
#else
|
||||
return L"/";
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||
// TODO: search system paths
|
||||
std::string file_prefix = backend_filename_prefix() + name + "-";
|
||||
std::vector<std::string> search_paths;
|
||||
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||
std::vector<std::wstring> search_paths;
|
||||
if (user_search_path == nullptr) {
|
||||
search_paths.push_back("./");
|
||||
search_paths.push_back(L"." + path_separator());
|
||||
search_paths.push_back(get_executable_path());
|
||||
} else {
|
||||
#if defined(_WIN32)
|
||||
search_paths.push_back(std::string(user_search_path) + "\\");
|
||||
#else
|
||||
search_paths.push_back(std::string(user_search_path) + "/");
|
||||
#endif
|
||||
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
std::string best_path;
|
||||
std::wstring best_path;
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
for (const auto & search_path : search_paths) {
|
||||
@ -483,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||
for (const auto & entry : dir_it) {
|
||||
if (entry.is_regular_file()) {
|
||||
std::string filename = entry.path().filename().string();
|
||||
std::string ext = entry.path().extension().string();
|
||||
std::wstring filename = entry.path().filename().wstring();
|
||||
std::wstring ext = entry.path().extension().wstring();
|
||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
|
||||
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
}
|
||||
if (handle) {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn) {
|
||||
int s = score_fn();
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||
#endif
|
||||
if (s > best_score) {
|
||||
best_score = s;
|
||||
best_path = entry.path().string();
|
||||
best_path = entry.path().wstring();
|
||||
}
|
||||
} else {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -515,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (best_score == 0) {
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
|
||||
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path.c_str(), silent);
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return get_reg().load_backend(best_path.c_str(), silent);
|
||||
return get_reg().load_backend(best_path, silent);
|
||||
}
|
||||
|
||||
void ggml_backend_load_all() {
|
||||
|
@ -795,9 +795,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
||||
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
||||
GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
||||
GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
|
||||
sched->splits[cur_split].n_inputs);
|
||||
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
||||
if (j == 0) {
|
||||
GGML_LOG_DEBUG(": ");
|
||||
}
|
||||
GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
||||
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
||||
}
|
||||
|
@ -74,112 +74,96 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
|
||||
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND
|
||||
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
|
||||
|
||||
message(STATUS "ARM detected")
|
||||
|
||||
if (MSVC)
|
||||
list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_NEON)
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
|
||||
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
|
||||
|
||||
message(STATUS "ARM feature DOTPROD enabled")
|
||||
endif ()
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
|
||||
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
|
||||
|
||||
message(STATUS "ARM feature MATMUL_INT8 enabled")
|
||||
endif ()
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
|
||||
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
|
||||
endif ()
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
|
||||
elseif (APPLE)
|
||||
if (GGML_NATIVE)
|
||||
set(USER_PROVIDED_MARCH FALSE)
|
||||
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
|
||||
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
|
||||
set(USER_PROVIDED_MARCH TRUE)
|
||||
break()
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
if (NOT USER_PROVIDED_MARCH)
|
||||
set(MARCH_FLAGS "-march=armv8.2a")
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
|
||||
|
||||
message(STATUS "ARM feature DOTPROD enabled")
|
||||
endif ()
|
||||
|
||||
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
|
||||
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
|
||||
|
||||
message(STATUS "ARM feature MATMUL_INT8 enabled")
|
||||
endif ()
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
|
||||
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
|
||||
endif ()
|
||||
endif ()
|
||||
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||
# Raspberry Pi 1, Zero
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
||||
# Android armeabi-v7a
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
else()
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
|
||||
if (GGML_NATIVE)
|
||||
# -mcpu=native does not always enable all the features in some compilers,
|
||||
# so we check for them manually and enable them if available
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
|
||||
INPUT_FILE "/dev/null"
|
||||
OUTPUT_QUIET
|
||||
ERROR_VARIABLE ARM_MCPU
|
||||
RESULT_VARIABLE ARM_MCPU_RESULT
|
||||
)
|
||||
if (NOT ARM_MCPU_RESULT)
|
||||
string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
|
||||
endif()
|
||||
if ("${ARM_MCPU_FLAG}" STREQUAL "")
|
||||
set(ARM_MCPU_FLAG -mcpu=native)
|
||||
message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
|
||||
endif()
|
||||
|
||||
include(CheckCXXSourceRuns)
|
||||
|
||||
function(check_arm_feature tag code)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
|
||||
check_cxx_source_runs(
|
||||
"${code}"
|
||||
GGML_MACHINE_SUPPORTS_${tag}
|
||||
)
|
||||
if (GGML_MACHINE_SUPPORTS_${tag})
|
||||
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
|
||||
else()
|
||||
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
endfunction()
|
||||
|
||||
check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
|
||||
check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
|
||||
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
|
||||
|
||||
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
|
||||
else()
|
||||
if (GGML_CPU_ARM_ARCH)
|
||||
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
||||
endif()
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Android arm64-v8a
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
|
||||
# show enabled features
|
||||
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
|
||||
set(FEAT_INPUT_FILE "NUL")
|
||||
else()
|
||||
set(FEAT_INPUT_FILE "/dev/null")
|
||||
endif()
|
||||
if (GGML_SVE)
|
||||
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
|
||||
INPUT_FILE ${FEAT_INPUT_FILE}
|
||||
OUTPUT_VARIABLE ARM_FEATURE
|
||||
RESULT_VARIABLE ARM_FEATURE_RESULT
|
||||
)
|
||||
if (ARM_FEATURE_RESULT)
|
||||
message(WARNING "Failed to get ARM features")
|
||||
else()
|
||||
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
||||
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
||||
if (NOT ${feature_pos} EQUAL -1)
|
||||
message(STATUS "ARM feature ${feature} enabled")
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
endif()
|
||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
||||
|
||||
message(STATUS "x86 detected")
|
||||
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (GGML_NATIVE)
|
||||
@ -231,8 +215,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
if (GGML_AVX_VNNI)
|
||||
# MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
|
||||
#list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
|
||||
list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
|
||||
endif()
|
||||
else ()
|
||||
if (GGML_NATIVE)
|
||||
@ -339,6 +322,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
|
@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
|
||||
}
|
||||
|
||||
static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
|
||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
return _mm256_dpbusd_epi32(zero, ax, sy);
|
||||
#elif defined(__AVXVNNI__)
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
return _mm256_dpbusd_avx_epi32(zero, ax, sy);
|
||||
#else
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
||||
@ -564,21 +567,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
|
||||
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
|
||||
|
||||
for (int c = 0; c < nc; c += ncols_interleaved) {
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
float32x4_t acc = vdupq_n_f32(0);
|
||||
for (int b = 0; b < nb; b++) {
|
||||
int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
|
||||
int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
|
||||
int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
|
||||
int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
|
||||
float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
|
||||
int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
|
||||
int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
|
||||
int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
|
||||
int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
|
||||
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
|
||||
|
||||
int8x16_t a0 = vld1q_s8(a_ptr->qs);
|
||||
int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
|
||||
float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
|
||||
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
|
||||
|
||||
int32x4_t ret = vdupq_n_s32(0);
|
||||
|
||||
@ -647,72 +650,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
|
||||
|
||||
__asm__ __volatile__(
|
||||
"movi v2.16b, #0x4\n"
|
||||
"movi v1.16b, #0xf0\n"
|
||||
"add %x[b_ptr], %x[b_ptr], #0x8\n"
|
||||
"1:" // Column loop
|
||||
"add x23, %x[a_ptr], #0x2\n"
|
||||
"movi v0.16b, #0x0\n"
|
||||
"mov x22, %x[nb]\n"
|
||||
"2:" // Block loop
|
||||
"ldr q31, [%x[b_ptr], #0x0]\n"
|
||||
"ldr q30, [%x[b_ptr], #0x10]\n"
|
||||
"mov x21, x23\n"
|
||||
"movi v29.4s, #0x0\n"
|
||||
"ldr q28, [%x[b_ptr], #0x20]\n"
|
||||
"ldr q27, [%x[b_ptr], #0x30]\n"
|
||||
"movi v26.4s, #0x0\n"
|
||||
"sub x20, x23, #0x2\n"
|
||||
"ld1r { v25.8h }, [x20]\n"
|
||||
"ldr q24, [%x[b_ptr], #-0x8]\n"
|
||||
"sub x22, x22, #0x1\n"
|
||||
"add x23, x23, #0x22\n"
|
||||
"ld1r { v23.2d }, [x21], #0x8\n"
|
||||
"sshl v22.16b, v31.16b, v2.16b\n"
|
||||
"sshl v16.16b, v30.16b, v2.16b\n"
|
||||
"add %x[b_ptr], %x[b_ptr], #0x48\n"
|
||||
"ld1r { v21.2d }, [x21], #0x8\n"
|
||||
"sshl v20.16b, v28.16b, v2.16b\n"
|
||||
"sshl v19.16b, v27.16b, v2.16b\n"
|
||||
"ld1r { v18.2d }, [x21], #0x8\n"
|
||||
"ld1r { v17.2d }, [x21], #0x8\n"
|
||||
"and v31.16b, v31.16b, v1.16b\n"
|
||||
"and v30.16b, v30.16b, v1.16b\n"
|
||||
".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
|
||||
".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
|
||||
"and v28.16b, v28.16b, v1.16b\n"
|
||||
"and v27.16b, v27.16b, v1.16b\n"
|
||||
"fcvtl v25.4s, v25.4h\n"
|
||||
"fcvtl v16.4s, v24.4h\n"
|
||||
".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
|
||||
".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
|
||||
"fmul v16.4s, v16.4s, v25.4s\n"
|
||||
".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
|
||||
".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
|
||||
".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
|
||||
".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
|
||||
"addp v29.4s, v29.4s, v26.4s\n"
|
||||
"scvtf v29.4s, v29.4s, #0x4\n"
|
||||
"fmla v0.4s, v29.4s, v16.4s\n"
|
||||
"cbnz x22, 2b\n"
|
||||
"sub %x[nc], %x[nc], #0x4\n"
|
||||
"str q0, [%x[res_ptr], #0x0]\n"
|
||||
"add %x[res_ptr], %x[res_ptr], #0x10\n"
|
||||
"cbnz %x[nc], 1b\n"
|
||||
: [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
|
||||
: [a_ptr] "r" (a_ptr), [nb] "r" (nb)
|
||||
: "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
|
||||
);
|
||||
for (int c = 0; c < nc; c += ncols_interleaved) {
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
float32x4_t acc = vdupq_n_f32(0);
|
||||
for (int b = 0; b < nb; b++) {
|
||||
int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
|
||||
int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
|
||||
int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
|
||||
int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
|
||||
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
|
||||
|
||||
int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
|
||||
int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
|
||||
int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
|
||||
int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
|
||||
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
|
||||
|
||||
int32x4_t ret0 = vdupq_n_s32(0);
|
||||
int32x4_t ret1 = vdupq_n_s32(0);
|
||||
|
||||
ret0 = vdotq_s32(ret0, b0 << 4, a0);
|
||||
ret1 = vdotq_s32(ret1, b1 << 4, a0);
|
||||
ret0 = vdotq_s32(ret0, b2 << 4, a1);
|
||||
ret1 = vdotq_s32(ret1, b3 << 4, a1);
|
||||
|
||||
ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
|
||||
ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
|
||||
ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
|
||||
ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
|
||||
|
||||
int32x4_t ret = vpaddq_s32(ret0, ret1);
|
||||
|
||||
acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
|
||||
vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
|
||||
a_ptr++;
|
||||
b_ptr++;
|
||||
}
|
||||
vst1q_f32(s, acc);
|
||||
s += ncols_interleaved;
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
|
@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
||||
}
|
||||
|
||||
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
#elif defined(__AVXVNNI__)
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
#else
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
||||
|
@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
||||
#define GGML_F16_STEP 32
|
||||
#define GGML_F16_EPR 4
|
||||
|
||||
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
||||
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
|
||||
float tmp[4];
|
||||
|
||||
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
||||
@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
||||
return _mm_loadu_ps(tmp);
|
||||
}
|
||||
|
||||
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
||||
static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
float arr[4];
|
||||
|
||||
_mm_storeu_ps(arr, y);
|
||||
@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
|
||||
if (src1_cont) {
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
if (!llamafile_sgemm(params,
|
||||
ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
(const char *)src1->data + i12*nb12 + i13*nb13,
|
||||
nb11/ggml_type_size(src1->type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
ith, nth,
|
||||
src0->type,
|
||||
src1->type,
|
||||
dst->type))
|
||||
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
if (!llamafile_sgemm(params,
|
||||
ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
||||
row_size/ggml_type_size(vec_dot_type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
ith, nth,
|
||||
src0->type,
|
||||
vec_dot_type,
|
||||
dst->type))
|
||||
|
@ -522,6 +522,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_sve()) {
|
||||
features.push_back({ "SVE", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_dotprod()) {
|
||||
features.push_back({ "DOTPROD", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_matmul_int8()) {
|
||||
features.push_back({ "MATMUL_INT8", "1" });
|
||||
}
|
||||
if (ggml_cpu_get_sve_cnt() > 0) {
|
||||
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
|
||||
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
|
||||
|
@ -53,6 +53,8 @@
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#include <atomic>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#else
|
||||
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
|
||||
return _mm512_fmadd_ps(a, b, c);
|
||||
}
|
||||
#endif
|
||||
#if defined(__AVX512BF16__)
|
||||
template <>
|
||||
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
|
||||
return _mm512_dpbf16_ps(c, a, b);
|
||||
}
|
||||
template <>
|
||||
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
|
||||
return _mm256_dpbf16_ps(c, a, b);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
@ -204,6 +216,7 @@ template <> inline float32x4_t load(const float *p) {
|
||||
return vld1q_f32(p);
|
||||
}
|
||||
#if !defined(_MSC_VER)
|
||||
// FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
template <> inline float16x8_t load(const ggml_fp16_t *p) {
|
||||
return vld1q_f16((const float16_t *)p);
|
||||
}
|
||||
@ -225,6 +238,13 @@ template <> inline __m256 load(const float *p) {
|
||||
}
|
||||
#endif // __AVX__
|
||||
|
||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
||||
template <> inline __m256 load(const ggml_bf16_t *p) {
|
||||
return _mm256_castsi256_ps(
|
||||
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
|
||||
}
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__F16C__)
|
||||
template <> inline __m256 load(const ggml_fp16_t *p) {
|
||||
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
|
||||
@ -238,8 +258,27 @@ template <> inline __m512 load(const float *p) {
|
||||
template <> inline __m512 load(const ggml_fp16_t *p) {
|
||||
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
|
||||
}
|
||||
template <> inline __m512 load(const ggml_bf16_t *p) {
|
||||
return _mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
|
||||
#if defined(__AVX512BF16__)
|
||||
template <> inline __m512bh load(const ggml_bf16_t *p) {
|
||||
return (__m512bh)_mm512_loadu_ps((const float *)p);
|
||||
}
|
||||
template <> inline __m256bh load(const ggml_bf16_t *p) {
|
||||
return (__m256bh)_mm256_loadu_ps((const float *)p);
|
||||
}
|
||||
template <> inline __m512bh load(const float *p) {
|
||||
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
|
||||
}
|
||||
template <> inline __m256bh load(const float *p) {
|
||||
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
@ -251,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// FLOATING POINT MATRIX MULTIPLICATION
|
||||
|
||||
template <int M>
|
||||
static inline int64_t BLOCK_SIZE(size_t m) {
|
||||
const int64_t NB_BLOC_M = (m + M - 1) / M;
|
||||
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
|
||||
}
|
||||
|
||||
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
|
||||
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
|
||||
}
|
||||
|
||||
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
|
||||
class tinyBLAS {
|
||||
public:
|
||||
tinyBLAS(int64_t k,
|
||||
tinyBLAS(const ggml_compute_params * params, int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
TC *C, int64_t ldc)
|
||||
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
bool matmul(int64_t m, int64_t n) {
|
||||
if (k % KN != 0)
|
||||
return false;
|
||||
// compute RM for only need tile with size RM&RM-1
|
||||
#if VECTOR_REGISTERS == 32
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
#else // VECTOR_REGISTERS == 16
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
|
||||
#if VECTOR_REGISTERS == 32
|
||||
case 0x55:
|
||||
mc = 5;
|
||||
nc = 5;
|
||||
gemm<5, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x45:
|
||||
mc = 4;
|
||||
nc = 5;
|
||||
gemm<4, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x54:
|
||||
mc = 5;
|
||||
nc = 4;
|
||||
gemm<5, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x44:
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm<4, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x53:
|
||||
mc = 5;
|
||||
nc = 3;
|
||||
gemm<5, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x35:
|
||||
mc = 3;
|
||||
nc = 5;
|
||||
gemm<3, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
#else
|
||||
case 0x55:
|
||||
case 0x54:
|
||||
case 0x53:
|
||||
case 0x45:
|
||||
case 0x44:
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x35:
|
||||
#endif
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x52:
|
||||
mc = 5;
|
||||
nc = 2;
|
||||
gemm<5, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm<3, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x25:
|
||||
mc = 2;
|
||||
nc = 5;
|
||||
gemm<2, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm<2, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm<3, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm<2, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x51:
|
||||
mc = 5;
|
||||
nc = 1;
|
||||
gemm<5, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm<2, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x15:
|
||||
mc = 1;
|
||||
nc = 5;
|
||||
gemm<1, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm<3, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm<1, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm<2, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm<1, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm<1, 1>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
template <int RM, int RN, int BM>
|
||||
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
|
||||
if (SIZE_N == RN) {
|
||||
return gemm<RM, RN, BM>(m, n, BN);
|
||||
}
|
||||
if constexpr (RN > 1) {
|
||||
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
|
||||
} else {
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_ASSERT(false); // we have miss something.
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
D Cv[RN][RM] = {};
|
||||
for (int64_t l = 0; l < k; l += KN)
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
|
||||
load<V>(B + ldb * (jj + j) + l),
|
||||
Cv[j][i]);
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||
inline void gemm_bloc(int64_t ii, int64_t jj) {
|
||||
D Cv[RN][RM] = {};
|
||||
for (int64_t l = 0; l < k; l += KN) {
|
||||
// help compiler for op order.
|
||||
if constexpr (RM <= RN) {
|
||||
V Av[RM];
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
Av[i] = load<V>(A + lda * (ii + i) + l);
|
||||
}
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
V Bv = load<V>(B + ldb * (jj + j) + l);
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
V Bv[RN];
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
Bv[j] = load<V>(B + ldb * (jj + j) + l);
|
||||
}
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
V Av = load<V>(A + lda * (ii + i) + l);
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||
}
|
||||
|
||||
template <int RM, int RN, int BM>
|
||||
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
|
||||
static std::atomic<int64_t> current_chunk;
|
||||
|
||||
GGML_ASSERT(m % (RM * BM) == 0);
|
||||
const int64_t ytiles = m / (RM * BM);
|
||||
const int64_t xtiles = (n + RN -1) / RN;
|
||||
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
|
||||
|
||||
// "round" bloc_size to "nearest" BN
|
||||
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
|
||||
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
|
||||
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
|
||||
const int64_t nb_job = ytiles * NB_BN;
|
||||
|
||||
if (params->ith == 0) {
|
||||
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
|
||||
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
||||
std::atomic_store_explicit(¤t_chunk, (int64_t)params->nth, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
int64_t job = params->ith;
|
||||
while (job < nb_job) {
|
||||
const int64_t ii = (job % ytiles) * RM * BM;
|
||||
const int64_t jb = job / ytiles;
|
||||
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
||||
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
|
||||
|
||||
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
|
||||
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
|
||||
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
|
||||
|
||||
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
|
||||
int64_t jj = jj0;
|
||||
for (; jj < jj1; jj += RN) {
|
||||
gemm_bloc<RM, RN>(ii + bi, jj);
|
||||
}
|
||||
if constexpr (RN > 1) {
|
||||
for (; jj < jj2; jj += RN - 1) {
|
||||
gemm_bloc<RM, RN-1>(ii + bi, jj);
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(jj == jj2);
|
||||
}
|
||||
|
||||
// next step.
|
||||
job = std::atomic_fetch_add_explicit(¤t_chunk, (int64_t)1, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
return;
|
||||
}
|
||||
|
||||
const ggml_compute_params * params;
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *const C;
|
||||
@ -451,8 +461,6 @@ class tinyBLAS {
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -992,8 +1000,10 @@ class tinyBLAS_Q0_AVX {
|
||||
|
||||
inline __m256 updot(__m256i u, __m256i s) {
|
||||
__m256i res;
|
||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
||||
res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
|
||||
#elif defined(__AVXVNNI__)
|
||||
res = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), u, s);
|
||||
#else
|
||||
res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
|
||||
#endif
|
||||
@ -1656,8 +1666,9 @@ class tinyBLAS_PPC {
|
||||
* @param Ctype is GGML data type of `C`
|
||||
* @return true if this function was able to service the matmul request
|
||||
*/
|
||||
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
|
||||
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||
int64_t ldc, int Atype, int Btype, int Ctype) {
|
||||
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
@ -1665,8 +1676,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
assert(lda >= k);
|
||||
assert(ldb >= k);
|
||||
assert(ldc >= m);
|
||||
assert(nth > 0);
|
||||
assert(ith < nth);
|
||||
assert(params->nth > 0);
|
||||
assert(params->ith < params->nth);
|
||||
|
||||
// only enable sgemm for prompt processing
|
||||
if (n < 2)
|
||||
@ -1681,37 +1692,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
#if defined(__AVX512F__)
|
||||
if (k % 16)
|
||||
return false;
|
||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{
|
||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
#elif defined(__AVX__) || defined(__AVX2__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{
|
||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
#elif defined(__ARM_NEON)
|
||||
if (n < 4)
|
||||
return false;
|
||||
if (k % 4)
|
||||
return false;
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
#elif defined(__MMA__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
@ -1719,7 +1718,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1727,60 +1726,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
#endif
|
||||
}
|
||||
|
||||
case GGML_TYPE_BF16: {
|
||||
#if defined(__AVX512BF16__)
|
||||
if (Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__AVX512F__)
|
||||
if (Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__AVX2__)
|
||||
if (Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
#if defined(__AVX512F__)
|
||||
if (k % 16)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
|
||||
(const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
|
||||
(const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
|
||||
if (n < 8)
|
||||
return false;
|
||||
if (k % 8)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F16)
|
||||
return false;
|
||||
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
if (k % 4)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
if (Btype == GGML_TYPE_F32) {
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
case GGML_TYPE_Q8_0: {
|
||||
@ -1791,7 +1801,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q8_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__ARM_FEATURE_DOTPROD)
|
||||
@ -1799,7 +1809,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q8_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1815,7 +1825,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q4_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__ARM_FEATURE_DOTPROD)
|
||||
@ -1823,7 +1833,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q4_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1839,7 +1849,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q5_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1855,7 +1865,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_iq4_nl *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1867,6 +1877,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
return false;
|
||||
}
|
||||
|
||||
(void)params;
|
||||
(void)m;
|
||||
(void)n;
|
||||
(void)k;
|
||||
@ -1876,8 +1887,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
(void)ldb;
|
||||
(void)C;
|
||||
(void)ldc;
|
||||
(void)ith;
|
||||
(void)nth;
|
||||
(void)Atype;
|
||||
(void)Btype;
|
||||
(void)Ctype;
|
||||
|
@ -5,8 +5,8 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
|
||||
const void *, int64_t, void *, int64_t, int, int,
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
|
||||
const void *, int64_t, const void *, int64_t, void *, int64_t,
|
||||
int, int, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -551,6 +551,22 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// expose GGUF internals for test code
|
||||
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
GGML_API struct gguf_buf gguf_buf_init(size_t size);
|
||||
GGML_API void gguf_buf_free(struct gguf_buf buf);
|
||||
|
||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -103,3 +103,19 @@ else()
|
||||
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
)
|
||||
endif() # GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
if (NOT GGML_METAL_EMBED_LIBRARY)
|
||||
install(
|
||||
FILES src/ggml-metal/ggml-metal.metal
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
GROUP_READ
|
||||
WORLD_READ
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
install(
|
||||
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
)
|
||||
endif()
|
||||
|
@ -2067,8 +2067,8 @@ static void ggml_metal_encode_node(
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
||||
const uint r2 = ne12/ne02;
|
||||
const uint r3 = ne13/ne03;
|
||||
const uint32_t r2 = ne12/ne02;
|
||||
const uint32_t r3 = ne13/ne03;
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
|
@ -11,6 +11,8 @@
|
||||
//
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
int get_current_device_id() {
|
||||
@ -65,9 +67,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
|
||||
const ggml_sycl_op_flatten_t op) try {
|
||||
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
if(use_src1)
|
||||
GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
|
||||
GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
|
||||
|
||||
// dd = data device
|
||||
float * src0_ddf = (float *) src0->data;
|
||||
|
@ -26,7 +26,11 @@
|
||||
|
||||
#define GGML_COMMON_DECL_SYCL
|
||||
#define GGML_COMMON_IMPL_SYCL
|
||||
/* suppress warning spam */
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
||||
#include "ggml-common.h"
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
void* ggml_sycl_host_malloc(size_t size);
|
||||
void ggml_sycl_host_free(void* ptr);
|
||||
|
@ -288,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||
if (tensor->view_src != NULL) {
|
||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
tensor->backend = tensor->view_src->backend;
|
||||
tensor->extra = tensor->view_src->extra;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -539,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
auto dev_count = ggml_backend_sycl_get_device_count();
|
||||
|
||||
if (device>=dev_count or device<0) {
|
||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
device, dev_count-1);
|
||||
GGML_ASSERT(device<dev_count);
|
||||
}
|
||||
@ -567,7 +565,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
|
||||
|
||||
int device = ctx->device;
|
||||
if (device>=ggml_sycl_info().device_count or device<0) {
|
||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
device, ggml_sycl_info().device_count-1);
|
||||
GGML_ASSERT(device<ggml_sycl_info().device_count);
|
||||
}
|
||||
@ -746,7 +744,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
|
||||
// FIXME: do not crash if cudaMalloc fails
|
||||
// FIXME: do not crash if SYCL Buffer alloc fails
|
||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_sycl_set_device(i);
|
||||
const queue_ptr stream = ctx->streams[i];
|
||||
@ -788,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
|
||||
}
|
||||
}
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
@ -2349,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
||||
|
||||
dpct::memcpy_direction kind;
|
||||
char * src_ptr;
|
||||
if (src->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||
kind = dpct::host_to_device;
|
||||
//GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
|
||||
src_ptr = (char *) src->data;
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
|
||||
} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
} else if (ggml_backend_buffer_is_sycl(src->buffer)) {
|
||||
// If buffer is a SYCL buffer
|
||||
//GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
|
||||
kind = dpct::device_to_device;
|
||||
src_ptr = (char *) src->data;
|
||||
} else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
|
||||
/*
|
||||
If buffer is a SYCL split buffer
|
||||
*/
|
||||
//GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
|
||||
GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
|
||||
kind = dpct::device_to_device;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
int id;
|
||||
@ -2857,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
||||
|
||||
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
||||
@ -2878,7 +2885,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
|
||||
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
||||
GGML_ASSERT(!(split && ne02 > 1));
|
||||
GGML_ASSERT(!(split && ne03 > 1));
|
||||
GGML_ASSERT(!(split && ne02 < ne12));
|
||||
@ -3198,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst) try {
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@ -3231,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@ -3293,7 +3300,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
ggml_tensor *dst) try {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
@ -4638,10 +4645,9 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
|
||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
|
||||
GGML_UNUSED(reg);
|
||||
|
||||
// TODO: update to the current function signature
|
||||
//if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
// return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
//}
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
}
|
||||
|
||||
// SYCL doesn't support registering host memory, left here for reference
|
||||
// "ggml_backend_register_host_buffer"
|
||||
|
@ -145,6 +145,8 @@ class vk_perf_logger;
|
||||
#endif
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
|
||||
struct vk_device_struct {
|
||||
std::mutex mutex;
|
||||
|
||||
@ -202,8 +204,8 @@ struct vk_device_struct {
|
||||
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
|
||||
|
||||
vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
|
||||
|
||||
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
|
||||
@ -411,7 +413,7 @@ struct vk_op_unary_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t d_offset;
|
||||
uint32_t misalign_offsets;
|
||||
float param1; float param2;
|
||||
uint32_t ne0_012mp; uint32_t ne0_012L;
|
||||
uint32_t ne0_01mp; uint32_t ne0_01L;
|
||||
@ -459,7 +461,7 @@ struct vk_op_binary_push_constants {
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
|
||||
uint32_t d_offset;
|
||||
uint32_t misalign_offsets;
|
||||
float param1; float param2; int32_t param3;
|
||||
};
|
||||
|
||||
@ -546,7 +548,7 @@ struct vk_staging_memcpy {
|
||||
};
|
||||
|
||||
struct vk_op_upscale_push_constants {
|
||||
uint32_t ne; uint32_t d_offset;
|
||||
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
|
||||
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
|
||||
float sf0; float sf1; float sf2; float sf3;
|
||||
@ -1404,10 +1406,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
// spec constants and tile sizes for non-quant matmul/matmul_id
|
||||
l_warptile = { 256, 128, 256, 64 };
|
||||
m_warptile = { 256, 128, 128, 64 };
|
||||
s_warptile = { 128, 32, 16, 64 };
|
||||
s_warptile = { 128, 64, 64, 64 };
|
||||
l_wg_denoms = {128, 256, 1 };
|
||||
m_wg_denoms = {128, 128, 1 };
|
||||
s_wg_denoms = { 32, 16, 1 };
|
||||
s_wg_denoms = { 64, 64, 1 };
|
||||
|
||||
// spec constants and tile sizes for quant matmul (non-Qi_K)
|
||||
l_warptile_mmq = { 256, 128, 256, 64 };
|
||||
@ -1855,53 +1857,60 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
// mul mat vec
|
||||
|
||||
// AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled
|
||||
uint32_t rm = 1;
|
||||
if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL)
|
||||
rm = 2;
|
||||
// the number of rows computed per shader depends on GPU model and quant
|
||||
uint32_t rm_stdq = 1;
|
||||
uint32_t rm_kq = 2;
|
||||
if (device->vendor_id == VK_VENDOR_ID_AMD) {
|
||||
if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
|
||||
rm_stdq = 2;
|
||||
rm_kq = 4;
|
||||
}
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
|
||||
rm_stdq = 2;
|
||||
|
||||
// computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
||||
for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32_"+std::to_string(i+1), mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
||||
|
||||
// dequant shaders
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
@ -2012,11 +2021,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
@ -2887,9 +2896,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
||||
return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
|
||||
}
|
||||
|
||||
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
|
||||
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) {
|
||||
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
|
||||
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols);
|
||||
|
||||
switch (a_type) {
|
||||
case GGML_TYPE_F32:
|
||||
@ -2910,7 +2920,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type];
|
||||
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type][num_cols-1];
|
||||
}
|
||||
|
||||
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
|
||||
@ -3205,8 +3215,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer buf;
|
||||
size_t buf_offset;
|
||||
vk_buffer buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
|
||||
|
||||
const uint64_t ne0 = tensor->ne[0];
|
||||
@ -3269,7 +3279,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
||||
VkBufferCopy buf_copy{ 0, offset, copy_size };
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
|
||||
vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
|
||||
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||
@ -3302,7 +3312,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer buf = nullptr;
|
||||
size_t buf_offset;
|
||||
size_t buf_offset = 0;
|
||||
ggml_vk_host_get(dst->device, src, buf, buf_offset);
|
||||
|
||||
if (buf != nullptr) {
|
||||
@ -3344,7 +3354,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
|
||||
copy_size};
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy);
|
||||
vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
|
||||
if (width == spitch) {
|
||||
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
|
||||
@ -3400,7 +3410,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
|
||||
|
||||
// Check if dst is pinned memory
|
||||
vk_buffer buf = nullptr;
|
||||
size_t buf_offset;
|
||||
size_t buf_offset = 0;
|
||||
ggml_vk_host_get(src->device, dst, buf, buf_offset);
|
||||
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
@ -3480,7 +3490,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
|
||||
|
||||
VkBufferCopy bc{ src_offset, dst_offset, size };
|
||||
|
||||
vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
|
||||
vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
||||
@ -3732,9 +3742,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||
|
||||
vk_buffer d_Qx;
|
||||
vk_buffer d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer d_Qy;
|
||||
vk_buffer d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
@ -3920,8 +3930,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
const uint64_t ne12 = src1->ne[2];
|
||||
const uint64_t ne13 = src1->ne[3];
|
||||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const uint64_t ne20 = dst->ne[0];
|
||||
const uint64_t ne21 = dst->ne[1];
|
||||
const uint64_t ne22 = dst->ne[2];
|
||||
@ -3930,13 +3938,18 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
// batch_n indicates that we need to compute a few vector results, and this assumes
|
||||
// ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides.
|
||||
GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1);
|
||||
bool batch_n = ne11 > 1;
|
||||
|
||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||
|
||||
vk_buffer d_Qx;
|
||||
vk_buffer d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer d_Qy;
|
||||
vk_buffer d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
@ -3980,7 +3993,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
|
||||
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11);
|
||||
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
||||
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
||||
GGML_ASSERT(dmmv != nullptr);
|
||||
@ -4052,8 +4065,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
||||
}
|
||||
|
||||
uint32_t stride_batch_x = ne00*ne01;
|
||||
uint32_t stride_batch_y = ne10*ne11;
|
||||
// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
|
||||
uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01;
|
||||
uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11);
|
||||
uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21);
|
||||
|
||||
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
|
||||
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
||||
@ -4076,7 +4091,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
// compute
|
||||
const vk_mat_vec_push_constants pc = {
|
||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
||||
stride_batch_x, stride_batch_y, stride_batch_d,
|
||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||
};
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@ -4112,7 +4127,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||
|
||||
vk_buffer d_Qy;
|
||||
vk_buffer d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
@ -4256,7 +4271,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
|
||||
!ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
|
||||
ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
|
||||
} else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
|
||||
// mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
|
||||
// when ne12 and ne13 are one.
|
||||
} else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
|
||||
ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
|
||||
} else {
|
||||
ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun);
|
||||
@ -4300,11 +4318,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
|
||||
|
||||
vk_buffer d_Qx;
|
||||
vk_buffer d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer d_Qy;
|
||||
vk_buffer d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
vk_buffer d_ids;
|
||||
vk_buffer d_ids = nullptr;
|
||||
size_t ids_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
@ -4505,11 +4523,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
|
||||
|
||||
vk_buffer d_Qx;
|
||||
vk_buffer d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer d_Qy;
|
||||
vk_buffer d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
vk_buffer d_ids;
|
||||
vk_buffer d_ids = nullptr;
|
||||
size_t ids_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
@ -4768,8 +4786,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
|
||||
vk_buffer d_Q, d_K, d_V, d_D, d_M;
|
||||
uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset;
|
||||
vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
|
||||
size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
|
||||
|
||||
bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
|
||||
|
||||
@ -5071,6 +5089,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
|
||||
{
|
||||
return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
|
||||
}
|
||||
|
||||
template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
GGML_UNUSED(p);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(dst);
|
||||
static_assert(!std::is_const<T>::value, "unexpected type");
|
||||
GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
|
||||
GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
|
||||
GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
|
||||
GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
|
||||
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.a_offset = a_offset;
|
||||
p.d_offset = d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
template<typename PC>
|
||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
|
||||
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||
@ -5174,8 +5243,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
}
|
||||
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
||||
GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
|
||||
uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
|
||||
if(!src0_uma) {
|
||||
d_X = src0_buf_ctx->dev_buffer;
|
||||
x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
|
||||
@ -5191,6 +5259,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
|
||||
GGML_ASSERT(d_Z != nullptr);
|
||||
}
|
||||
// Compute misalignment offset for descriptors and store it in in push constants, then align the descriptor offsets.
|
||||
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
|
||||
x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
|
||||
if (op_supports_incontiguous) {
|
||||
x_sz = ggml_nbytes(src0);
|
||||
@ -5378,7 +5452,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
||||
|
||||
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
||||
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
||||
@ -5390,7 +5463,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
d_offset,
|
||||
0,
|
||||
0.0f, 0.0f, offset,
|
||||
}, dryrun);
|
||||
}
|
||||
@ -5474,8 +5547,8 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
|
||||
vk_buffer d_D, d_K, d_V, d_R, d_TF, d_TD, d_State;
|
||||
uint64_t k_offset, v_offset, r_offset, tf_offset, td_offset, state_offset, dst_offset;
|
||||
vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
|
||||
size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
|
||||
bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
|
||||
|
||||
if (ctx->device->uma) {
|
||||
@ -5594,7 +5667,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
||||
|
||||
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
||||
(uint32_t)ggml_nelements(dst), 0,
|
||||
(uint32_t)ggml_nelements(dst), 0, 0,
|
||||
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
|
||||
sf0, sf1, sf2, sf3,
|
||||
@ -5704,13 +5777,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
d_offset,
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}, dryrun);
|
||||
|
@ -21,9 +21,9 @@ void main() {
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||
} else {
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,7 +22,7 @@ void main() {
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
|
@ -12,6 +12,6 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
||||
}
|
||||
|
@ -30,12 +30,12 @@ void main() {
|
||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
|
||||
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
|
||||
#else
|
||||
if (is_src0) {
|
||||
data_d[p.d_offset + dst_idx] = data_a[src0_idx];
|
||||
data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
|
||||
} else {
|
||||
data_d[p.d_offset + dst_idx] = data_b[src1_idx];
|
||||
data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -19,9 +19,9 @@ void main() {
|
||||
if (idx + (num_iter-1)*num_threads < p.ne) {
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
||||
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
|
||||
#else
|
||||
data_d[p.d_offset + idx] = data_a[idx];
|
||||
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
|
||||
#endif
|
||||
idx += num_threads;
|
||||
}
|
||||
@ -32,9 +32,9 @@ void main() {
|
||||
}
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
||||
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
|
||||
#else
|
||||
data_d[p.d_offset + idx] = data_a[idx];
|
||||
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
|
||||
#endif
|
||||
idx += num_threads;
|
||||
}
|
||||
|
@ -13,8 +13,8 @@ void main() {
|
||||
}
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
#else
|
||||
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
|
||||
data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
|
||||
#endif
|
||||
}
|
||||
|
@ -12,6 +12,6 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
|
||||
}
|
||||
|
@ -10,9 +10,10 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1];
|
||||
uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
|
||||
qs >>= shift;
|
||||
qs &= 0xF;
|
||||
qs &= 0x0F0F;
|
||||
qs = unpack8(qs)[idx & 1];
|
||||
float16_t ret = (float16_t(qs) - float16_t(8)) * d;
|
||||
return ret;
|
||||
}
|
||||
@ -152,15 +153,17 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
|
||||
block_q4_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
|
||||
block_q4_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx;
|
||||
|
||||
const uint n = iqs / 64; // 0,1,2,3
|
||||
const uint b = (iqs % 64) / 32; // 0,1
|
||||
const uint b = (idx & 0x20) >> 5; // 0,1
|
||||
const uint is = (idx & 0xE0) >> 5; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 32); // 0..127
|
||||
|
||||
const f16vec2 loadd = bl.block.d;
|
||||
|
||||
@ -184,9 +187,11 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
|
||||
const float16_t d = loadd.x * float16_t(sc);
|
||||
const float16_t m = loadd.y * float16_t(mbyte);
|
||||
|
||||
uint32_t dmask = 0xF << (b * 4);
|
||||
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
|
||||
qs = (qs >> (b * 4)) & 0x0F0F;
|
||||
qs = unpack8(qs)[idx & 1];
|
||||
|
||||
float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m;
|
||||
float16_t ret = d * float16_t(qs) - m;
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -195,18 +200,19 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
|
||||
block_q5_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
|
||||
block_q5_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx;
|
||||
|
||||
const uint n = iqs / 64; // 0,1,2,3
|
||||
const uint b = (iqs % 64) / 32; // 0,1
|
||||
const uint b = (idx & 0x20) >> 5; // 0,1
|
||||
const uint is = (idx & 0xE0) >> 5; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 32); // 0..127
|
||||
const uint qhi = (iqs % 32); // 0..31
|
||||
|
||||
const uint8_t hm = uint8_t(1 << (iqs / 32));
|
||||
const uint32_t hm = 0x0101 << is;
|
||||
|
||||
const f16vec2 loadd = bl.block.d;
|
||||
|
||||
@ -230,9 +236,15 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
|
||||
const float16_t d = loadd.x * float16_t(sc);
|
||||
const float16_t m = loadd.y * float16_t(mbyte);
|
||||
|
||||
uint32_t dmask = 0xF << (b * 4);
|
||||
uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
|
||||
qh = qh & hm;
|
||||
qh = unpack8(qh)[idx & 1];
|
||||
|
||||
float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m;
|
||||
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
|
||||
qs = (qs >> (b * 4)) & 0x0F0F;
|
||||
qs = unpack8(qs)[idx & 1];
|
||||
|
||||
float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -241,22 +253,30 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_
|
||||
block_q6_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
|
||||
block_q6_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx;
|
||||
|
||||
const uint n = iqs / 128; // 0,1
|
||||
const uint b = (iqs % 128) / 64; // 0,1
|
||||
const uint is_b = (iqs % 32) / 16; // 0,1
|
||||
const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6
|
||||
const uint is = 8 * n + qhshift + is_b; // 0..15
|
||||
const uint qsi = n * 64 + (iqs % 64); // 0..127
|
||||
const uint qhi = n * 32 + (iqs % 32); // 0..63
|
||||
const uint b = (idx & 0x40) >> 6; // 0,1
|
||||
const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6
|
||||
const uint is = (idx & 0xF0) >> 4; // 0..15
|
||||
|
||||
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
|
||||
|
||||
float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32);
|
||||
uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
|
||||
ql = (ql >> (b * 4)) & 0x0F0F;
|
||||
|
||||
uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
|
||||
qh = ((qh >> qhshift) & 0x0303) << 4;
|
||||
|
||||
int q = unpack8(ql | qh)[idx & 1];
|
||||
|
||||
float16_t ret = dscale * float16_t(q - 32);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ void main() {
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
|
||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
||||
uint d_offset;
|
||||
uint misalign_offsets;
|
||||
float param1; float param2; int param3;
|
||||
} p;
|
||||
|
||||
@ -22,6 +22,10 @@ uint get_idx() {
|
||||
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
}
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFF; }
|
||||
|
||||
// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
|
||||
uint fastmod(uint a, uint b) {
|
||||
if ((b & (b-1)) == 0) {
|
||||
|
@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
|
||||
uint ne;
|
||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint d_offset;
|
||||
uint misalign_offsets;
|
||||
float param1; float param2;
|
||||
|
||||
uint ne0_012mp; uint ne0_012L;
|
||||
@ -24,6 +24,9 @@ uint get_idx() {
|
||||
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
}
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
|
||||
|
||||
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||
uint fastdiv(uint n, uint mp, uint L) {
|
||||
uint msbs, lsbs;
|
||||
|
@ -15,10 +15,10 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||
const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||
|
||||
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_spirv_intrinsics: enable
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#if RTE16
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
@ -23,40 +24,64 @@ layout (push_constant) uniform parameter
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#define BLOCK_SIZE 256
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
const uint NUM_ITER = 512 / BLOCK_SIZE;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.x;
|
||||
if (i >= p.pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
||||
const uint kx = i / ksize;
|
||||
const uint kd = kx * ksize;
|
||||
const uint ky = (i - kd) / p.OW;
|
||||
const uint ix = i % p.OW;
|
||||
const uint gidx = gl_GlobalInvocationID.x;
|
||||
|
||||
const uint oh = gl_GlobalInvocationID.y;
|
||||
const uint batch = gl_GlobalInvocationID.z / p.IC;
|
||||
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
||||
|
||||
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
|
||||
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
||||
|
||||
const uint offset_dst =
|
||||
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
|
||||
(ic * (p.KW * p.KH) + ky * p.KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
|
||||
data_d[offset_dst] = D_TYPE(0.0f);
|
||||
} else {
|
||||
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
|
||||
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
|
||||
A_TYPE values[NUM_ITER];
|
||||
uint offset_dst[NUM_ITER];
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
values[idx] = A_TYPE(0);
|
||||
}
|
||||
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
|
||||
const uint i = gidx * NUM_ITER + idx;
|
||||
|
||||
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
||||
const uint kx = i / ksize;
|
||||
const uint kd = kx * ksize;
|
||||
const uint ky = (i - kd) / p.OW;
|
||||
const uint ix = i % p.OW;
|
||||
|
||||
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
|
||||
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
||||
|
||||
offset_dst[idx] =
|
||||
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
|
||||
(ic * (p.KW * p.KH) + ky * p.KW + kx);
|
||||
|
||||
if (i >= p.pelements) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (iih < p.IH && iiw < p.IW) {
|
||||
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
|
||||
values[idx] = data_a[offset_src + iih * p.IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
|
||||
const uint i = gidx * NUM_ITER + idx;
|
||||
|
||||
if (i >= p.pelements) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ void main() {
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
|
@ -9,9 +9,6 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||
|
||||
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
||||
#define K_PER_ITER 8
|
||||
#else
|
||||
@ -21,70 +18,70 @@ layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||
|
||||
uint a_offset, b_offset, d_offset, y_offset;
|
||||
|
||||
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||
|
||||
void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
|
||||
{
|
||||
const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
|
||||
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
||||
const uint iybs = col - col%QUANT_K; // y block start index
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
|
||||
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
||||
const uint iybs = col - col%QUANT_K; // y block start index
|
||||
|
||||
#if K_PER_ITER == 8
|
||||
#if QUANT_R == 2
|
||||
const B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
|
||||
const B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
|
||||
const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
|
||||
const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
|
||||
const B_TYPE_VEC4 bv02 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
|
||||
const B_TYPE_VEC4 bv13 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4];
|
||||
const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
|
||||
const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
|
||||
#else
|
||||
const vec4 bv0 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4]);
|
||||
const vec4 bv1 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4 + 1]);
|
||||
const vec4 bv0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
|
||||
const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
|
||||
#endif
|
||||
#else
|
||||
// Check if the second of the pair of elements is OOB, and don't fetch B or
|
||||
// accumulate it. We still fetch a pair of elements for A, which is fine for
|
||||
// quantized formats since they'll be within the same block. We should
|
||||
// probably skip fetching the second element for F16/F32, but as of now we
|
||||
// still do.
|
||||
const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
|
||||
// Check if the second of the pair of elements is OOB, and don't fetch B or
|
||||
// accumulate it. We still fetch a pair of elements for A, which is fine for
|
||||
// quantized formats since they'll be within the same block. We should
|
||||
// probably skip fetching the second element for F16/F32, but as of now we
|
||||
// still do.
|
||||
const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
|
||||
|
||||
FLOAT_TYPE b0 = 0, b1 = 0;
|
||||
b0 = FLOAT_TYPE(data_b[b_offset + iybs + iqs]);
|
||||
if (!OOB) {
|
||||
b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
|
||||
}
|
||||
FLOAT_TYPE b0 = 0, b1 = 0;
|
||||
b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
|
||||
if (!OOB) {
|
||||
b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
|
||||
}
|
||||
#endif
|
||||
uint ibi = first_row*p.ncols;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib = (ibi + col)/QUANT_K; // block index
|
||||
ibi += p.ncols;
|
||||
uint ibi = first_row*p.ncols;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib = (ibi + col)/QUANT_K; // block index
|
||||
ibi += p.ncols;
|
||||
|
||||
#if K_PER_ITER == 8
|
||||
vec4 v = dequantize4(ib, iqs, a_offset);
|
||||
vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
|
||||
vec4 v = dequantize4(ib, iqs, a_offset);
|
||||
vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
|
||||
|
||||
const vec2 dm = get_dm(ib, a_offset);
|
||||
if (dm.y != 0) { // quant has min component
|
||||
v = v * dm.x + dm.y;
|
||||
v2 = v2 * dm.x + dm.y;
|
||||
}
|
||||
const vec2 dm = get_dm(ib, a_offset);
|
||||
if (dm.y != 0) { // quant has min component
|
||||
v = v * dm.x + dm.y;
|
||||
v2 = v2 * dm.x + dm.y;
|
||||
}
|
||||
|
||||
// matrix multiplication
|
||||
FLOAT_TYPE rowtmp = dot(bv0, v);
|
||||
rowtmp += dot(bv1, v2);
|
||||
// matrix multiplication
|
||||
FLOAT_TYPE rowtmp = dot(bv0, v);
|
||||
rowtmp += dot(bv1, v2);
|
||||
|
||||
if (dm.y == 0)
|
||||
rowtmp *= dm.x;
|
||||
if (dm.y == 0)
|
||||
rowtmp *= dm.x;
|
||||
|
||||
temp[n] += rowtmp;
|
||||
temp[j][n] += rowtmp;
|
||||
#else
|
||||
const vec2 v = dequantize(ib, iqs, a_offset);
|
||||
const vec2 v = dequantize(ib, iqs, a_offset);
|
||||
|
||||
// matrix multiplication
|
||||
temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
|
||||
if (!OOB) {
|
||||
temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
|
||||
}
|
||||
// matrix multiplication
|
||||
temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
|
||||
if (!OOB) {
|
||||
temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -96,10 +93,12 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
|
||||
y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
||||
|
||||
FLOAT_TYPE temp[NUM_ROWS];
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[i] = FLOAT_TYPE(0);
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
|
||||
@ -131,24 +130,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
i++;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
tmpsh[n][tid] = temp[n];
|
||||
}
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
tmpsh[n][tid] += tmpsh[n][tid + s];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
|
||||
}
|
||||
}
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
|
@ -83,3 +83,36 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
|
||||
batch_idx * p.batch_stride_d;
|
||||
#endif
|
||||
}
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||
layout (constant_id = 2) const uint NUM_COLS = 1;
|
||||
|
||||
shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
|
||||
|
||||
void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
|
||||
// sum up partial sums and write back result
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
tmpsh[j][n][tid] = temp[j][n];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5,22 +5,11 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||
|
||||
if (row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
@ -38,76 +27,89 @@ void main() {
|
||||
const uint s_offset = 8*v_im;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = d.x;
|
||||
const FLOAT_TYPE dmin = d.y;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = d.x;
|
||||
const FLOAT_TYPE dmin = d.y;
|
||||
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
|
||||
B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
|
||||
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
|
||||
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
|
||||
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
|
||||
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
|
||||
|
||||
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
|
||||
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
|
||||
uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
|
||||
uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
|
||||
uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
|
||||
uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
|
||||
uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
|
||||
uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
|
||||
uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
|
||||
uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
|
||||
uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
|
||||
|
||||
uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
|
||||
uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
|
||||
uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
|
||||
uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
|
||||
uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
|
||||
uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
|
||||
uvec2 qs0 = uvec2(unpack8(qs0_u16));
|
||||
uvec2 qs16 = uvec2(unpack8(qs16_u16));
|
||||
|
||||
uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
|
||||
uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
|
||||
uvec2 qs0 = uvec2(unpack8(qs0_u16));
|
||||
uvec2 qs16 = uvec2(unpack8(qs16_u16));
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32];
|
||||
B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40];
|
||||
B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48];
|
||||
B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56];
|
||||
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
|
||||
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
|
||||
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
|
||||
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
|
||||
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3),
|
||||
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
|
||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3),
|
||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]),
|
||||
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]),
|
||||
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]),
|
||||
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]),
|
||||
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]),
|
||||
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]),
|
||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
|
||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
|
||||
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
|
||||
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
|
||||
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
|
||||
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3),
|
||||
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
|
||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3),
|
||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]),
|
||||
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]),
|
||||
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]),
|
||||
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]),
|
||||
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]),
|
||||
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]),
|
||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
|
||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
|
||||
}
|
||||
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
|
||||
}
|
||||
}
|
||||
temp = fma(dall, sum1, fma(-dmin, sum2, temp));
|
||||
}
|
||||
|
||||
tmp[gl_LocalInvocationID.x] = temp;
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
@ -5,22 +5,11 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||
|
||||
if (row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
@ -35,66 +24,80 @@ void main() {
|
||||
|
||||
const uint8_t m = uint8_t(1 << (4 * v_im));
|
||||
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
const uint s_shift = 4 * v_im;
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
|
||||
B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
|
||||
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
|
||||
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
|
||||
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
|
||||
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
|
||||
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
|
||||
uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
|
||||
uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
|
||||
uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
|
||||
u8vec2 s0 = unpack8(s0_16);
|
||||
u8vec2 s2 = unpack8(s2_16);
|
||||
u8vec2 s4 = unpack8(s4_16);
|
||||
u8vec2 s6 = unpack8(s6_16);
|
||||
u8vec2 s8 = unpack8(s8_16);
|
||||
u8vec2 s10 = unpack8(s10_16);
|
||||
|
||||
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
|
||||
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
|
||||
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
|
||||
uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
|
||||
uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
|
||||
uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
|
||||
u8vec2 s0 = unpack8(s0_16);
|
||||
u8vec2 s2 = unpack8(s2_16);
|
||||
u8vec2 s4 = unpack8(s4_16);
|
||||
u8vec2 s6 = unpack8(s6_16);
|
||||
u8vec2 s8 = unpack8(s8_16);
|
||||
u8vec2 s10 = unpack8(s10_16);
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32];
|
||||
B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40];
|
||||
B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48];
|
||||
B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56];
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
|
||||
}
|
||||
temp[j][n] = fma(d, sum, temp[j][n]);
|
||||
}
|
||||
}
|
||||
temp = fma(d, sum, temp);
|
||||
}
|
||||
|
||||
tmp[gl_LocalInvocationID.x] = temp;
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
@ -6,22 +6,11 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||
|
||||
if (row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
@ -31,8 +20,8 @@ void main() {
|
||||
|
||||
const uint step = 4;
|
||||
|
||||
const uint il = itid/step; // 0...3
|
||||
const uint ir = itid - step*il; // 0...7 or 0...3
|
||||
const uint il = itid/step; // 0...3
|
||||
const uint ir = itid - step*il; // 0...7 or 0...3
|
||||
const uint n = 4;
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
@ -42,90 +31,103 @@ void main() {
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
|
||||
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
|
||||
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
|
||||
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
|
||||
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
|
||||
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
|
||||
|
||||
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
|
||||
uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
|
||||
uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
|
||||
uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
|
||||
uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
|
||||
uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
|
||||
uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
|
||||
uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
|
||||
|
||||
const uint32_t q4_0 = qs0_lo4.x;
|
||||
const uint32_t q4_1 = qs0_lo4.y;
|
||||
const uint32_t q4_2 = qs0_lo4.z;
|
||||
const uint32_t q4_3 = qs0_lo4.w;
|
||||
const uint32_t q4_4 = qs0_hi4.x;
|
||||
const uint32_t q4_5 = qs0_hi4.y;
|
||||
const uint32_t q4_6 = qs0_hi4.z;
|
||||
const uint32_t q4_7 = qs0_hi4.w;
|
||||
const uint32_t q4_8 = qs64_lo4.x;
|
||||
const uint32_t q4_9 = qs64_lo4.y;
|
||||
const uint32_t q4_10 = qs64_lo4.z;
|
||||
const uint32_t q4_11 = qs64_lo4.w;
|
||||
const uint32_t q4_12 = qs64_hi4.x;
|
||||
const uint32_t q4_13 = qs64_hi4.y;
|
||||
const uint32_t q4_14 = qs64_hi4.z;
|
||||
const uint32_t q4_15 = qs64_hi4.w;
|
||||
const uint32_t q4_0 = qs0_lo4.x;
|
||||
const uint32_t q4_1 = qs0_lo4.y;
|
||||
const uint32_t q4_2 = qs0_lo4.z;
|
||||
const uint32_t q4_3 = qs0_lo4.w;
|
||||
const uint32_t q4_4 = qs0_hi4.x;
|
||||
const uint32_t q4_5 = qs0_hi4.y;
|
||||
const uint32_t q4_6 = qs0_hi4.z;
|
||||
const uint32_t q4_7 = qs0_hi4.w;
|
||||
const uint32_t q4_8 = qs64_lo4.x;
|
||||
const uint32_t q4_9 = qs64_lo4.y;
|
||||
const uint32_t q4_10 = qs64_lo4.z;
|
||||
const uint32_t q4_11 = qs64_lo4.w;
|
||||
const uint32_t q4_12 = qs64_hi4.x;
|
||||
const uint32_t q4_13 = qs64_hi4.y;
|
||||
const uint32_t q4_14 = qs64_hi4.z;
|
||||
const uint32_t q4_15 = qs64_hi4.w;
|
||||
|
||||
B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4];
|
||||
B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
|
||||
B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4];
|
||||
B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC4 by10 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4];
|
||||
B_TYPE_VEC4 by132 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8];
|
||||
B_TYPE_VEC4 by20 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4];
|
||||
B_TYPE_VEC4 by232 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8];
|
||||
|
||||
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
|
||||
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
|
||||
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
|
||||
const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
|
||||
const FLOAT_TYPE smin =
|
||||
fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
|
||||
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
|
||||
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
|
||||
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
|
||||
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
|
||||
}
|
||||
|
||||
tmp[gl_LocalInvocationID.x] = temp;
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
|
||||
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
|
||||
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
|
||||
const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
|
||||
const FLOAT_TYPE smin =
|
||||
fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
|
||||
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
|
||||
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
|
||||
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
|
||||
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
@ -6,22 +6,11 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||
|
||||
if (row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
@ -39,122 +28,135 @@ void main() {
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
|
||||
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
|
||||
uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
|
||||
uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
|
||||
uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
|
||||
uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
|
||||
|
||||
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
|
||||
uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
|
||||
|
||||
uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
|
||||
uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
|
||||
uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
|
||||
uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
|
||||
uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
|
||||
uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
|
||||
uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
|
||||
uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
|
||||
|
||||
qs0_16_u32_lo4 += qs0_16_lo4_offset16;
|
||||
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
|
||||
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
|
||||
qs64_80_u32_hi4 += qs64_80_hi4_offset16;
|
||||
qs0_16_u32_lo4 += qs0_16_lo4_offset16;
|
||||
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
|
||||
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
|
||||
qs64_80_u32_hi4 += qs64_80_hi4_offset16;
|
||||
|
||||
uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
|
||||
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
|
||||
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
|
||||
uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
|
||||
uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
|
||||
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
|
||||
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
|
||||
uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
|
||||
|
||||
const uint32_t q4_0 = qs0_16_lo4.x;
|
||||
const uint32_t q4_1 = qs0_16_lo4.y;
|
||||
const uint32_t q4_2 = qs0_16_lo4.z;
|
||||
const uint32_t q4_3 = qs0_16_lo4.w;
|
||||
const uint32_t q4_4 = qs0_16_hi4.x;
|
||||
const uint32_t q4_5 = qs0_16_hi4.y;
|
||||
const uint32_t q4_6 = qs0_16_hi4.z;
|
||||
const uint32_t q4_7 = qs0_16_hi4.w;
|
||||
const uint32_t q4_8 = qs64_80_lo4.x;
|
||||
const uint32_t q4_9 = qs64_80_lo4.y;
|
||||
const uint32_t q4_10 = qs64_80_lo4.z;
|
||||
const uint32_t q4_11 = qs64_80_lo4.w;
|
||||
const uint32_t q4_12 = qs64_80_hi4.x;
|
||||
const uint32_t q4_13 = qs64_80_hi4.y;
|
||||
const uint32_t q4_14 = qs64_80_hi4.z;
|
||||
const uint32_t q4_15 = qs64_80_hi4.w;
|
||||
const uint32_t q4_0 = qs0_16_lo4.x;
|
||||
const uint32_t q4_1 = qs0_16_lo4.y;
|
||||
const uint32_t q4_2 = qs0_16_lo4.z;
|
||||
const uint32_t q4_3 = qs0_16_lo4.w;
|
||||
const uint32_t q4_4 = qs0_16_hi4.x;
|
||||
const uint32_t q4_5 = qs0_16_hi4.y;
|
||||
const uint32_t q4_6 = qs0_16_hi4.z;
|
||||
const uint32_t q4_7 = qs0_16_hi4.w;
|
||||
const uint32_t q4_8 = qs64_80_lo4.x;
|
||||
const uint32_t q4_9 = qs64_80_lo4.y;
|
||||
const uint32_t q4_10 = qs64_80_lo4.z;
|
||||
const uint32_t q4_11 = qs64_80_lo4.w;
|
||||
const uint32_t q4_12 = qs64_80_hi4.x;
|
||||
const uint32_t q4_13 = qs64_80_hi4.y;
|
||||
const uint32_t q4_14 = qs64_80_hi4.z;
|
||||
const uint32_t q4_15 = qs64_80_hi4.w;
|
||||
|
||||
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2];
|
||||
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2];
|
||||
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC2 by10 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2];
|
||||
B_TYPE_VEC2 by116 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 by132 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 by148 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 by20 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2];
|
||||
B_TYPE_VEC2 by216 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 by232 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 by248 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24];
|
||||
|
||||
const FLOAT_TYPE sx =
|
||||
fma(FLOAT_TYPE(by10.x), q4_0,
|
||||
fma(FLOAT_TYPE(by10.y), q4_1,
|
||||
fma(FLOAT_TYPE(by116.x), q4_2,
|
||||
FLOAT_TYPE(by116.y) * q4_3)));
|
||||
const FLOAT_TYPE sy =
|
||||
fma(FLOAT_TYPE(by132.x), q4_4,
|
||||
fma(FLOAT_TYPE(by132.y), q4_5,
|
||||
fma(FLOAT_TYPE(by148.x), q4_6,
|
||||
FLOAT_TYPE(by148.y) * q4_7)));
|
||||
const FLOAT_TYPE sz =
|
||||
fma(FLOAT_TYPE(by20.x), q4_8,
|
||||
fma(FLOAT_TYPE(by20.y), q4_9,
|
||||
fma(FLOAT_TYPE(by216.x), q4_10,
|
||||
FLOAT_TYPE(by216.y) * q4_11)));
|
||||
const FLOAT_TYPE sw =
|
||||
fma(FLOAT_TYPE(by232.x), q4_12,
|
||||
fma(FLOAT_TYPE(by232.y), q4_13,
|
||||
fma(FLOAT_TYPE(by248.x), q4_14,
|
||||
FLOAT_TYPE(by248.y) * q4_15)));
|
||||
const FLOAT_TYPE smin =
|
||||
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
|
||||
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
|
||||
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
|
||||
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
|
||||
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
|
||||
}
|
||||
|
||||
tmp[gl_LocalInvocationID.x] = temp;
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
const FLOAT_TYPE sx =
|
||||
fma(FLOAT_TYPE(by10.x), q4_0,
|
||||
fma(FLOAT_TYPE(by10.y), q4_1,
|
||||
fma(FLOAT_TYPE(by116.x), q4_2,
|
||||
FLOAT_TYPE(by116.y) * q4_3)));
|
||||
const FLOAT_TYPE sy =
|
||||
fma(FLOAT_TYPE(by132.x), q4_4,
|
||||
fma(FLOAT_TYPE(by132.y), q4_5,
|
||||
fma(FLOAT_TYPE(by148.x), q4_6,
|
||||
FLOAT_TYPE(by148.y) * q4_7)));
|
||||
const FLOAT_TYPE sz =
|
||||
fma(FLOAT_TYPE(by20.x), q4_8,
|
||||
fma(FLOAT_TYPE(by20.y), q4_9,
|
||||
fma(FLOAT_TYPE(by216.x), q4_10,
|
||||
FLOAT_TYPE(by216.y) * q4_11)));
|
||||
const FLOAT_TYPE sw =
|
||||
fma(FLOAT_TYPE(by232.x), q4_12,
|
||||
fma(FLOAT_TYPE(by232.y), q4_13,
|
||||
fma(FLOAT_TYPE(by248.x), q4_14,
|
||||
FLOAT_TYPE(by248.y) * q4_15)));
|
||||
const FLOAT_TYPE smin =
|
||||
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
|
||||
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
|
||||
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
|
||||
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
|
||||
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
@ -6,22 +6,11 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||
|
||||
if (row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
@ -42,69 +31,82 @@ void main() {
|
||||
const uint s_offset = 8*v_im + is;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
|
||||
FLOAT_TYPE scales[4];
|
||||
scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
|
||||
scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
|
||||
scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
|
||||
scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
|
||||
FLOAT_TYPE scales[4];
|
||||
scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
|
||||
scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
|
||||
scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
|
||||
scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
|
||||
|
||||
uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
|
||||
uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
|
||||
uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
|
||||
uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
|
||||
|
||||
uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
|
||||
uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
|
||||
uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
|
||||
uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
|
||||
uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
|
||||
uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
|
||||
uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
|
||||
uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
|
||||
uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
|
||||
uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
|
||||
uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
|
||||
uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
|
||||
uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
|
||||
uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
|
||||
|
||||
uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
|
||||
uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
|
||||
uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
|
||||
uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
|
||||
uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
|
||||
uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
|
||||
uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
|
||||
uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
|
||||
|
||||
uvec4 q0 = uvec4(unpack8(q0_u32));
|
||||
uvec4 q1 = uvec4(unpack8(q1_u32));
|
||||
uvec4 q2 = uvec4(unpack8(q2_u32));
|
||||
uvec4 q3 = uvec4(unpack8(q3_u32));
|
||||
uvec4 q0 = uvec4(unpack8(q0_u32));
|
||||
uvec4 q1 = uvec4(unpack8(q1_u32));
|
||||
uvec4 q2 = uvec4(unpack8(q2_u32));
|
||||
uvec4 q3 = uvec4(unpack8(q3_u32));
|
||||
|
||||
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
|
||||
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
|
||||
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
|
||||
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC4 by0 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4];
|
||||
B_TYPE_VEC4 by32 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8];
|
||||
B_TYPE_VEC4 by64 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16];
|
||||
B_TYPE_VEC4 by96 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24];
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
|
||||
fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
|
||||
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
|
||||
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
|
||||
fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
|
||||
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
|
||||
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
|
||||
}
|
||||
temp[j][n] += sum * d;
|
||||
}
|
||||
}
|
||||
temp += sum * d;
|
||||
}
|
||||
|
||||
tmp[gl_LocalInvocationID.x] = temp;
|
||||
// sum up partial sums and write back result
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
barrier();
|
||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
@ -24,5 +24,5 @@ void main() {
|
||||
|
||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||
|
||||
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
|
||||
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
|
||||
}
|
||||
|
@ -22,5 +22,5 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ void main() {
|
||||
continue;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
|
||||
data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,6 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
|
||||
}
|
||||
|
@ -12,6 +12,6 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne; uint d_offset;
|
||||
uint ne; uint a_offset; uint d_offset;
|
||||
uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13;
|
||||
float sf0; float sf1; float sf2; float sf3;
|
||||
@ -32,5 +32,5 @@ void main() {
|
||||
const uint i02 = uint(i12 / p.sf2);
|
||||
const uint i03 = uint(i13 / p.sf3);
|
||||
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
|
||||
}
|
||||
|
@ -78,7 +78,8 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
|
||||
}
|
||||
|
||||
PROCESS_INFORMATION pi;
|
||||
STARTUPINFOA si = { sizeof(STARTUPINFOA) };
|
||||
STARTUPINFOA si = {};
|
||||
si.cb = sizeof(STARTUPINFOA);
|
||||
si.dwFlags = STARTF_USESTDHANDLES;
|
||||
si.hStdOutput = stdout_write;
|
||||
si.hStdError = stderr_write;
|
||||
|
291
ggml/src/ggml.c
291
ggml/src/ggml.c
@ -3760,104 +3760,10 @@ struct ggml_tensor * ggml_clamp(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_1d
|
||||
|
||||
static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
|
||||
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
||||
}
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
|
||||
ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
|
||||
|
||||
result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_1d_ph
|
||||
|
||||
struct ggml_tensor* ggml_conv_1d_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s,
|
||||
int d) {
|
||||
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
|
||||
}
|
||||
|
||||
// ggml_conv_transpose_1d
|
||||
|
||||
static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
|
||||
return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
|
||||
}
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
GGML_ASSERT(ggml_is_matrix(b));
|
||||
GGML_ASSERT(a->ne[2] == b->ne[1]);
|
||||
GGML_ASSERT(a->ne[3] == 1);
|
||||
|
||||
GGML_ASSERT(p0 == 0);
|
||||
GGML_ASSERT(d0 == 1);
|
||||
|
||||
const int64_t ne[4] = {
|
||||
ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
|
||||
a->ne[1], b->ne[2], 1,
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
int32_t params[] = { s0, p0, d0 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_CONV_TRANSPOSE_1D;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_depthwise
|
||||
|
||||
struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
||||
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
||||
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
|
||||
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
|
||||
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
|
||||
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
||||
|
||||
return result;
|
||||
}
|
||||
// ggml_conv_2d
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
// a: [OC,IC, KH, KW]
|
||||
// b: [N, IC, IH, IW]
|
||||
@ -3874,10 +3780,11 @@ struct ggml_tensor * ggml_im2col(
|
||||
int d1,
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type) {
|
||||
if(is_2D) {
|
||||
if (is_2D) {
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
||||
} else {
|
||||
GGML_ASSERT(a->ne[1] == b->ne[1]);
|
||||
//GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
|
||||
GGML_ASSERT(b->ne[1] == a->ne[1]);
|
||||
GGML_ASSERT(b->ne[3] == 1);
|
||||
}
|
||||
|
||||
@ -3928,6 +3835,108 @@ struct ggml_tensor * ggml_im2col_back(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_1d
|
||||
|
||||
struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
|
||||
ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
|
||||
|
||||
result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_1d_ph
|
||||
|
||||
struct ggml_tensor* ggml_conv_1d_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s,
|
||||
int d) {
|
||||
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
|
||||
}
|
||||
|
||||
// ggml_conv_1d_dw
|
||||
|
||||
struct ggml_tensor * ggml_conv_1d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
|
||||
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
|
||||
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
|
||||
|
||||
struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
|
||||
|
||||
result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_1d_dw_ph
|
||||
|
||||
struct ggml_tensor * ggml_conv_1d_dw_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int d0) {
|
||||
return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
|
||||
}
|
||||
|
||||
// ggml_conv_transpose_1d
|
||||
|
||||
static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
|
||||
return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
|
||||
}
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
GGML_ASSERT(ggml_is_matrix(b));
|
||||
GGML_ASSERT(a->ne[2] == b->ne[1]);
|
||||
GGML_ASSERT(a->ne[3] == 1);
|
||||
|
||||
GGML_ASSERT(p0 == 0);
|
||||
GGML_ASSERT(d0 == 1);
|
||||
|
||||
const int64_t ne[4] = {
|
||||
ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
|
||||
a->ne[1], b->ne[2], 1,
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
int32_t params[] = { s0, p0, d0 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_CONV_TRANSPOSE_1D;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_2d
|
||||
|
||||
// a: [OC,IC, KH, KW]
|
||||
// b: [N, IC, IH, IW]
|
||||
// result: [N, OC, OH, OW]
|
||||
@ -3973,6 +3982,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph(
|
||||
return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
|
||||
}
|
||||
|
||||
// ggml_conv_2d_dw
|
||||
|
||||
struct ggml_tensor * ggml_conv_2d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
||||
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
||||
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
|
||||
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
|
||||
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
|
||||
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_transpose_2d_p0
|
||||
|
||||
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
|
||||
@ -6489,7 +6523,7 @@ struct gguf_context {
|
||||
void * data;
|
||||
};
|
||||
|
||||
static size_t gguf_type_size(enum gguf_type type) {
|
||||
size_t gguf_type_size(enum gguf_type type) {
|
||||
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
||||
return GGUF_TYPE_SIZE[type];
|
||||
}
|
||||
@ -6617,13 +6651,7 @@ struct gguf_context * gguf_init_empty(void) {
|
||||
return ctx;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||
FILE * file = ggml_fopen(fname, "rb");
|
||||
if (!file) {
|
||||
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
|
||||
// offset from start of file
|
||||
size_t offset = 0;
|
||||
|
||||
@ -6636,7 +6664,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
||||
if (magic[i] != GGUF_MAGIC[i]) {
|
||||
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
|
||||
fclose(file);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -6647,7 +6674,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
|
||||
fclose(file);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -6665,7 +6691,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6678,7 +6703,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6688,12 +6712,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
{
|
||||
const uint64_t n_kv = ctx->header.n_kv;
|
||||
|
||||
ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
|
||||
if (!ctx->kv) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
if (n_kv > 0) {
|
||||
ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
|
||||
if (!ctx->kv) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < n_kv; ++i) {
|
||||
@ -6740,7 +6765,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6748,7 +6772,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
|
||||
if (!kv->value.arr.data) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6760,7 +6783,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6768,7 +6790,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
|
||||
if (!kv->value.arr.data) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6799,7 +6820,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6810,7 +6830,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
|
||||
if (!ctx->infos) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6846,7 +6865,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6889,7 +6907,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// this tensor type support have been removed:
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
|
||||
__func__, info->name.data, (int) info->type, ggml_type_name(info->type));
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6897,7 +6914,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
if (ne % ggml_blck_size(info->type) != 0) {
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
|
||||
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6929,7 +6945,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
*params.ctx = ggml_init(pdata);
|
||||
if (*params.ctx == NULL) {
|
||||
fprintf(stderr, "%s: failed to initialize context\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@ -6948,7 +6963,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
||||
fclose(file);
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
@ -6987,7 +7001,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
|
||||
fclose(file);
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
@ -6996,11 +7009,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ggml_set_no_alloc(ctx_data, params.no_alloc);
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||
FILE * file = ggml_fopen(fname, "rb");
|
||||
if (!file) {
|
||||
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct gguf_context * result = gguf_init_from_file_impl(file, params);
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
|
||||
void gguf_free(struct gguf_context * ctx) {
|
||||
if (ctx == NULL) {
|
||||
return;
|
||||
@ -7460,13 +7483,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
|
||||
// fwrite(val, sizeof(char), size, file);
|
||||
//}
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
|
||||
/*buf.size =*/ size,
|
||||
@ -7476,7 +7493,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void gguf_buf_free(struct gguf_buf buf) {
|
||||
void gguf_buf_free(struct gguf_buf buf) {
|
||||
if (buf.data) {
|
||||
GGML_FREE(buf.data);
|
||||
}
|
||||
@ -7514,7 +7531,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
|
||||
buf->offset += el_size;
|
||||
}
|
||||
|
||||
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
// write header
|
||||
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
||||
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
||||
|
@ -1 +1 @@
|
||||
cc1ab854224b1723961863c7f73bc7905cc22bf2
|
||||
e61b9f5df05f44714128507f31128b89c7fb3134
|
||||
|
Reference in New Issue
Block a user