Mirror of https://github.com/ggerganov/whisper.cpp.git, synced 2025-06-24 09:10:57 +00:00

Compare commits (20 commits)
Commits in this comparison:

- 51c6961c7b
- 503a786c9a
- ad4e350933
- d7a9346ab1
- b63d23f728
- f6ce10e4a1
- 6cb2b86581
- 801d6bd809
- ddf7e6a15d
- 0d42097fd3
- 842b9c984c
- 0810f02547
- 8c13c78f9d
- f31b404fcb
- 854c0518bc
- c8e3968edd
- b358de2458
- 11688b262f
- 04b9508fb3
- 4200430e75
.github/workflows/build.yml (vendored, 36 changed lines)
@@ -367,7 +367,9 @@ jobs:
                     set -e
                     apt update
                     apt install -y build-essential cmake git
-                    cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
+                    cmake . -DCMAKE_BUILD_TYPE=Debug \
+                            -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON \
+                            -DGGML_OPENMP=OFF
                     make
                     ctest -L gh --output-on-failure'
@@ -1055,9 +1057,7 @@ jobs:
          ./build/bin/quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0

   release:
-    if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
-        github.event.inputs.create_release == 'true' ||
-        github.event.inputs.pre_release_tag != '' }}
+    if: ${{ github.event.inputs.create_release == 'true' || github.event.inputs.pre_release_tag != '' }}

     runs-on: ubuntu-latest
@@ -1119,3 +1119,31 @@ jobs:
             });
           }
         }
+
+  coreml-base-en:
+    if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+        github.event.inputs.create_release == 'true' ||
+        github.event.inputs.pre_release_tag != '' }}
+    runs-on: macos-latest
+    needs: determine-tag
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set environment variables
+        id: set_vars
+        run: |
+          echo "MODEL_NAME=base.en" >> $GITHUB_ENV
+          echo "GEN_MODEL_NAME=whisper-${{ needs.determine-tag.outputs.tag_name }}-ggml-base.en-encoder.mlmodelc" >> $GITHUB_ENV
+
+      - name: Download model
+        run: |
+          ./models/download-ggml-model.sh ${{ env.MODEL_NAME }}
+
+      - name: Generate CoreML model
+        run: |
+          python3.11 -m venv venv
+          source venv/bin/activate
+          pip install ane_transformers openai-whisper coremltools
+          ./models/generate-coreml-model.sh ${{ env.MODEL_NAME }}
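For reference, the conversion steps of the new job can be reproduced locally with the same commands (a sketch assuming a macOS machine with Python 3.11 available; `base.en` stands in for the templated `MODEL_NAME`):

```bash
python3.11 -m venv venv && source venv/bin/activate
pip install ane_transformers openai-whisper coremltools
./models/download-ggml-model.sh base.en
./models/generate-coreml-model.sh base.en
```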
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
 project("whisper.cpp" C CXX)
-project("whisper.cpp" VERSION 1.7.4)
+project("whisper.cpp" VERSION 1.7.5)
 include(CheckIncludeFileCXX)

 set(SOVERSION 1)
@@ -190,8 +190,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
 #

 if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    #include(CTest)
-    #add_subdirectory(tests)
+    include(CTest)
+    add_subdirectory(tests)
 endif ()

 if (WHISPER_BUILD_EXAMPLES)
@@ -10,7 +10,7 @@
 > [!NOTE]
 > New maintenance roadmap: https://github.com/ggerganov/whisper.cpp/discussions/2788

-Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.7.5](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.5) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.7.4",
+  "version": "1.7.5",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {
@@ -13,7 +13,7 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b

 This example uses the whisper.xcframework, which needs to be built first using the following command:
 ```bash
-./build_xcframework.sh
+./build-xcframework.sh
 ```

 A model is also required; it can be downloaded using the following command:
@@ -30,4 +30,6 @@ mkdir models/ggml-base.en-encoder.mlmodelc

 Follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the model.
 That is all that needs to be done to use the Core ML model in the app. The converted model is a
-resource in the project and will be used if it is available.
+resource in the project and will be used if it is available. Note that the Core ML model is only
+used for the encoder; the decoder, which is in the ggml model, is still required, so both need to
+be available.
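In practice the app's model resources should end up containing both artifacts, for example (an illustrative listing; exact file names depend on the model chosen):

```bash
ls models/
# ggml-base.en.bin                 # full ggml model; the decoder runs from here
# ggml-base.en-encoder.mlmodelc/   # Core ML encoder, used when present
```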
@@ -34,7 +34,9 @@ sudo xcode-select -switch /Applications/Xcode.app/Contents/Developer
 **Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.

 ### Core ML support
-1. Follow all the steps in the `Usage` section, including adding the ggml model file.
+1. Follow all the steps in the `Usage` section, including adding the ggml model file.
+   The ggml model file is required as the Core ML model is only used for the encoder; the
+   decoder, which is in the ggml model, is still required.
 2. Follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the
    model.
 3. Add the Core ML model (`models/ggml-base.en-encoder.mlmodelc/`) to `whisper.swiftui.demo/Resources/models` **via Xcode**.
@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
         ${CANN_INSTALL_DIR}/acllib/include
     )

-    add_subdirectory(kernels)
     list(APPEND CANN_LIBRARIES
         ascendcl
         nnopbase
         opapi
         acl_op_compiler
-        ascendc_kernels
     )

     file(GLOB GGML_SOURCES_CANN "*.cpp")
@@ -30,6 +30,7 @@
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_embedding.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
@@ -58,7 +59,6 @@
 #include <vector>

 #include "ggml-impl.h"
-#include "kernels/ascendc_kernels.h"

 #define GGML_COMMON_DECL_C

@@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclDestroyIntArray(repeats));
 }

+/**
+ * @brief Casts the elements of a tensor to a specified data type using the CANN backend.
+ *
+ * @details This function performs a type conversion on the elements of the input tensor `acl_src`
+ *          and stores the results in the destination tensor `acl_dst`. The conversion type is
+ *          determined based on the `dst` tensor's data type.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be cast.
+ * @param acl_dst The destination tensor that will store the casted elements.
+ * @param dst The ggml tensor specifying the target data type.
+ */
+static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst, ggml_tensor* dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
+                                        ggml_cann_type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
     GGML_ASSERT(ggml_can_repeat(src, dst));
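The new helper follows the usual two-phase aclnn pattern: query the workspace size, then execute on the context's stream. A hedged usage sketch, with tensor creation borrowed from the surrounding code (`ctx`, `src`, and `dst` are assumed to be valid CANN-side objects):

```cpp
// Cast an F16 source into an F32 destination; the target type is read from dst->type.
aclTensor* acl_src = ggml_cann_create_tensor(src);  // src->type == GGML_TYPE_F16
aclTensor* acl_dst = ggml_cann_create_tensor(dst);  // dst->type == GGML_TYPE_F32
aclnn_cast(ctx, acl_src, acl_dst, dst);
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
```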
@@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 }

 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+    ggml_tensor* src0 = dst->src[0];

-    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src->extra = src_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-
-    if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
-        ggml_are_same_shape(src, dst)) {
-        cann_copy(ctx, acl_src, acl_dst);
-        ACL_CHECK(aclDestroyTensor(acl_src));
-        ACL_CHECK(aclDestroyTensor(acl_dst));
-        return;
-    }
-    // TODO: simplify
-    if (src->type == GGML_TYPE_F16) {
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f16_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-
-                    aclrtlaunch_ascendc_dup_by_rows_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else if (src->type == GGML_TYPE_F32) {
-        // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
-        //       && nb0 == type_size)
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f32_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            } else {
-                // TODO: dst not contiguous
-                GGML_ABORT("fatal error");
-            }
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else {
-        if (ggml_are_same_shape(src, dst)) {
+    if (ggml_are_same_shape(src0, dst)) {
+        if (dst->type == src0->type) {
             cann_copy(ctx, acl_src, acl_dst);
             ACL_CHECK(aclDestroyTensor(acl_src));
             ACL_CHECK(aclDestroyTensor(acl_dst));
             return;
+        } else {
+            aclnn_cast(ctx, acl_src, acl_dst, dst);
+        }
+    } else {
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+            if (dst->type == src0->type) {
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src0->data, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                return;
+            } else {
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(),
+                    ggml_nelements(dst) * ggml_type_size(dst->type));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = ggml_type_size(dst->type);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                    GGML_MAX_DIMS);
+
+                aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src_trans_buffer, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+                return;
+            }
+        } else if (ggml_is_contiguous(dst)) {
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = ggml_type_size(dst->type);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+            }
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                GGML_MAX_DIMS);
+
+            aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+
+            size_t cpy_size = ggml_nbytes(dst);
+            ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
+                                       cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                       ctx.stream()));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+            return;
+        } else {
+            GGML_ABORT("Unsupport dst is not tontiguous.");
         }
-        GGML_ABORT("fatal error");
     }
+
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }

 #ifdef __cplusplus
@@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
 }

-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
-
-    ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src0->extra = src0_extra_allocator.get();
-    src1->extra = src1_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
+/**
+ * @brief Performs embedding operation on a 4D tensor using the CANN backend.
+ *
+ * This function extracts slices from the source tensor (`src_buffer`),
+ * index tensor (`index`), and destination tensor (`dst`), and performs an
+ * embedding operation on them. The embedding operation is applied by iterating
+ * over the last two dimensions of the source tensor, creating the necessary
+ * tensors for the source, index, and output, and executing the embedding operation.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer holding the data for the source tensor.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param index The index tensor used in the embedding operation.
+ * @param dst The destination tensor where the result will be stored.
+ */
+static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
+                               int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
+                               ggml_tensor* dst) {
+    for (int64_t i = 0; i < src_ne[3]; i++) {
+        for (int64_t j = 0; j < src_ne[2]; j++) {
+            // src
+            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
+            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
+            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
+                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_src_ne, acl_src_nb, 2);
+
+            // index
+            int64_t acl_index_ne[1] = {index->ne[0]};
+            size_t acl_index_nb[1] = {index->nb[0]};
+            aclTensor* acl_index = ggml_cann_create_tensor(
+                (char*)index->data + i * index->nb[2] + j * index->nb[1],
+                ggml_cann_type_mapping(index->type), ggml_element_size(index),
+                acl_index_ne, acl_index_nb, 1);
+
+            // out
+            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
+            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
+            aclTensor* acl_out = ggml_cann_create_tensor(
+                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_out_ne, acl_out_nb, 2);
+
+            uint64_t workspaceSize = 0;
+            aclOpExecutor* executor;
+            void* workspaceAddr = nullptr;
+
+            ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
+                acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
+
+            if (workspaceSize > 0) {
+                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
+                                                         workspaceSize);
+                workspaceAddr = workspace_allocator.get();
+            }
+
+            ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+            ACL_CHECK(aclDestroyTensor(acl_src_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_index));
+            ACL_CHECK(aclDestroyTensor(acl_out));
+        }
+    }
+}
+
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0]; // src
+    ggml_tensor* src1 = dst->src[1]; // index

     switch (src0->type) {
         case GGML_TYPE_F32: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f32 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
-                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
-            }
-#endif
-            aclrtlaunch_ascendc_get_row_f32(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
+                               dst);
             break;
         }
         case GGML_TYPE_F16: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f16 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len =
-                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
-                    ggml_type_size(
-                        GGML_TYPE_F32);  // out is also f32, even input is f16
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
-            }
-#endif
-            aclrtlaunch_ascendc_get_row_f16(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+            }
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
+                src0->ne, src_trans_nb, GGML_MAX_DIMS);
+            aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
+            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
+                               src_trans_nb, src1, dst);
+            ACL_CHECK(aclDestroyTensor(acl_src0));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
             break;
         }
-        case GGML_TYPE_Q4_0:
-            aclrtlaunch_ascendc_get_row_q4_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
-            break;
-        case GGML_TYPE_Q8_0:
-            aclrtlaunch_ascendc_get_row_q8_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
-            break;
+        case GGML_TYPE_Q8_0: {
+            // add 1 dim for bcast mul.
+            size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
+                dequant_nb[GGML_MAX_DIMS + 1];
+            int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
+                *dequant_ne;
+            int64_t scale_offset = 0;
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            weight_ne[0] = QK8_0;
+            weight_ne[1] = src0->ne[0] / QK8_0;
+            weight_nb[0] = sizeof(int8_t);
+            weight_nb[1] = weight_nb[0] * weight_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                weight_ne[i] = src0->ne[i - 1];
+                weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,1]
+            scale_ne[0] = 1;
+            scale_ne[1] = src0->ne[0] / QK8_0;
+            scale_nb[0] = sizeof(uint16_t);
+            scale_nb[1] = scale_nb[0] * scale_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                scale_ne[i] = src0->ne[i - 1];
+                scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            dequant_ne = weight_ne;
+            dequant_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+            }
+
+            scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+            ggml_cann_pool_alloc dequant_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+
+            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
+                GGML_MAX_DIMS + 1);
+            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
+                GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+            aclTensor* dequant_tensor = ggml_cann_create_tensor(
+                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
+                dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+
+            aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
+            dequant_nb[0] = sizeof(float_t);
+            dequant_ne = src0->ne;
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+            }
+
+            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
+                               dequant_ne, dequant_nb, src1, dst);
+
+            ACL_CHECK(aclDestroyTensor(dequant_tensor));
+            break;
+        }
         default:
-            GGML_ABORT("fatal error");
+            GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
             break;
     }
 }
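A worked reading of the Q8_0 views above, assuming QK8_0 == 32 (ggml's Q8_0 block size; the `[3,4,5,64]` comments in the code show the same example):

```cpp
// weights: [3,4,5,64] int8 viewed as [3,4,5,2,32]: each row of 64 weights splits into 2 blocks of 32
// scales:  one fp16 per block, viewed as [3,4,5,2,1]; they live after all the weights in the buffer,
//          hence scale_offset = ggml_nelements(src0) * sizeof(int8_t)
// The broadcast aclnn_mul multiplies all 32 weights of a block by that block's scale,
// producing a dequantized f32 buffer that aclnn_embedding_4d gathers rows from,
// just like the F32 path.
```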
@@ -2797,8 +2812,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

     ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
         acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-        nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
-        &workspaceSize, &executor));
+        nullptr, nullptr, nullptr, antiquantGroupSize,
+        acl_output_tensor, &workspaceSize, &executor));
     if (workspaceAddr == nullptr) {
         workspaceAddr = workspace_allocator.alloc(workspaceSize);
     }
@@ -1704,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q8_0:
                     return true;
                 default:
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
(src->type != GGML_TYPE_F32 &&
|
||||
src->type != GGML_TYPE_F16)) {
|
||||
// only support F32 and F16.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
||||
// unsupport dst is not contiguous.
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} break;
|
||||
case GGML_OP_CONT: {
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
@@ -1762,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
             return true;
         }
-        case GGML_OP_DUP:
         case GGML_OP_IM2COL:
         case GGML_OP_CONCAT:
+        case GGML_OP_DUP:
        case GGML_OP_REPEAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
@@ -31,6 +31,8 @@
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
+#include "ggml-cuda/ssm-conv.cuh"
+#include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
@@ -2296,6 +2298,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
+        case GGML_OP_SSM_CONV:
+            ggml_cuda_op_ssm_conv(ctx, dst);
+            break;
+        case GGML_OP_SSM_SCAN:
+            ggml_cuda_op_ssm_scan(ctx, dst);
+            break;
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
@@ -3193,6 +3201,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_LOG:
+        case GGML_OP_SSM_SCAN:
+        case GGML_OP_SSM_CONV:
             return true;
         case GGML_OP_CONT:
             return op->src[0]->type != GGML_TYPE_BF16;
ggml/src/ggml-cuda/ssm-conv.cu (new file, 151 lines)
@@ -0,0 +1,151 @@
#include "ssm-conv.cuh"

template <size_t split_d_inner, size_t d_conv>
static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
                                    const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
                                    float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
                                    const int nc, const int ncs, const int nr, const int n_t, const int n_s) {
    const int tid  = threadIdx.x;
    const int bidx = blockIdx.x;
    const int bidy = blockIdx.y;

    const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1);
    const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
    float *       y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0);

    const int stride_x = src0_nb1 / sizeof(float);
    const int stride_w = src1_nb1 / sizeof(float);
    const int stride_y = dst_nb1 / sizeof(float);

    float x[d_conv] = { 0.0f };
    float w[d_conv] = { 0.0f };

#pragma unroll
    for (int j = 0; j < d_conv; j++) {
        w[j] = w_block[tid * stride_w + j];
    }

    for (int i = 0; i < n_t; i++) {
        float sumf = 0.0f;

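        // Sliding window kept in registers: the first timestep loads all d_conv
        // taps; each later timestep overwrites only the oldest tap, at index
        // (i - 1) % d_conv, so just one new input element is read per step.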
        if (i == 0) {
            for (int j = 0; j < d_conv; j++) {
                x[j] = x_block[tid * stride_x + j];
            }
        } else {
            x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
        }

#pragma unroll
        for (int j = 0; j < d_conv; j++) {
            sumf += x[(i + j) % d_conv] * w[j];
        }
        y_block[i * stride_y + tid] = sumf;
    }
}

template <size_t split_d_inner, size_t d_conv, size_t split_n_t>
static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
                                               const int src0_nb0, const int src0_nb1, const int src0_nb2,
                                               const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
                                               const int dst_nb1, const int dst_nb2, const int nc, const int ncs,
                                               const int nr, const int n_t, const int n_s) {
    const int tid  = threadIdx.x;
    const int bidx = blockIdx.x;
    const int bidy = blockIdx.y;
    const int bidz = blockIdx.z;

    const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 +
                                             bidz * split_n_t * src0_nb0);
    const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
    float *       y_block =
        (float *) ((char *) dst + bidx * dst_nb2 + bidz * split_n_t * dst_nb1 + bidy * split_d_inner * dst_nb0);

    const int stride_x = src0_nb1 / sizeof(float);
    const int stride_w = src1_nb1 / sizeof(float);
    const int stride_y = dst_nb1 / sizeof(float);

    float x[d_conv] = { 0.0f };
    float w[d_conv] = { 0.0f };

#pragma unroll
    for (int j = 0; j < d_conv; j++) {
        w[j] = w_block[tid * stride_w + j];
    }

#pragma unroll
    for (int i = 0; i < split_n_t; i++) {
        if (bidz * split_n_t + i < n_t) {
            float sumf = 0.0f;

            if (i == 0) {
                for (int j = 0; j < d_conv; j++) {
                    x[j] = x_block[tid * stride_x + j];
                }
            } else {
                x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
            }

#pragma unroll
            for (int j = 0; j < d_conv; j++) {
                sumf += x[(i + j) % d_conv] * w[j];
            }
            y_block[i * stride_y + tid] = sumf;
        }
    }
}

static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
                              const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
                              const int dst_nb2, const int nc, const int ncs, const int nr, const int n_t,
                              const int n_s, cudaStream_t stream) {
    const int threads = 128;
    GGML_ASSERT(nr % threads == 0);

    if (n_t <= 32) {
        const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
        if (nc == 4) {
            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
                                                                     dst, dst_nb0, dst_nb1, dst_nb2, nc, ncs, nr, n_t,
                                                                     n_s);
        } else {
            GGML_ABORT("Only support kernel size = 4 now.");
        }
    } else {
        if (nc == 4) {
            const int split_n_t = 32;
            dim3      blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
            ssm_conv_long_token_f32<threads, 4, split_n_t>
                <<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0,
                                                 dst_nb1, dst_nb2, nc, ncs, nr, n_t, n_s);
        } else {
            GGML_ABORT("Only support kernel size = 4 right now.");
        }
    }
}

void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];  // conv_x
    const struct ggml_tensor * src1 = dst->src[1];  // conv1d.weight

    const int nc  = src1->ne[0];  // d_conv
    const int ncs = src0->ne[0];  // d_conv - 1 + n_t
    const int nr  = src0->ne[1];  // d_inner
    const int n_t = dst->ne[1];   // tokens per sequence
    const int n_s = dst->ne[2];   // number of sequences in the batch

    GGML_ASSERT(dst->ne[0] == nr);
    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));
    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));

    const float * src0_d = (const float *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float *       dst_d  = (float *) dst->data;
    cudaStream_t  stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, dst->nb[0], dst->nb[1],
                      dst->nb[2], nc, ncs, nr, n_t, n_s, stream);
}
ggml/src/ggml-cuda/ssm-conv.cuh (new file, 3 lines)
@@ -0,0 +1,3 @@
#include "common.cuh"

void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml/src/ggml-cuda/ssm-scan.cu (new file, 155 lines)
@@ -0,0 +1,155 @@
#include "ssm-scan.cuh"

// #include <cuda_runtime.h>
// static __device__ void global_to_shared(const float *src, float *dst) {
//     asm volatile("cp.async.");
// }

template <size_t splitD, size_t N>
__global__ void __launch_bounds__(splitD, 2)
    ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
                 const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
                 const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2,
                 const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
                 const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
                 float * __restrict__ dst, const int D, const int L, const int B) {
    const int bidx = blockIdx.x;  // split along B
    const int bidy = blockIdx.y;  // split along D
    const int tid  = threadIdx.x;
    const int wid  = tid / 32;
    const int wtid = tid % 32;

    extern __shared__ float smem[];
    const int               stride_sA  = N + 1;
    const int               stride_ss0 = N + 1;
    float *                 smem_A     = smem;
    float *                 smem_s0    = smem_A + splitD * stride_sA;

    const float * s0_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
    const float * x_block  = (const float *) ((char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
    const float * dt_block = (const float *) ((char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
    const float * A_block  = (const float *) ((char *) src3 + bidy * splitD * src3_nb1);
    const float * B_block  = (const float *) ((char *) src4 + (bidx * src4_nb2));
    const float * C_block  = (const float *) ((char *) src5 + (bidx * src5_nb2));
    float *       y_block  = (float *) ((char *) dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
    float *       s_block  = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1);

    const int stride_s0 = src0_nb1 / sizeof(float);
    const int stride_x  = src1_nb1 / sizeof(float);
    const int stride_dt = src2_nb1 / sizeof(float);
    const int stride_A  = src3_nb1 / sizeof(float);
    const int stride_B  = src4_nb1 / sizeof(float);
    const int stride_C  = src5_nb1 / sizeof(float);
    const int stride_s  = stride_s0;
    const int stride_y  = stride_x;

    // can N not be 16? for example 32?
    if (N == 16) {
#pragma unroll
        for (int i = 0; i < splitD / 4; i += 2) {
            float value = A_block[(wid * warpSize + i) * stride_A + wtid];
            // todo: bank conflict
            // I am always confused with how to use the swizzling method to solve
            // bank conflit. Hoping somebody can tell me.
            smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
        }
#pragma unroll
        for (int i = 0; i < splitD / 4; i += 2) {
            float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
            smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
        }
    }

    __syncthreads();

    for (int i = 0; i < L; i++) {
        float dt_soft_plus = dt_block[i * stride_dt + tid];
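        // softplus(dt) = log(1 + exp(dt)); for dt > 20 the result is numerically
        // ~= dt itself, so the explicit form is skipped, avoiding overflow in exp().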
        if (dt_soft_plus <= 20.0f) {
            dt_soft_plus = log1pf(exp(dt_soft_plus));
        }
        float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
        float sumf = 0.0f;
#pragma unroll
        for (int j = 0; j < N; j++) {
            float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
                          (B_block[i * stride_B + j] * x_dt);
            sumf += state * C_block[i * stride_C + j];
            if (i == L - 1) {
                s_block[tid * stride_s + j] = state;
            } else {
                smem_s0[tid * stride_ss0 + j] = state;
            }
        }
        __syncthreads();
        y_block[i * stride_y + tid] = sumf;
    }
}

static void ssm_scan_f32_cuda(const float * src0, const float * src1, const float * src2, const float * src3,
                              const float * src4, const float * src5, const int src0_nb1, const int src0_nb2,
                              const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
                              const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
                              const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
                              float * dst, const int N, const int D, const int L, const int B, cudaStream_t stream) {
    const int threads = 128;
    // todo: consider D cannot be divided,does this situation exist?
    GGML_ASSERT(D % threads == 0);
    const dim3 blocks(B, (D + threads - 1) / threads, 1);
    const int  smem_size = (threads * (N + 1) * 2) * sizeof(float);
    if (N == 16) {
        ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
            src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0,
            src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, D, L, B);
    } else {
        GGML_ABORT("doesn't support N!=16.");
    }
}

void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];  // s
    const struct ggml_tensor * src1 = dst->src[1];  // x
    const struct ggml_tensor * src2 = dst->src[2];  // dt
    const struct ggml_tensor * src3 = dst->src[3];  // A
    const struct ggml_tensor * src4 = dst->src[4];  // B
    const struct ggml_tensor * src5 = dst->src[5];  // C

    // const int64_t d_state = src0->ne[0];
    // const int64_t d_inner = src0->ne[1];
    // const int64_t l = src1->ne[1];
    // const int64_t b = src0->ne[2];

    const int64_t nc  = src0->ne[0];  // d_state
    const int64_t nr  = src0->ne[1];  // d_inner
    const int64_t n_t = src1->ne[1];  // number of tokens per sequence
    const int64_t n_s = src0->ne[2];  // number of sequences in the batch

    GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));
    GGML_ASSERT(src2->nb[0] == sizeof(float));
    GGML_ASSERT(src3->nb[0] == sizeof(float));
    GGML_ASSERT(src4->nb[0] == sizeof(float));
    GGML_ASSERT(src5->nb[0] == sizeof(float));
    // required for the dot product between s and C
    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
    // required for per-sequence offsets for states
    GGML_ASSERT(src0->nb[2] == src0->ne[0] * src0->ne[1] * sizeof(float));
    // required to get correct offset for state destination (i.e. src1->nb[3])
    GGML_ASSERT(src1->nb[3] == src1->ne[0] * src1->ne[1] * src1->ne[2] * sizeof(float));

    const float * src0_d = (const float *) src0->data;
    const float * src1_d = (const float *) src1->data;
    const float * src2_d = (const float *) src2->data;
    const float * src3_d = (const float *) src3->data;
    const float * src4_d = (const float *) src4->data;
    const float * src5_d = (const float *) src5->data;
    float *       dst_d  = (float *) dst->data;
    cudaStream_t  stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src0->nb[1], src0->nb[2], src1->nb[0],
                      src1->nb[1], src1->nb[2], src1->nb[3], src2->nb[0], src2->nb[1], src2->nb[2], src3->nb[1],
                      src4->nb[1], src4->nb[2], src5->nb[1], src5->nb[2], dst_d, nc, nr, n_t, n_s, stream);
}
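As read from the kernel above (an interpretation, not part of the patch), the inner loop implements the discretized selective-scan recurrence used by Mamba-style SSMs:

    h_t = exp(softplus(dt_t) * A) * h_{t-1} + softplus(dt_t) * x_t * B_t
    y_t = C_t · h_t

with the running state kept in shared memory and written back to the output tensor (past the `x` region, via `src1_nb3`) only for the last token of each sequence.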
ggml/src/ggml-cuda/ssm-scan.cuh (new file, 3 lines)
@@ -0,0 +1,3 @@
#include "common.cuh"

void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -4179,7 +4179,7 @@ static void ggml_metal_encode_node(
                //       ne00*(nsg)
                //  each simdgroup has a full f16 head vector in shared mem to accumulate results
                //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 2*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))

                int64_t nsgmax = 2;
                while (true) {
@@ -3184,8 +3184,8 @@ kernel void kernel_flash_attn_ext(
    threadgroup_barrier(mem_flags::mem_threadgroup);

    {
-       half S[Q] = { [0 ... Q-1] = 0.0f };
-       half M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+       float S[Q] = { [0 ... Q-1] = 0.0f };
+       float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };

        // thread indices inside the simdgroup
        // TODO: see if we can utilize quad-group functions for better performance
@@ -3202,13 +3202,13 @@ kernel void kernel_flash_attn_ext(

        const bool has_mask = mask != q;

-       half slope = 1.0f;
+       float slope = 1.0f;

        // ALiBi
        if (args.max_bias > 0.0f) {
            const short h = iq2;

-           const half  base = h < args.n_head_log2 ? args.m0 : args.m1;
+           const float base = h < args.n_head_log2 ? args.m0 : args.m1;
            const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;

            slope = pow(base, exph);
@@ -3224,14 +3224,14 @@ kernel void kernel_flash_attn_ext(

            if (has_mask) {
                // used to detect blocks full of -INF
-               half smax = -INFINITY;
+               float smax = -INFINITY;

                // load the mask in shared memory
                #pragma unroll(Q)
                for (short j = 0; j < Q; ++j) {
                    device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31);

-                   const half m = pm[ic + tiisg];
+                   const float m = pm[ic + tiisg];

                    ss[j*TS + C + tiisg] = m;
                    smax = max(smax, m);
@@ -3327,10 +3327,10 @@ kernel void kernel_flash_attn_ext(
            // online softmax
            {
                for (ushort j = 0; j < Q; ++j) {
-                   const half m = M[j];
+                   const float m = M[j];

                    // scale and apply the logitcap / mask
-                   half s = ss[j*TS + tiisg]*args.scale;
+                   float s = ss[j*TS + tiisg]*args.scale;

                    if (args.logit_softcap != 0.0f) {
                        s = args.logit_softcap*precise::tanh(s);
@@ -3341,8 +3341,8 @@ kernel void kernel_flash_attn_ext(

                    M[j] = simd_max(max(M[j], s));

-                   const half ms = exp(m - M[j]);
-                   const half vs = exp(s - M[j]);
+                   const float ms = exp(m - M[j]);
+                   const float vs = exp(s - M[j]);

                    S[j] = S[j]*ms + simd_sum(vs);

@@ -3444,8 +3444,8 @@ kernel void kernel_flash_attn_ext(

        // reduce the warps sequentially
        for (ushort sg = 1; sg < nsg; ++sg) {
-           half S = { 0.0f };
-           half M = { -__FLT16_MAX__/2 };
+           float S = { 0.0f };
+           float M = { -__FLT16_MAX__/2 };

            threadgroup_barrier(mem_flags::mem_threadgroup);

@@ -3461,16 +3461,16 @@ kernel void kernel_flash_attn_ext(
            // the first simdgroup accumulates the results from the other simdgroups
            if (sgitg == 0) {
                for (short j = 0; j < Q; ++j) {
-                   const half S0 = ss[j*TS +         0];
-                   const half S1 = ss[j*TS + sg*SH + 0];
+                   const float S0 = ss[j*TS +         0];
+                   const float S1 = ss[j*TS + sg*SH + 0];

-                   const half M0 = ss[j*TS +         1];
-                   const half M1 = ss[j*TS + sg*SH + 1];
+                   const float M0 = ss[j*TS +         1];
+                   const float M1 = ss[j*TS + sg*SH + 1];

                    M = max(M0, M1);

-                   const half ms0 = exp(M0 - M);
-                   const half ms1 = exp(M1 - M);
+                   const float ms0 = exp(M0 - M);
+                   const float ms1 = exp(M1 - M);

                    S = S0*ms0 + S1*ms1;

@@ -3646,16 +3646,16 @@ kernel void kernel_flash_attn_ext_vec(
    constexpr short DV4 = DV/4;
    constexpr short NW  = N_SIMDWIDTH;
    constexpr short NL  = NW/NE; // note: this can be adjusted to support different head sizes and simdgroup work loads
-   constexpr short SH  = 2*C;   // shared memory per simdgroup
+   constexpr short SH  = 4*C;   // shared memory per simdgroup

    const short T = DK + nsg*SH; // shared memory size per query in (half)

-  //threadgroup q_t   * sq  = (threadgroup q_t   *) (shmem_f16 +              0*DK); // holds the query data
-   threadgroup q4_t  * sq4 = (threadgroup q4_t  *) (shmem_f16 +              0*DK); // same as above but in q4_t
-   threadgroup s_t   * ss  = (threadgroup s_t   *) (shmem_f16 + sgitg*SH     + Q*DK); // scratch buffer for attention
-   threadgroup s4_t  * ss4 = (threadgroup s4_t  *) (shmem_f16 + sgitg*SH     + Q*DK); // same as above but in s4_t
-   threadgroup half  * sm  = (threadgroup half  *) (shmem_f16 + sgitg*SH + C + Q*DK); // scratch buffer for mask
-   threadgroup o4_t  * sr4 = (threadgroup o4_t  *) (shmem_f16 + sgitg*DV     + Q*T);  // scratch buffer for the results
+  //threadgroup q_t   * sq  = (threadgroup q_t   *) (shmem_f16 +                0*DK); // holds the query data
+   threadgroup q4_t  * sq4 = (threadgroup q4_t  *) (shmem_f16 +                0*DK); // same as above but in q4_t
+   threadgroup s_t   * ss  = (threadgroup s_t   *) (shmem_f16 + sgitg*SH       + Q*DK); // scratch buffer for attention
+   threadgroup s4_t  * ss4 = (threadgroup s4_t  *) (shmem_f16 + sgitg*SH       + Q*DK); // same as above but in s4_t
+   threadgroup float * sm  = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask
+   threadgroup o4_t  * sr4 = (threadgroup o4_t  *) (shmem_f16 + sgitg*DV       + Q*T);  // scratch buffer for the results

    // store the result for all queries in local memory (the O matrix from the paper)
    o4_t lo[DV4/NL];
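A note on the arithmetic in this hunk (inferred from the code rather than stated in the patch): offsets into `shmem_f16` are counted in half-sized (2-byte) units, so scratch values stored as `float` occupy two units each. That is why the per-simdgroup scratch `SH` grows from `2*C` to `4*C`, the mask scratch `sm` moves from offset `C` to `2*C`, and the host-side `FATTN_SMEM` estimate earlier changes `2*ncpsg` to `4*ncpsg`. Likewise, the parallel reduce further below indexes `ss` with `r*(SH/2)` because `ss` is now a `float` pointer, which advances twice as many bytes per element.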
@@ -3684,8 +3684,8 @@ kernel void kernel_flash_attn_ext_vec(
    threadgroup_barrier(mem_flags::mem_threadgroup);

    {
-       half S = 0.0f;
-       half M = -__FLT16_MAX__/2;
+       float S = 0.0f;
+       float M = -__FLT16_MAX__/2;

        // thread indices inside the simdgroup
        const short tx = tiisg%NL;
@@ -3703,13 +3703,13 @@ kernel void kernel_flash_attn_ext_vec(
        // pointer to the mask
        device const half * pm = (device const half *) (mask + iq1*args.nb31);

-       half slope = 1.0f;
+       float slope = 1.0f;

        // ALiBi
        if (args.max_bias > 0.0f) {
            const short h = iq2;

-           const half  base = h < args.n_head_log2 ? args.m0 : args.m1;
+           const float base = h < args.n_head_log2 ? args.m0 : args.m1;
            const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;

            slope = pow(base, exph);
@@ -3799,13 +3799,13 @@ kernel void kernel_flash_attn_ext_vec(

            // online softmax
            {
-               const half m = M;
-               const half s = ss[tiisg];
+               const float m = M;
+               const float s = ss[tiisg];

                M = simd_max(max(M, s));

-               const half ms = exp(m - M);
-               const half vs = exp(s - M);
+               const float ms = exp(m - M);
+               const float vs = exp(s - M);

                S = S*ms + simd_sum(vs);

@@ -3836,7 +3836,7 @@ kernel void kernel_flash_attn_ext_vec(
                        v4_t mv;
                        deq_v_t4(pv4 + i/nl_v, i%nl_v, mv);

-                       lo[ii/NL] += mv*ms;
+                       lo[ii/NL] += o4_t(float4(mv)*float4(ms));
                    }
                }
            }
@@ -3907,18 +3907,18 @@ kernel void kernel_flash_attn_ext_vec(
    // parallel reduce
    for (short r = nsg/2; r > 0; r >>= 1) {
        if (sgitg < r) {
-           const half S0 = ss[       0];
-           const half S1 = ss[r*SH + 0];
+           const float S0 = ss[           0];
+           const float S1 = ss[r*(SH/2) + 0];

-           const half M0 = ss[       1];
-           const half M1 = ss[r*SH + 1];
+           const float M0 = ss[           1];
+           const float M1 = ss[r*(SH/2) + 1];

-           const half M = max(M0, M1);
+           const float M = max(M0, M1);

-           const half ms0 = exp(M0 - M);
-           const half ms1 = exp(M1 - M);
+           const float ms0 = exp(M0 - M);
+           const float ms1 = exp(M1 - M);

-           const half S = S0*ms0 + S1*ms1;
+           const float S = S0*ms0 + S1*ms1;

            if (tiisg == 0) {
                ss[0] = S;
@@ -3950,11 +3950,11 @@ kernel void kernel_flash_attn_ext_vec(
//  in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
//
#define FA_TYPES \
-    half4, \
-    half4, \
-    half4, \
-    float, \
-    half,  half4,  \
+    half4, \
+    half4, \
+    half4, \
+    float, \
+    float, float4, \
     half4

typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
@@ -921,10 +921,33 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
    CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));

+   // TODO: fixme: these sizes are hardcoded for now.
+   // they should be allocated based on the model's size
+   // and the device's max alloc size
+   size_t max_alloc_size;
+   CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, NULL));
+
    // Allocate intermediate buffers and images
-   size_t max_A_q_d_bytes = 311164928;
-   size_t max_A_s_d_bytes = 38895616;
-   size_t max_B_d_bytes   = 45088768;
+   size_t required_A_q_d_bytes = 311164928;
+   size_t required_A_s_d_bytes = 38895616;
+   size_t required_B_d_bytes   = 45088768;
+
+   // Ensure buffer sizes do not exceed the maximum allocation size
+   size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, max_alloc_size);
+   size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, max_alloc_size);
+   size_t max_B_d_bytes   = MIN(required_B_d_bytes, max_alloc_size);
+   if (required_A_q_d_bytes > max_alloc_size) {
+       GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
+                     required_A_q_d_bytes, max_A_q_d_bytes);
+   }
+   if (required_A_s_d_bytes > max_alloc_size) {
+       GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
+                     required_A_s_d_bytes, max_A_s_d_bytes);
+   }
+   if (required_B_d_bytes > max_alloc_size) {
+       GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
+                     required_B_d_bytes, max_B_d_bytes);
+   }

    CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
    CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
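The clamp matters because `clCreateBuffer` fails with `CL_INVALID_BUFFER_SIZE` when the requested size exceeds `CL_DEVICE_MAX_MEM_ALLOC_SIZE`, so capping the request (and logging a warning) keeps backend initialization from failing outright on devices with smaller allocation limits.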
@@ -23,6 +23,23 @@ ggml_add_backend_library(ggml-sycl
    ../../include/ggml-sycl.h
)

+file(GLOB   GGML_HEADERS_SYCL "*.hpp")
+file(GLOB   GGML_SOURCES_SYCL "*.cpp")
+target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
+
+find_package(IntelSYCL)
+if (IntelSYCL_FOUND)
+    # Use oneAPI CMake when possible
+    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
+else()
+    # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
+    target_compile_options(ggml-sycl PRIVATE "-fsycl")
+    target_link_options(ggml-sycl PRIVATE "-fsycl")
+endif()
+
+target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
+
 # Link against oneDNN
 find_package(DNNL)
 set(GGML_SYCL_DNNL 0)
 if(DNNL_FOUND)
@ -62,8 +79,6 @@ if (GGML_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
@ -76,34 +91,84 @@ else()
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
endif()
|
||||
|
||||
file(GLOB GGML_HEADERS_SYCL "*.hpp")
|
||||
file(GLOB GGML_SOURCES_SYCL "*.cpp")
|
||||
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
|
||||
if (GGML_SYCL_GRAPH)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
|
||||
endif()
|
||||
|
||||
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
# Link against Intel oneMKL or oneMath
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
||||
# See https://github.com/uxlfoundation/oneMath/issues/654
|
||||
find_package(MKL REQUIRED)
|
||||
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
else()
|
||||
if (GGML_SYCL_GRAPH)
|
||||
add_compile_definitions(GGML_SYCL_GRAPH)
|
||||
find_package(oneMath QUIET)
|
||||
if (NOT oneMath_FOUND)
|
||||
message(STATUS "oneMath not found: oneMath will be automatically downloaded")
|
||||
# Use FetchContent to automatically pull and build oneMath
|
||||
include(FetchContent)
|
||||
set(BUILD_FUNCTIONAL_TESTS False)
|
||||
set(BUILD_EXAMPLES False)
|
||||
set(TARGET_DOMAINS blas)
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_CUBLAS_BACKEND True)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_ROCBLAS_BACKEND True)
|
||||
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
|
||||
cmake_policy(SET CMP0126 NEW)
|
||||
# Setting the device architecture is only needed and useful for AMD devices in oneMath
|
||||
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
|
||||
endif()
|
||||
FetchContent_Declare(
|
||||
ONEMATH
|
||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
|
||||
)
|
||||
FetchContent_MakeAvailable(ONEMATH)
|
||||
# Create alias to match with find_package targets name
|
||||
function(onemath_alias target)
|
||||
if (TARGET ${target}_obj)
|
||||
# Silence verbose warnings from external libraries
|
||||
target_compile_options(${target}_obj PRIVATE -w)
|
||||
endif()
|
||||
if (TARGET ${target})
|
||||
add_library(ONEMATH::${target} ALIAS ${target})
|
||||
endif()
|
||||
endfunction()
|
||||
onemath_alias(onemath)
|
||||
onemath_alias(onemath_blas_mklcpu)
|
||||
onemath_alias(onemath_blas_mklgpu)
|
||||
onemath_alias(onemath_blas_cublas)
|
||||
onemath_alias(onemath_blas_rocblas)
|
||||
endif()
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
add_compile_definitions(GGML_SYCL_NVIDIA)
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
|
||||
|
||||
# Below oneMath compile-time dispatching is used for better performance
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
if (NOT GGML_SYCL_DEVICE_ARCH)
|
||||
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
|
||||
endif()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
|
||||
else()
|
||||
# Fallback to oneMath runtime dispatcher
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
endif()
|
||||
|
@ -16,9 +16,18 @@
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <syclcompat/math.hpp>
#include <oneapi/mkl.hpp>
#include <map>

#ifdef GGML_SYCL_USE_INTEL_ONEMKL
#include <oneapi/mkl.hpp>
// Allow to use the same namespace for Intel oneMKL and oneMath
namespace oneapi {
    namespace math = mkl;
}
#else
#include <oneapi/math.hpp>
#endif

#include "ggml.h"

#if defined(__linux__)
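The `namespace math = mkl;` alias above is the whole trick: everything downstream spells `oneapi::math::...`, and the alias decides at compile time whether that resolves to Intel oneMKL or to oneMath. A self-contained toy illustrating the same mechanism (hypothetical namespaces, not the real headers):

// Toy version of the include block above: code written against ns::math
// compiles unchanged whichever backend is selected by the build define.
namespace vendor { namespace mkl { enum class transpose { trans, nontrans }; } }

#ifdef USE_VENDOR_MKL   // stands in for GGML_SYCL_USE_INTEL_ONEMKL
namespace ns { namespace math = vendor::mkl; }
#else
namespace ns { namespace math { enum class transpose { trans, nontrans }; } }
#endif

int main() {
    ns::math::transpose t = ns::math::transpose::nontrans;  // one spelling, two backends
    (void) t;
    return 0;
}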
@ -83,13 +92,32 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
}

template <typename Ts> struct matrix_info_t {
    oneapi::mkl::transpose transpose_info[2];
    oneapi::math::transpose transpose_info[2];
    Ts value_info[2];
    std::int64_t size_info[3];
    std::int64_t ld_info[3];
    std::int64_t groupsize_info;
};

inline auto get_onemath_backend(sycl::queue& queue)
#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
    -> sycl::queue&
#endif
{
    // If the backend is known at compile-time, use oneMath backend_selector to use
    // compile-time dispatching and avoid the need to dlopen libraries. Otherwise
    // fallback to runtime dispatching.
#if defined(GGML_SYCL_NVIDIA)
    return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
#elif defined(GGML_SYCL_AMD)
    return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
    return queue;
#else
    static_assert(false, "Unsupported backend");
#endif
}
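Because `get_onemath_backend` returns either the raw queue or a `backend_selector`, call sites stay byte-identical across builds. A hedged sketch of such a call site (the includes and the GGML_SYCL_* define are assumed to come from the header above; the scale values are arbitrary):

#include <sycl/sycl.hpp>

// Hypothetical call site: on NVIDIA/AMD builds the selector makes the BLAS
// call dispatch at compile time; on generic/Intel builds the plain queue
// makes it dispatch at run time (or go straight to oneMKL).
static void gemm_square_f32(sycl::queue & q, const float * a, const float * b, float * c, int n) {
    oneapi::math::blas::column_major::gemm(
        get_onemath_backend(q),
        oneapi::math::transpose::nontrans, oneapi::math::transpose::nontrans,
        n, n, n, 1.0f, a, n, b, n, 0.0f, c, n);
}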
namespace dpct
{
    typedef sycl::queue *queue_ptr;
@ -1686,26 +1714,18 @@ namespace dpct

    namespace detail
    {
        template <class Ta, class Tb, class Tc, class Ts>
        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
                              oneapi::mkl::transpose b_trans, int m, int n, int k,
                              const void *alpha, const void *a, int lda, const void *b,
                              int ldb, const void *beta, void *c, int ldc)
        {
            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
            auto data_a = get_memory<const Ta>(a);
            auto data_b = get_memory<const Tb>(b);
            auto data_c = get_memory<Tc>(c);
#ifdef GGML_SYCL_NVIDIA
            oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
                                                  a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
                                                  beta_value, data_c, ldc);
#else
            oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
                                                  beta_value, data_c, ldc);
#endif
        }
        template <class Ta, class Tb, class Tc, class Ts>
        inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
                              int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
                              const void * beta, void * c, int ldc) {
            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
            auto data_a = get_memory<const Ta>(a);
            auto data_b = get_memory<const Tb>(b);
            auto data_c = get_memory<Tc>(c);
            oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
                                                   lda, data_b, ldb, beta_value, data_c, ldc);
        }

        template <typename VecT, class BinaryOperation, class = void>
        class vectorized_binary
@ -1735,7 +1755,7 @@ namespace dpct
        };

        template <class Ta, class Tb, class Tc, class Ts>
        inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
        inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
                                    int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
                                    int ldb, const void * beta, void ** c, int ldc, int batch_size,
                                    matrix_info_t<float> * matrix_info) {
@ -1754,48 +1774,28 @@ namespace dpct
            matrix_info->ld_info[2] = ldc;
            matrix_info->groupsize_info = batch_size;

#ifdef GGML_SYCL_NVIDIA
            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
                matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
                matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
                matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
                reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#else
            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
                q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
                matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
                matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
                reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif
            sycl::event e = oneapi::math::blas::column_major::gemm_batch(
                get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
                matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
                reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
                reinterpret_cast<Ts *>(matrix_info->value_info + 1), reinterpret_cast<Tc **>(c),
                matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
        }

        template <class Ta, class Tb, class Tc, class Ts>
        inline void
        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
                        oneapi::mkl::transpose b_trans, int m, int n,
                        int k, const void *alpha, const void *a, int lda,
                        long long int stride_a, const void *b, int ldb,
                        long long int stride_b, const void *beta, void *c,
                        int ldc, long long int stride_c, int batch_size)
        {
        inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
                                    int m, int n, int k, const void * alpha, const void * a, int lda,
                                    long long int stride_a, const void * b, int ldb, long long int stride_b,
                                    const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
            auto data_a = get_memory<const Ta>(a);
            auto data_b = get_memory<const Tb>(b);
            auto data_c = get_memory<Tc>(c);
#ifdef GGML_SYCL_NVIDIA
            oneapi::mkl::blas::column_major::gemm_batch(
                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, a_trans, b_trans, m, n, k,
                alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c,
                batch_size);
#else
            oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
                                                        stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc,
                                                        stride_c, batch_size);
#endif
            oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
                                                         data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
                                                         data_c, ldc, stride_c, batch_size);
        }

    } // namespace detail
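One detail of the pointer-array `gemm_batch` above is easy to miss: oneMath's grouped API takes arrays of per-group parameters, and this code packs the whole batch into a single group. A reduced sketch of that packing (using the `matrix_info_t` layout defined earlier; keeping the arrays alive in `info` matters because the call is asynchronous and reads them later):

// All matrices in the batch share one (transpose, m/n/k, ld, alpha, beta),
// so group_count == 1 and the lone group size equals batch_size.
template <class T>
static void pack_single_group(matrix_info_t<T> & info,
                              oneapi::math::transpose ta, oneapi::math::transpose tb,
                              int m, int n, int k, T alpha, T beta,
                              int lda, int ldb, int ldc, int batch_size) {
    info.transpose_info[0] = ta;    info.transpose_info[1] = tb;
    info.value_info[0]     = alpha; info.value_info[1]     = beta;
    info.size_info[0] = m;  info.size_info[1] = n;  info.size_info[2] = k;
    info.ld_info[0]  = lda; info.ld_info[1]  = ldb; info.ld_info[2]  = ldc;
    info.groupsize_info = batch_size;
}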
@ -2259,13 +2259,10 @@ namespace dpct
                          sycl::range<3>(x, y, 1), direction);
    }

    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
                     oneapi::mkl::transpose b_trans, int m, int n, int k,
                     const void *alpha, const void *a, library_data_t a_type,
                     int lda, const void *b, library_data_t b_type, int ldb,
                     const void *beta, void *c, library_data_t c_type, int ldc,
                     library_data_t scaling_type)
    {
    inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
                     int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
                     library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
                     library_data_t scaling_type) {
        if (scaling_type == library_data_t::real_float &&
            c_type == library_data_t::complex_float)
        {
@ -2329,9 +2326,8 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_float, library_data_t::real_float):
        {
            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
                                     ldb, beta, c, ldc);
            detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
            break;
        }
        case detail::get_type_combination_id(
@ -2369,8 +2365,7 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_bfloat16, library_data_t::real_float):
        {
            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
                              oneapi::mkl::bfloat16, float>(
            detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
            break;
        }
@ -2390,7 +2385,7 @@ namespace dpct
        default:
            throw std::runtime_error("the combination of data type is unsupported");
        }
    } // gemm()

    /// Computes a batch of matrix-matrix product with general matrices.
    /// \param [in] q The queue where the routine should be executed.
@ -2412,7 +2407,7 @@ namespace dpct
    /// \param [in] ldc Leading dimension of C.
    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
    /// \param [in] scaling_type Data type of the scaling factors.
    inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
    inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
                           int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
                           const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
                           library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
@ -2450,7 +2445,7 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_bfloat16, library_data_t::real_float):
        {
            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
@ -2458,7 +2453,7 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_float, library_data_t::real_float):
        {
            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
@ -2534,15 +2529,11 @@ namespace dpct
    /// \param [in] stride_c Stride between the different C matrices.
    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
    /// \param [in] scaling_type Data type of the scaling factors.
    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
                           oneapi::mkl::transpose b_trans, int m, int n, int k,
                           const void *alpha, const void *a, library_data_t a_type,
                           int lda, long long int stride_a, const void *b,
                           library_data_t b_type, int ldb, long long int stride_b,
                           const void *beta, void *c, library_data_t c_type,
                           int ldc, long long int stride_c, int batch_size,
                           library_data_t scaling_type)
    {
    inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
                           int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
                           long long int stride_a, const void * b, library_data_t b_type, int ldb,
                           long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
                           long long int stride_c, int batch_size, library_data_t scaling_type) {
        if (scaling_type == library_data_t::real_float &&
            c_type == library_data_t::complex_float)
        {
@ -2611,20 +2602,18 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_bfloat16, library_data_t::real_float):
        {
            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
                                    oneapi::mkl::bfloat16, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
                beta, c, ldc, stride_c, batch_size);
            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
                batch_size);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_float, library_data_t::real_float):
        {
            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
                                           stride_a, b, ldb, stride_b, beta, c, ldc,
                                           stride_c, batch_size);
            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
                batch_size);
            break;
        }
#endif
@ -2059,8 +2059,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
        const sycl::half alpha_f16 = 1.0f;
        const sycl::half beta_f16 = 0.0f;
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
            *stream, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
            *stream, oneapi::math::transpose::trans,
            oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
            dst_f16.get(), dpct::library_data_t::real_half, ldc,
@ -2097,17 +2097,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
#if !GGML_SYCL_DNNL
        const float alpha = 1.0f;
        const float beta = 0.0f;
# ifdef GGML_SYCL_NVIDIA
        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
            oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
            ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
# else
        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
            *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
            dst_dd_i, ldc)));
# endif
        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
            get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
            src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
            dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
#else
        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
                                  DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
@ -2836,14 +2829,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
            *main_stream, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
            (const char *)src0_as_f16, dpct::library_data_t::real_half,
            nb01 / nb00, nb02 / nb00,
            (const char *)src1_f16, dpct::library_data_t::real_half,
            nb11 / nb10, nb12 / nb10, beta,
            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
            ne12 * ne13, cu_compute_type)));
            *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
            (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
            (const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
            cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type)));
    } else {
        const int ne23 = ne12*ne13;

@ -2878,7 +2867,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
            });
        }
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
            *main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
            *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
            (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
            (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
            (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
@ -367,7 +367,7 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
            sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
                              block_dims),
            [=](sycl::nd_item<3> item_ct1)
                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    l2_norm_f32(x, dst, ncols, eps, item_ct1,
                                nullptr, WARP_SIZE);
                });
@ -389,7 +389,7 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
            sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
                              block_dims),
            [=](sycl::nd_item<3> item_ct1)
                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    l2_norm_f32(x, dst, ncols, eps, item_ct1,
                                get_pointer(s_sum_acc_ct1), work_group_size);
                });
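The only change in the two hunks above is the attribute spelling: the vendor `[[intel::reqd_sub_group_size]]` becomes the standard SYCL 2020 `[[sycl::reqd_sub_group_size]]`. A minimal self-contained kernel using the standard form (toy example; assumes the device supports a sub-group size of 32):

#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    float * data = sycl::malloc_shared<float>(64, q);
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(64), sycl::range<1>(32)),
        [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(32)]] {
            // pinning the compiled sub-group size makes sub-group
            // collectives behave predictably across devices
            data[it.get_global_id(0)] = (float) it.get_sub_group().get_local_id();
        }).wait();
    sycl::free(data, q);
    return 0;
}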
@ -1,8 +1,5 @@
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>
#include "outprod.hpp"

void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
    const ggml_tensor *src0 = dst->src[0];
    const ggml_tensor *src1 = dst->src[1];
@ -34,20 +31,13 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

    // Handle transposition of src1
    const bool src1_T = ggml_is_transposed(src1);
    const oneapi::mkl::transpose src1_op =
        src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
    const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans;
    const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);

    try {
        // Perform matrix multiplication using oneMKL GEMM
#ifdef GGML_SYCL_NVIDIA
        oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream },
                                              oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d,
                                              ne00, src1_d, ldb, beta, dst_d, ne0);
#else
        oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha,
                                              src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
#endif
        // Perform matrix multiplication using oneMath GEMM
        oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op,
                                               ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
    }
    catch (sycl::exception const& exc) {
        std::cerr << exc.what() << std::endl;
@ -59,7 +59,6 @@ if (Vulkan_FOUND)
        set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat2 is supported by glslc")
    else()
        message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")

        set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat2 is supported by glslc")
    endif()
else()
@ -70,6 +69,20 @@ if (Vulkan_FOUND)
        add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    endif()

    # Compile a test shader to determine whether GL_EXT_integer_dot_product is supported.
    # If it's not, there will be an error to stderr.
    # If it's supported, set a define to indicate that we should compile those shaders
    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_integer_dot_support.comp"
                    OUTPUT_VARIABLE glslc_output
                    ERROR_VARIABLE glslc_error)

    if (${glslc_error} MATCHES ".*extension not supported: GL_EXT_integer_dot_product.*")
        message(STATUS "GL_EXT_integer_dot_product not supported by glslc")
    else()
        message(STATUS "GL_EXT_integer_dot_product supported by glslc")
        add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    endif()

    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
@ -234,6 +234,8 @@ struct vk_device_struct {
    bool float_controls_rte_fp16;
    bool subgroup_add;

    bool integer_dot_product;

    bool subgroup_size_control;
    uint32_t subgroup_min_size;
    uint32_t subgroup_max_size;
@ -245,6 +247,12 @@ struct vk_device_struct {
    uint32_t coopmat_m;
    uint32_t coopmat_n;
    uint32_t coopmat_k;

    bool coopmat_int_support;
    uint32_t coopmat_int_m;
    uint32_t coopmat_int_n;
    uint32_t coopmat_int_k;

    bool coopmat2;

    size_t idx;
@ -263,10 +271,10 @@ struct vk_device_struct {
    vk_matmul_pipeline pipeline_matmul_f32_f16 {};
    vk_matmul_pipeline2 pipeline_matmul_f16;
    vk_matmul_pipeline2 pipeline_matmul_f16_f32;
    vk_pipeline pipeline_matmul_split_k_reduce;

    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_COUNT];

    vk_matmul_pipeline pipeline_matmul_id_f32 {};
    vk_matmul_pipeline2 pipeline_matmul_id_f16;
@ -274,6 +282,9 @@ struct vk_device_struct {

    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];

    vk_pipeline pipeline_matmul_split_k_reduce;
    vk_pipeline pipeline_quantize_q8_1;

    vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
@ -640,6 +651,13 @@ struct vk_op_rwkv_wkv7_push_constants {
    uint32_t H;
};

struct vk_op_upscale_push_constants {
    uint32_t ne; uint32_t a_offset; uint32_t d_offset;
    uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
    float sf0; float sf1; float sf2; float sf3;
};

// Allow pre-recording command buffers
struct vk_staging_memcpy {
    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@ -649,13 +667,6 @@ struct vk_staging_memcpy {
    size_t n;
};

struct vk_op_upscale_push_constants {
    uint32_t ne; uint32_t a_offset; uint32_t d_offset;
    uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
    float sf0; float sf1; float sf2; float sf3;
};

struct vk_context_struct {
    vk_submission * s;
    std::vector<vk_sequence> seqs;
@ -1598,6 +1609,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    // mulmat
    std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
        l_warptile_mmq, m_warptile_mmq, s_warptile_mmq,
        l_warptile_mmq_int, m_warptile_mmq_int, s_warptile_mmq_int,
        l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k,
        l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid;
    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms,
@ -1662,6 +1674,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
        m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
        s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 };

        const uint32_t tm_int_l = device->coopmat_int_support ? device->coopmat_int_m : 4;
        const uint32_t tm_int_m = device->coopmat_int_support ? device->coopmat_int_m : 4;
        const uint32_t tm_int_s = device->coopmat_int_support ? device->coopmat_int_m : 2;
        const uint32_t tn_int_l = device->coopmat_int_support ? device->coopmat_int_n : 4;
        const uint32_t tn_int_m = device->coopmat_int_support ? device->coopmat_int_n : 2;
        const uint32_t tn_int_s = device->coopmat_int_support ? device->coopmat_int_n : 2;
        const uint32_t tk_int_l = device->coopmat_int_support ? device->coopmat_int_k : 1;
        const uint32_t tk_int_m = device->coopmat_int_support ? device->coopmat_int_k : 1;
        const uint32_t tk_int_s = device->coopmat_int_support ? device->coopmat_int_k : 1;

        l_warptile_mmq_int = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_int_l, tn_int_l, tk_int_l, subgroup_size_8 };
        m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_int_m, tn_int_m, tk_int_m, subgroup_size_8 };
        s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_int_s, tn_int_s, tk_int_s, subgroup_size_8 };

        l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
        m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
        s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 };
@ -2000,6 +2026,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
    if (device->mul_mat ## ID ## _s[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \

#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
    if (device->mul_mat ## ID ## _l[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
    if (device->mul_mat ## ID ## _m[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
    if (device->mul_mat ## ID ## _s[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \

    // Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
    CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
@ -2031,6 +2065,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (device->integer_dot_product) {
            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        }
#endif

        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
@ -2056,6 +2100,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
#undef CREATE_MM2
#undef CREATE_MMQ
#undef CREATE_MM
    } else {
        // Create 6 variants, {s,m,l}x{unaligned,aligned}
@ -2073,6 +2118,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
    if (device->mul_mat ## ID ## _s[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \

#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
    if (device->mul_mat ## ID ## _l[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
    if (device->mul_mat ## ID ## _m[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
    if (device->mul_mat ## ID ## _s[TYPE]) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \

    CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
    CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
    CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
@ -2099,6 +2152,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
    CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
    CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    if (device->integer_dot_product) {
        CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
    }
#endif

    CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
    CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
    CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
@ -2132,7 +2195,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    uint32_t rm_stdq = 1;
    uint32_t rm_kq = 2;
    if (device->vendor_id == VK_VENDOR_ID_AMD) {
        if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
        if (device->architecture == AMD_GCN) {
            rm_stdq = 2;
            rm_kq = 4;
        }
@ -2266,6 +2329,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);

    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
        if (device->subgroup_add && device->subgroup_require_full_support) {
@ -2452,6 +2516,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
    bool pipeline_robustness = false;
    bool coopmat2_support = false;
    device->coopmat_support = false;
    device->integer_dot_product = false;

    for (const auto& properties : ext_props) {
        if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
@ -2477,6 +2542,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
        } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                   !getenv("GGML_VK_DISABLE_COOPMAT2")) {
            coopmat2_support = true;
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                   !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
            device->integer_dot_product = true;
#endif
        }
    }

@ -2490,6 +2560,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
    vk::PhysicalDeviceVulkan11Properties vk11_props;
    vk::PhysicalDeviceVulkan12Properties vk12_props;
    vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
    vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props;

    props2.pNext = &props3;
    props3.pNext = &subgroup_props;
@ -2524,6 +2595,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
    }
#endif

    if (device->integer_dot_product) {
        last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props;
        last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props;
    }

    device->physical_device.getProperties2(&props2);
    device->properties = props2.properties;
    device->vendor_id = device->properties.vendorID;
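The `last_struct` bookkeeping above is the usual way to append optional structs to a Vulkan `pNext` query chain. A stripped-down sketch of the pattern with plain C types (illustrative only; the real code chains several more structs):

#include <vulkan/vulkan.h>

// Append an extension's property struct to the query chain only when the
// extension was detected, keeping a tail pointer so appends stay O(1).
static void query_props(VkPhysicalDevice phys_dev, bool has_int_dot) {
    VkPhysicalDeviceDriverProperties driver_props = {};
    driver_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES;

    VkPhysicalDeviceShaderIntegerDotProductProperties int_dot_props = {};
    int_dot_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES;

    VkPhysicalDeviceProperties2 props2 = {};
    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
    props2.pNext = &driver_props;

    VkBaseOutStructure * last = (VkBaseOutStructure *) &driver_props;
    if (has_int_dot) {
        last->pNext = (VkBaseOutStructure *) &int_dot_props;
        last = (VkBaseOutStructure *) &int_dot_props;
    }

    vkGetPhysicalDeviceProperties2(phys_dev, &props2);
}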
@ -2570,6 +2646,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device->coopmat_support = false;
    }

    device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated;

    std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();

    // Try to find a non-graphics compute queue and transfer-focused queues
@ -2662,6 +2740,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device_extensions.push_back("VK_KHR_maintenance4");
    }

    VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {};
    shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR;
    if (device->integer_dot_product) {
        last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features;
        last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features;
        device_extensions.push_back("VK_KHR_shader_integer_dot_product");
    }

    vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

    device->fp16 = device->fp16 && vk12_features.shaderFloat16;
@ -2831,6 +2917,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
                    device->coopmat_acc_f16_support = true;
                }
            }
        } else if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eSint8 &&
                   (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eSint8 &&
                   (vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eSint32 &&
                   (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eSint32 &&
                   (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup &&
                   device->coopmat_int_m == 0
        ) {
            device->coopmat_int_support = true;
            device->coopmat_int_m = prop.MSize;
            device->coopmat_int_n = prop.NSize;
            device->coopmat_int_k = prop.KSize;
        }
    }
@ -2935,25 +3032,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    vk::PhysicalDevice physical_device = devices[dev_num];
    std::vector<vk::ExtensionProperties> ext_props = physical_device.enumerateDeviceExtensionProperties();

    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceMaintenance3Properties props3;
    vk::PhysicalDeviceSubgroupProperties subgroup_props;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &props3;
    props3.pNext = &subgroup_props;
    subgroup_props.pNext = &driver_props;
    physical_device.getProperties2(&props2);

    vk_device_architecture arch = get_device_architecture(physical_device);
    uint32_t default_subgroup_size = get_subgroup_size("", arch);
    const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;

    const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;

    bool fp16_storage = false;
    bool fp16_compute = false;
    bool coopmat_support = false;
    bool coopmat2_support = false;
    bool integer_dot_product = false;

    for (auto properties : ext_props) {
        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
@ -2969,27 +3052,44 @@ static void ggml_vk_print_gpu_info(size_t idx) {
        } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                   !getenv("GGML_VK_DISABLE_COOPMAT2")) {
            coopmat2_support = true;
#endif
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                   !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
            integer_dot_product = true;
#endif
        }
    }

    const vk_device_architecture device_architecture = get_device_architecture(physical_device);

    if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) {
        coopmat_support = false;
    }

    const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
    bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;

    bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

    vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures();
    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceMaintenance3Properties props3;
    vk::PhysicalDeviceSubgroupProperties subgroup_props;
    vk::PhysicalDeviceDriverProperties driver_props;
    vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props;
    props2.pNext = &props3;
    props3.pNext = &subgroup_props;
    subgroup_props.pNext = &driver_props;

    // Pointer to the last chain element
    VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props;

    if (integer_dot_product) {
        last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props;
        last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props;
    }

    physical_device.getProperties2(&props2);

    VkPhysicalDeviceFeatures2 device_features2;
    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    device_features2.pNext = nullptr;
    device_features2.features = (VkPhysicalDeviceFeatures)device_features;

    VkPhysicalDeviceVulkan11Features vk11_features;
    vk11_features.pNext = nullptr;
@ -3002,7 +3102,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    vk11_features.pNext = &vk12_features;

    // Pointer to the last chain element
    VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features;
    last_struct = (VkBaseOutStructure *)&vk12_features;

#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
    VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
@ -3014,20 +3114,39 @@ static void ggml_vk_print_gpu_info(size_t idx) {
        last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
        last_struct = (VkBaseOutStructure *)&coopmat_features;
    }
#endif

    VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {};
    shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR;
    if (integer_dot_product) {
        last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features;
        last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features;
    }

    vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);

    fp16 = fp16 && vk12_features.shaderFloat16;

    coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;
    uint32_t default_subgroup_size = get_subgroup_size("", device_architecture);
    const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
    const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;

    integer_dot_product = integer_dot_product
                       && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated
                       && shader_integer_dot_product_features.shaderIntegerDotProduct;

    coopmat_support = coopmat_support
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
                   && coopmat_features.cooperativeMatrix
#endif
                   && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture);

    std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";

    std::string device_name = props2.properties.deviceName.data();
    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n",
    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
                   idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
                   props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str());
                   props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str());

    if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
        GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
@ -3293,6 +3412,17 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
        }
    }

    // MMQ
    if (src1_type == GGML_TYPE_Q8_1) {
        vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc;

        if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
            return nullptr;
        }

        return pipelines;
    }

    if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) {
        return nullptr;
    }
@ -3585,8 +3715,6 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
    return s;
}

static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
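`CEIL_DIV` above converts an element count into a workgroup count via round-up division. For reference, the idiom and its edge cases (illustrative check, not taken from the source):

#include <cassert>

#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

int main() {
    assert(CEIL_DIV(256u, 128u) == 2u);  // exact fit
    assert(CEIL_DIV(257u, 128u) == 3u);  // one extra, partially filled group
    assert(CEIL_DIV(1u,   128u) == 1u);
    return 0;
}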
@@ -4016,8 +4144,8 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
    return split_k;
}

static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type, ggml_type src1_type) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");

    if (ctx->device->coopmat2) {
        // Use large shader when the N dimension is greater than the medium shader's tile size
@@ -4042,9 +4170,9 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
    return aligned ? mmp->a_l : mmp->l;
}

static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type)->align;
static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type, src1_type)->align;
}

static void ggml_vk_matmul(
@@ -4054,7 +4182,7 @@ static void ggml_vk_matmul(
        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
        uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
        uint32_t padded_n) {
    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")");
    ggml_vk_sync_buffers(subctx);
    if (split_k == 1) {
        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
@@ -4072,7 +4200,7 @@ static void ggml_vk_matmul(
    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
}

static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");

    if (ctx->device->coopmat2) {
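The align value returned above feeds ggml_vk_align_size, and the "aligned" pipeline variants are only picked when K needs no padding. A sketch of that arithmetic (the helper name here is hypothetical):

#include <cstdint>

constexpr uint64_t align_size(uint64_t width, uint64_t align) {
    return ((width + align - 1) / align) * align;
}
static_assert(align_size(4096, 32) == 4096, "K already aligned: 'aligned' shader eligible");
static_assert(align_size(4100, 32) == 4128, "kpad != K: use the bounds-checked variant");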
@@ -4214,6 +4342,25 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
}

static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
    switch(type) {
        case GGML_TYPE_Q8_1:
            return ctx->device->pipeline_quantize_q8_1;
        default:
            std::cerr << "Missing quantize pipeline for type: " << ggml_type_name(type) << std::endl;
            GGML_ABORT("fatal error");
    }
}

static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, vk_subbuffer&& in, vk_subbuffer&& out, uint32_t ne) {
    VK_LOG_DEBUG("ggml_vk_quantize_q8_1(" << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ", " << ne << ")");

    vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);

    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 });
}

static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
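Since ggml_type_size(GGML_TYPE_Q8_1) is 36 bytes per 32-element block, the Q8_1 staging buffer written by ggml_vk_quantize_q8_1 is sized as sketched below (the same arithmetic appears in the y_sz computation further down):

#include <cstdint>

constexpr uint64_t QK8_1 = 32;  // block size
constexpr uint64_t TS8_1 = 36;  // 2x fp16 aux + 32 int8 quants

constexpr uint64_t q8_1_bytes(uint64_t ne) {
    return ne / QK8_1 * TS8_1;  // ne must be a multiple of the block size
}
static_assert(q8_1_bytes(512 * 4096) == 2359296, "e.g. one 512x4096 f32 src1 copy");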
@@ -4265,10 +4412,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub

    const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

    vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]);
    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0;

    // Check for mmq first
    vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr;

    if (mmp == nullptr) {
        // Fall back to f16 dequant mul mat
        mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]);
        quantize_y = false;
    }

    const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
    const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

    if (qx_needs_dequant) {
        // Fall back to dequant + f16 mulmat
@@ -4278,13 +4434,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    // Not implemented
    GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT

    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
    const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
    const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type)));
    const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && ne11 > 8;

    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type));

    // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
    uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
    uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11;
    const int x_ne = ne01 * ne00;
    const int y_ne = padded_n * ne10;
    const int d_ne = ne11 * ne01;
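ROUNDUP_POW2 pads N up to the pipeline's workgroup tile so the dequant path can skip bounds checks; a quick check of the arithmetic (assuming wg_denoms[1] is a power of two, which the pipelines guarantee):

#include <cstdint>

constexpr uint32_t roundup_pow2(uint32_t x, uint32_t p) {
    return (x + p - 1) & ~(p - 1);
}
static_assert(roundup_pow2(49, 64) == 64,   "whisper's ne11 = 49 pads to one 64-wide tile");
static_assert(roundup_pow2(512, 64) == 512, "already-aligned sizes are unchanged");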
@@ -4294,11 +4450,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
    const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
    const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
    const uint64_t y_sz = quantize_y ? (y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
    const uint64_t d_sz = sizeof(float) * d_ne;

    vk_pipeline to_fp16_vk_0 = nullptr;
    vk_pipeline to_fp16_vk_1 = nullptr;
    vk_pipeline to_q8_1 = nullptr;

    if (x_non_contig) {
        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
@@ -4313,6 +4470,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

    if (quantize_y) {
        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
    }

    if (dryrun) {
        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
@@ -4326,7 +4487,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
            ctx->prealloc_size_x = x_sz_upd;
        }
        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
        if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
            ctx->prealloc_size_y = y_sz_upd;
        }
        if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
@@ -4341,6 +4502,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        if (qy_needs_dequant) {
            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
        }
        if (quantize_y) {
            ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
        }
        if (split_k > 1) {
            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
        }
@@ -4376,6 +4540,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    if (qy_needs_dequant) {
        d_Y = ctx->prealloc_y;
        GGML_ASSERT(d_Y->size >= y_sz * ne12 * ne13);
    } else if (quantize_y) {
        d_Y = ctx->prealloc_y;
        GGML_ASSERT(d_Y->size >= y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1));
    } else {
        d_Y = d_Qy;
        y_buf_offset = qy_buf_offset;
@@ -4392,6 +4559,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    if (y_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
    }
    if (quantize_y) {
        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
    }

    uint32_t stride_batch_x = ne00*ne01;
    uint32_t stride_batch_y = ne10*ne11;
@@ -4400,7 +4570,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
    }

    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) {
        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
    }
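Pulled out of the function above, the MMQ gate reduces to the predicate below; a sketch (the helper is hypothetical) mirroring the quantize_y condition:

#include <cstdint>

bool should_quantize_y(bool integer_dot_product, bool src1_is_f32,
                       bool src1_contiguous, uint64_t ne10, uint64_t ne11) {
    // f32 activations only, contiguous, and repackable into whole int32 lanes
    return integer_dot_product && src1_is_f32 && src1_contiguous
        && (ne11 * ne10) % 4 == 0;
}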
@@ -6929,6 +7099,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
        }
    }

    if (ctx->device->need_compiles) {
        ggml_vk_load_shaders(ctx->device);
    }

    ggml_pipeline_allocate_descriptor_sets(ctx->device);

    vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -7177,6 +7351,10 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_

    ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);

    if (ctx->device->need_compiles) {
        ggml_vk_load_shaders(ctx->device);
    }

    ggml_pipeline_allocate_descriptor_sets(ctx->device);

    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
@@ -7236,66 +7414,198 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
    free(x_chk);
}

static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
// This does not work without ggml q8_1 quantization support
//
// typedef uint16_t ggml_half;
// typedef uint32_t ggml_half2;
//
// #define QK8_1 32
// typedef struct {
//     union {
//         struct {
//             ggml_half d; // delta
//             ggml_half s; // d * sum(qs[i])
//         } GGML_COMMON_AGGR_S;
//         ggml_half2 ds;
//     } GGML_COMMON_AGGR_U;
//     int8_t qs[QK8_1]; // quants
// } block_q8_1;
//
// static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
//     VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")");
//     GGML_ASSERT(quant == GGML_TYPE_Q8_1);
//
//     const size_t x_sz = sizeof(float) * ne;
//     const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
//     float * x = (float *) malloc(x_sz);
//     block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz);
//     block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz);
//     vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
//     vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
//
//     for (size_t i = 0; i < ne; i++) {
//         x[i] = rand() / (float)RAND_MAX;
//     }
//
//     vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
//
//     ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
//
//     if (ctx->device->need_compiles) {
//         ggml_vk_load_shaders(ctx->device);
//     }
//
//     ggml_pipeline_allocate_descriptor_sets(ctx->device);
//
//     ggml_vk_buffer_write(x_buf, 0, x, x_sz);
//
//     vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
//     ggml_vk_ctx_begin(ctx->device, subctx);
//     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
//     ggml_vk_ctx_end(subctx);
//
//     auto begin = std::chrono::high_resolution_clock::now();
//
//     ggml_vk_submit(subctx, ctx->fence);
//     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
//     ctx->device->device.resetFences({ ctx->fence });
//
//     auto end = std::chrono::high_resolution_clock::now();
//
//     double ms_quant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
//     ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz);
//
//     ggml_vk_quantize_data(x, qx_res, ne, quant);
//
//     int first_err = -1;
//
//     for (size_t i = 0; i < ne / 32; i++) {
//         double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d));
//
//         if (first_err < 0 && error > 0.1) {
//             first_err = i;
//         }
//
//         error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s));
//
//         if (first_err < 0 && error > 0.1) {
//             first_err = i;
//         }
//
//         for (size_t j = 0; j < 32; j++) {
//             uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]);
//
//             if (first_err < 0 && error > 1) {
//                 first_err = i;
//             }
//         }
//     }
//
//     std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl;
//
//     if (first_err != -1) {
//         std::cerr << "first_error = " << first_err << std::endl;
//         std::cerr << "Actual result: " << std::endl << std::endl;
//         std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
//         for (size_t j = 0; j < 32; j++) {
//             std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " ";
//         }
//         std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl;
//         std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
//         for (size_t j = 0; j < 32; j++) {
//             std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " ";
//         }
//         std::cerr << std::endl;
//     }
//
//     ggml_vk_destroy_buffer(x_buf);
//     ggml_vk_destroy_buffer(qx_buf);
//
//     free(x);
//     free(qx);
//     free(qx_res);
// }

static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant, bool mmq = false) {
    VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
    const size_t x_ne = m * k * batch;
    const size_t y_ne = k * n * batch;
    const size_t d_ne = m * n * batch;

    vk_matmul_pipeline2 * pipelines;

    if (mmq) {
        pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1;
    } else {
        pipelines = ctx->device->pipeline_dequant_mul_mat_mat;
    }

    const bool fp16acc = ctx->device->fp16;

    vk_pipeline p;
    std::string shname;
    if (shader_size == 0) {
        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s;
        p = fp16acc ? pipelines[quant].f16acc->a_s : pipelines[quant].f32acc->a_s;
        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
    } else if (shader_size == 1) {
        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m;
        p = fp16acc ? pipelines[quant].f16acc->a_m : pipelines[quant].f32acc->a_m;
        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M";
    } else if (shader_size == 2) {
        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l;
        p = fp16acc ? pipelines[quant].f16acc->a_l : pipelines[quant].f32acc->a_l;
        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L";
    } else {
        GGML_ASSERT(0);
    }

    const size_t kpad = ggml_vk_align_size(k, p->align);
    const size_t kpad = mmq ? 0 : ggml_vk_align_size(k, p->align);

    if (k != kpad) {
    if (mmq || k != kpad) {
        if (shader_size == 0) {
            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s;
            p = fp16acc ? pipelines[quant].f16acc->s : pipelines[quant].f32acc->s;
            shname = std::string(ggml_type_name(quant)) + "_S";
        } else if (shader_size == 1) {
            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m;
            p = fp16acc ? pipelines[quant].f16acc->m : pipelines[quant].f32acc->m;
            shname = std::string(ggml_type_name(quant)) + "_M";
        } else if (shader_size == 2) {
            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l;
            p = fp16acc ? pipelines[quant].f16acc->l : pipelines[quant].f32acc->l;
            shname = std::string(ggml_type_name(quant)) + "_L";
        } else {
            GGML_ASSERT(0);
        }
    }

    if (p == nullptr) {
        std::cerr << "error: no pipeline for ggml_vk_test_dequant_matmul " << ggml_type_name(quant) << std::endl;
        return;
    }

    const size_t x_sz = sizeof(float) * x_ne;
    const size_t y_sz = sizeof(float) * y_ne;
    const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant);
    const size_t qy_sz = mmq ? y_ne * ggml_type_size(GGML_TYPE_Q8_1)/ggml_blck_size(GGML_TYPE_Q8_1) : y_sz;
    const size_t d_sz = sizeof(float) * d_ne;
    float * x = (float *) malloc(x_sz);
    float * y = (float *) malloc(y_sz);
    void * qx = malloc(qx_sz);
    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
    vk_buffer qy_buf = ggml_vk_create_buffer_check(ctx->device, qy_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
    float * d = (float *) malloc(d_sz);
    float * d_chk = (float *) malloc(d_sz);

    for (size_t i = 0; i < x_ne; i++) {
        x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        // x[i] = (i % k == i / k) ? 1.0f : 0.0f;
        // x[i] = i % k;
    }

    ggml_vk_quantize_data(x, qx, x_ne, quant);

    for (size_t i = 0; i < y_ne; i++) {
        // y[i] = rand() / (float)RAND_MAX;
        y[i] = (i % k == i / k) ? 1.0f : 0.0f;
        y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        // y[i] = (i % k == i / k) ? 1.0f : 0.0f;
        // y[i] = i % k;
    }

    ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
@@ -7310,6 +7620,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
        }
    }
    if (mmq) {
        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
    }

    if (ctx->device->need_compiles) {
        ggml_vk_load_shaders(ctx->device);
    }

    ggml_pipeline_allocate_descriptor_sets(ctx->device);

@@ -7318,13 +7635,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
    ggml_vk_ctx_begin(ctx->device, subctx);
    for (size_t i = 0; i < num_it; i++) {
        ggml_vk_matmul(
            ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
            m, n, k,
            k, k, m, k*m, k*n, m*n,
            split_k, batch, batch, batch, 1, 1, n
        );
    if (mmq) {
        for (size_t i = 0; i < num_it; i++) {
            ggml_vk_quantize_q8_1(ctx, subctx, { y_buf, 0, y_sz }, { qy_buf, 0, qy_sz }, y_ne);
            ggml_vk_matmul(
                ctx, subctx, p, { qx_buf, 0, qx_sz }, { qy_buf, 0, qy_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k },
                m, n, k,
                k, k, m, k*m, k*n, m*n,
                split_k, batch, batch, batch, 1, 1, n
            );
        }
    } else {
        for (size_t i = 0; i < num_it; i++) {
            ggml_vk_matmul(
                ctx, subctx, p, { qx_buf, 0, qx_sz }, { y_buf, 0, y_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k },
                m, n, k,
                k, k, m, k*m, k*n, m*n,
                split_k, batch, batch, batch, 1, 1, n
            );
        }
    }
    ggml_vk_ctx_end(subctx);

@@ -7382,7 +7711,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

    double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0);

    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
    std::cerr << "TEST dequant matmul " << shname;
    if (mmq) {
        std::cerr << " mmq";
    }
    std::cerr << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;

    if (avg_err > 0.01 || std::isnan(avg_err)) {
        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
@@ -7392,6 +7725,12 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
        std::cerr << "Expected result: " << std::endl << std::endl;
        ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);

        std::cerr << "src0: " << std::endl << std::endl;
        ggml_vk_print_matrix_area(x, GGML_TYPE_F32, k, m, first_err_m, first_err_n, first_err_b);
        std::cerr << std::endl;
        std::cerr << "src1: " << std::endl << std::endl;
        ggml_vk_print_matrix_area(y, GGML_TYPE_F32, k, n, first_err_m, first_err_n, first_err_b);

        if (split_k > 1) {
            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
@@ -7414,6 +7753,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

    ggml_vk_destroy_buffer(qx_buf);
    ggml_vk_destroy_buffer(y_buf);
    ggml_vk_destroy_buffer(qy_buf);
    ggml_vk_destroy_buffer(d_buf);

    free(x);
@@ -7446,7 +7786,25 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
        128, 49, 49,
        4096, 49, 4096,
    };
    const size_t num_it = 100;
    const size_t num_it = 1;

    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0);

    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0, true);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0, true);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0, true);

    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0);

    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0, true);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0, true);
    ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0, true);

    abort();

    for (size_t i = 0; i < vals.size(); i += 3) {
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
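The TFLOPS figure printed by the test counts 2*m*n*k fused multiply-adds per batch element per iteration; worked through for the 4096x512x4096, batch=2 case above (the timing value here is hypothetical):

#include <cstdio>

int main() {
    const double m = 4096, n = 512, k = 4096, batch = 2, num_it = 1;
    const double time_ms = 10.0; // example measurement, not a benchmark result
    const double tflops = 2.0 * m * n * k * batch * num_it / (time_ms / 1000.0) / 1e12;
    std::printf("%.2f TFLOPS\n", tflops); // ~3.44 for a 10 ms run
    return 0;
}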
@@ -9258,7 +9616,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    }

    if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
        const float *params = (const float *)tensor->op_params;
        const float * params = (const float *)tensor->op_params;
        tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]);
    } else if (tensor->op == GGML_OP_MUL_MAT) {
        tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]);
@@ -9275,7 +9633,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_UPSCALE) {
        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_SCALE) {
        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], ((float *)tensor->op_params)[0]);
        const float * params = (const float *)tensor->op_params;
        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
    } else if (tensor->op == GGML_OP_SQR) {
        tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
    } else if (tensor->op == GGML_OP_SIN) {
@@ -9283,7 +9642,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_COS) {
        tensor_clone = ggml_cos(ggml_ctx, src_clone[0]);
    } else if (tensor->op == GGML_OP_CLAMP) {
        tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
        const float * params = (const float *)tensor->op_params;
        tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
    } else if (tensor->op == GGML_OP_PAD) {
        tensor_clone = ggml_pad(ggml_ctx, src_clone[0], tensor->ne[0] - src_clone[0]->ne[0], tensor->ne[1] - src_clone[0]->ne[1], tensor->ne[2] - src_clone[0]->ne[2], tensor->ne[3] - src_clone[0]->ne[3]);
    } else if (tensor->op == GGML_OP_REPEAT) {
@@ -9297,7 +9657,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_NORM) {
        tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
    } else if (tensor->op == GGML_OP_GROUP_NORM) {
        tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], *(int *)tensor->op_params, ((float *)tensor->op_params)[1]);
        const float * float_params = (const float *)tensor->op_params;
        tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
    } else if (tensor->op == GGML_OP_RMS_NORM) {
        tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
    } else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
@@ -9310,14 +9671,15 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
        tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
    } else if (tensor->op == GGML_OP_SOFT_MAX) {
        if (src1 != nullptr) {
            tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
            const float * params = (const float *)tensor->op_params;
            tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
        } else {
            tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
        }
    } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
        tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
    } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], *(int *)tensor->op_params);
        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
    } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
        const int n_dims = ((int32_t *) tensor->op_params)[1];
        const int mode = ((int32_t *) tensor->op_params)[2];
@@ -212,7 +212,7 @@ void main() {
#else
    ACC_TYPE sums[WMITER * TM * WNITER * TN];
    FLOAT_TYPE cache_a[WMITER * TM];
    FLOAT_TYPE cache_b[WNITER * TN];
    FLOAT_TYPE cache_b[TN];

    [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
        sums[i] = ACC_TYPE(0.0f);
@@ -744,16 +744,14 @@ void main() {
        }
        [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
            [[unroll]] for (uint j = 0; j < TN; j++) {
                cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
                cache_b[j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
            }
        }

        [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
        [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
            [[unroll]] for (uint cc = 0; cc < TN; cc++) {
                [[unroll]] for (uint cr = 0; cr < TM; cr++) {
                    const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
                    sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]);
                    sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[cc]), sums[sums_idx]);
                }
            }
        }
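The net effect of the mul_mm.comp change above: the per-thread B cache shrinks from WNITER*TN to TN entries, since each wsic iteration now loads and consumes its own slice. A sketch with illustrative values (the real ones are spec constants):

constexpr unsigned TN = 2, WNITER = 2;           // illustrative values
constexpr unsigned cache_b_before = WNITER * TN; // 4 values live at once
constexpr unsigned cache_b_after  = TN;          // 2 values, reloaded per wsic
static_assert(cache_b_after < cache_b_before, "fewer live registers per invocation");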
444
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
Normal file
@@ -0,0 +1,444 @@
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require

#extension GL_EXT_integer_dot_product : require

#ifdef FLOAT16
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#endif

#ifdef COOPMAT
#extension GL_KHR_cooperative_matrix : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_shader_subgroup_basic : enable
#endif

#ifdef MUL_MAT_ID
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#endif

#include "types.comp"

layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
#if defined(A_TYPE_PACKED32)
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
#endif
layout (binding = 1) readonly buffer B {block_q8_1_packed32 data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
#endif

layout (push_constant) uniform parameter
{
    uint M;
    uint N;
    uint K;
    uint stride_a;
    uint stride_b;
    uint stride_d;

    uint batch_stride_a;
    uint batch_stride_b;
    uint batch_stride_d;

#ifdef MUL_MAT_ID
    uint nei0;
    uint nei1;
    uint nbi1;
    uint ne11;
#else
    uint k_split;
    uint ne02;
    uint ne12;
    uint broadcast2;
    uint broadcast3;
#endif
} p;

layout (constant_id = 0) const uint BLOCK_SIZE = 64;
layout (constant_id = 1) const uint BM = 64;
layout (constant_id = 2) const uint BN = 64;
// layout (constant_id = 3) const uint BK = 32;
layout (constant_id = 4) const uint WM = 32;
layout (constant_id = 5) const uint WN = 32;
layout (constant_id = 6) const uint WMITER = 2;
layout (constant_id = 7) const uint TM = 4;
layout (constant_id = 8) const uint TN = 2;
layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat
layout (constant_id = 10) const uint WARP = 32;

#define BK 32

#ifdef COOPMAT
#define SHMEM_STRIDE (BK / 4 + 4)
#else
#define SHMEM_STRIDE (BK / 4 + 1)
#endif

shared int32_t buf_a_qs[BM * SHMEM_STRIDE];

#ifndef COOPMAT
#if QUANT_AUXF == 1
shared FLOAT_TYPE buf_a_dm[BM];
#else
shared FLOAT_TYPE_VEC2 buf_a_dm[BM];
#endif
#endif

shared int32_t buf_b_qs[BN * SHMEM_STRIDE];
#ifndef COOPMAT
shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
#endif

#define LOAD_VEC_A (4 * QUANT_R)
#define LOAD_VEC_B 4

#ifdef MUL_MAT_ID
shared u16vec2 row_ids[3072];
#endif // MUL_MAT_ID

#define NUM_WARPS (BLOCK_SIZE / WARP)

#ifdef COOPMAT
shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
#endif

#include "mul_mmq_funcs.comp"
void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
    init_iq_shmem(gl_WorkGroupSize);
#endif

#ifdef MUL_MAT_ID
    const uint expert_idx = gl_GlobalInvocationID.z;
#else
    const uint batch_idx = gl_GlobalInvocationID.z;

    const uint i13 = batch_idx / p.ne12;
    const uint i12 = batch_idx % p.ne12;

    const uint i03 = i13 / p.broadcast3;
    const uint i02 = i12 / p.broadcast2;

    const uint batch_idx_a = i03 * p.ne02 + i02;
#endif

    const uint blocks_m = (p.M + BM - 1) / BM;
    const uint ir = gl_WorkGroupID.x % blocks_m;
    const uint ik = gl_WorkGroupID.x / blocks_m;
    const uint ic = gl_WorkGroupID.y;

    const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
    const uint WSUBM = WM / WMITER;
    const uint WSUBN = WN / WNITER;

#ifdef COOPMAT
    const uint warp_i = gl_SubgroupID;

    const uint tiw = gl_SubgroupInvocationID;

    const uint cms_per_row = WM / TM;
    const uint cms_per_col = WN / TN;

    const uint storestride = WARP / TM;
    const uint store_r = tiw % TM;
    const uint store_c = tiw / TM;
#else
    const uint warp_i = gl_LocalInvocationID.x / WARP;

    const uint tiw = gl_LocalInvocationID.x % WARP;

    const uint tiwr = tiw % (WSUBM / TM);
    const uint tiwc = tiw / (WSUBM / TM);
#endif

    const uint warp_r = warp_i % (BM / WM);
    const uint warp_c = warp_i / (BM / WM);

    const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A);
    const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A);
    const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B);
    const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B);

    const uint loadstride_a = BLOCK_SIZE * LOAD_VEC_A / BK;
    const uint loadstride_b = BLOCK_SIZE * LOAD_VEC_B / BK;

#ifdef MUL_MAT_ID
    uint _ne1 = 0;
    for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
        for (uint ii0 = 0; ii0 < p.nei0; ii0++) {
            if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
                row_ids[_ne1] = u16vec2(ii0, ii1);
                _ne1++;
            }
        }
    }

    barrier();

    // Workgroup has no work
    if (ic * BN >= _ne1) return;
#endif

#ifdef MUL_MAT_ID
    const uint start_k = 0;
    const uint end_k = p.K;
#else
    const uint start_k = ik * p.k_split;
    const uint end_k = min(p.K, (ik + 1) * p.k_split);
#endif

    uint pos_a_ib = (
#ifdef MUL_MAT_ID
        expert_idx * p.batch_stride_a +
#else
        batch_idx_a * p.batch_stride_a +
#endif
        ir * BM * p.stride_a + start_k) / BK;
#ifdef MUL_MAT_ID
    uint pos_b_ib = 0;
#else
    uint pos_b_ib = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / BK;
#endif

#ifdef COOPMAT
    coopmat<int8_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a;
    coopmat<int8_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
    coopmat<int32_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_result;

    coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> factors[cms_per_row * cms_per_col];

    coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];

    [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
        sums[i] = coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0f);
    }
#else
    int32_t cache_a_qs[WMITER * TM * BK / 4];

    int32_t cache_b_qs[TN * BK / 4];

    ACC_TYPE sums[WMITER * TM * WNITER * TN];

    [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
        sums[i] = ACC_TYPE(0.0f);
    }
#endif

#if QUANT_AUXF == 1
    FLOAT_TYPE cache_a_dm[TM];
#else
    FLOAT_TYPE_VEC2 cache_a_dm[TM];
#endif

    FLOAT_TYPE_VEC2 cache_b_ds[TN];

    for (uint block = start_k; block < end_k; block += BK) {
        [[unroll]] for (uint l = 0; loadc_a + l < BM; l += loadstride_a) {
            const uint ib = pos_a_ib + (loadc_a + l) * p.stride_a / BK;
            const uint iqs = loadr_a;
            const uint buf_ib = loadc_a + l;

            // Should ds be gated to a single thread?
            if (iqs == 0) {
#if QUANT_AUXF == 1
                buf_a_dm[buf_ib] = get_d(ib);
#else
                buf_a_dm[buf_ib] = get_dm(ib);
#endif
            }
#if QUANT_R == 1
            buf_a_qs[buf_ib * SHMEM_STRIDE + iqs] = repack(ib, iqs);
#else
            const i32vec2 vals = repack(ib, iqs);
            buf_a_qs[buf_ib * SHMEM_STRIDE + iqs    ] = vals.x;
            buf_a_qs[buf_ib * SHMEM_STRIDE + iqs + 4] = vals.y;
#endif
        }
        [[unroll]] for (uint l = 0; loadc_b + l < BN; l += loadstride_b) {
#ifdef MUL_MAT_ID
            const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l];
            const uint idx = pos_b_ib + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
            const uint ib = idx / 8;
            const uint iqs = idx & 0x7;
#else
            const uint ib = pos_b_ib + (loadc_b + l) * p.stride_b / BK;
            const uint iqs = loadr_b;
#endif

            const uint buf_ib = loadc_b + l;

            // Should ds be gated to a single thread?
            if (iqs == 0) {
                buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib].ds);
            }
            buf_b_qs[buf_ib * SHMEM_STRIDE + iqs] = data_b[ib].qs[iqs];
        }

        barrier();

        pos_a_ib += 1;
        pos_b_ib += 1;

#ifdef COOPMAT
        [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
            const uint ib_a = warp_r * WM + cm_row * TM;
            // Load from shared into cache
            coopMatLoad(cache_a, buf_a_qs, ib_a * SHMEM_STRIDE, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);

            // TODO: only cache values that are actually needed
            [[unroll]] for (uint t_idx = 0; t_idx < TM; t_idx++) {
                cache_a_dm[t_idx] = buf_a_dm[ib_a + t_idx];
            }

            [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
                const uint ib_b = warp_c * WN + cm_col * TN;
                coopMatLoad(cache_b, buf_b_qs, ib_b * SHMEM_STRIDE, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);

                // TODO: only cache values that are actually needed
                [[unroll]] for (uint t_idx = 0; t_idx < TN; t_idx++) {
                    cache_b_dm[t_idx] = buf_b_d[ib_b + t_idx];
                }

                cm_result = coopmat<int32_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0);
                cm_result = coopMatMulAdd(cache_a, cache_b, cm_result);

                [[unroll]] for (uint col = 0; col < TN; col += storestride) {
                    coopmat_stage[warp_i * TM * TN + (store_c + col) * TM + store_r] = ACC_TYPE(float(cache_a_d[store_r]) * float(cache_b_d[store_c + col]));
                }

                coopMatLoad(factors, coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
                sums[cm_col * cms_per_row + cm_row] += factors * coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(cm_result);
            }
        }
#else
        // Load from shared into cache
        [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
            [[unroll]] for (uint cr = 0; cr < TM; cr++) {
                const uint ib = warp_r * WM + wsir * WSUBM + tiwr * TM + cr;
                cache_a_dm[wsir * TM + cr] = buf_a_dm[ib];
                [[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
                    cache_a_qs[(wsir * TM + cr) * (BK / 4) + idx_k] = buf_a_qs[ib * SHMEM_STRIDE + idx_k];
                }
            }
        }

        [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
            [[unroll]] for (uint cc = 0; cc < TN; cc++) {
                const uint ib = warp_c * WN + wsic * WSUBN + tiwc * TN + cc;
                cache_b_ds[cc] = buf_b_ds[ib];
                [[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
                    cache_b_qs[cc * (BK / 4) + idx_k] = buf_b_qs[ib * SHMEM_STRIDE + idx_k];
                }
            }

            [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
                [[unroll]] for (uint cc = 0; cc < TN; cc++) {
                    [[unroll]] for (uint cr = 0; cr < TM; cr++) {
                        const uint cache_a_idx = wsir * TM + cr;
                        const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
                        int32_t q_sum = 0;
                        [[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
                            q_sum += dotPacked4x8EXT(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
                                                     cache_b_qs[cc * (BK / 4) + idx_k]);
                        }

                        sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc]);
                    }
                }
            }
        }
#endif

        barrier();
    }

    const uint dr = ir * BM + warp_r * WM;
    const uint dc = ic * BN + warp_c * WN;

#ifndef MUL_MAT_ID
    const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
#endif

#ifdef COOPMAT
#ifdef MUL_MAT_ID
    [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
        [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
            coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);

            [[unroll]] for (uint col = 0; col < BN; col += storestride) {
                const uint row_i = dc + cm_col * TN + col + store_c;
                if (row_i >= _ne1) break;

                const u16vec2 row_idx = row_ids[row_i];

                data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
            }
        }
    }
#else
    const bool is_aligned = p.stride_d % 4 == 0; // Assumption: D_TYPE == float

    [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
        [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
            const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N;

            if (is_aligned && is_in_bounds) {
                // Full coopMat is within bounds and stride_d is aligned with 16B
                coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_dtype = coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(sums[cm_col * cms_per_row + cm_row]);
                coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor);
            } else if (is_in_bounds) {
                // Full coopMat is within bounds, but stride_d is not aligned
                coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);

                [[unroll]] for (uint col = 0; col < TN; col += storestride) {
                    data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
                }
            } else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) {
                // Partial coopMat is within bounds
                coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);

                [[unroll]] for (uint col = 0; col < TN; col += storestride) {
                    if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) {
                        data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
                    }
                }
            }
        }
    }
#endif // MUL_MAT_ID
#else
    [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
        [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {

            const uint dr_warp = dr + wsir * WSUBM + tiwr * TM;
            const uint dc_warp = dc + wsic * WSUBN + tiwc * TN;
            [[unroll]] for (uint cc = 0; cc < TN; cc++) {
#ifdef MUL_MAT_ID
                const uint row_i = dc_warp + cc;
                if (row_i >= _ne1) break;

                const u16vec2 row_idx = row_ids[row_i];
#endif // MUL_MAT_ID
                [[unroll]] for (uint cr = 0; cr < TM; cr++) {
#ifdef MUL_MAT_ID
                    data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
#else
                    if (dr_warp + cr < p.M && dc_warp + cc < p.N) {
                        data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
                    }
#endif // MUL_MAT_ID
                }
            }
        }
    }
#endif // COOPMAT
}
99
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp
Normal file
@@ -0,0 +1,99 @@
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require

#include "types.comp"

// Each iqs value maps to a 32-bit integer

#if defined(DATA_A_Q4_0)
i32vec2 repack(uint ib, uint iqs) {
    // Use 2-byte loads since a q4_0 block (18 bytes) is not divisible by 4
    const u16vec2 quants = u16vec2(data_a[ib].qs[iqs * 2    ],
                                   data_a[ib].qs[iqs * 2 + 1]);
    const uint32_t vui = pack32(quants);
    return i32vec2( vui       & 0x0F0F0F0F,
                   (vui >> 4) & 0x0F0F0F0F);
}

ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
    return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0 * dsb.y));
}
#endif

#if defined(DATA_A_Q4_1)
i32vec2 repack(uint ib, uint iqs) {
    // Use 4-byte loads since a q4_1 block (20 bytes) is divisible by 4
    const uint32_t vui = data_a_packed32[ib].qs[iqs];
    return i32vec2( vui       & 0x0F0F0F0F,
                   (vui >> 4) & 0x0F0F0F0F);
}

ACC_TYPE mul_q8_1(int32_t q_sum, vec2 dma, vec2 dsb) {
    return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y);
}
#endif

#if defined(DATA_A_Q5_0)
i32vec2 repack(uint ib, uint iqs) {
    // Use 2-byte loads since a q5_0 block (22 bytes) is not divisible by 4
    const u16vec2 quants = u16vec2(data_a[ib].qs[iqs * 2    ],
                                   data_a[ib].qs[iqs * 2 + 1]);
    const uint32_t vui = pack32(quants);
    const int32_t qh = int32_t((uint32_t(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]) >> (4 * iqs));
    const int32_t v0 = int32_t(vui & 0x0F0F0F0F)
                       | ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)

    const int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F)
                       | (((qh >> 16) & 0xF) * 0x02040810) & 0x10101010; // (16,17,18,19) -> (4,12,20,28)

    return i32vec2(v0, v1);
}

ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
    return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0 * dsb.y));
}
#endif

#if defined(DATA_A_Q5_1)
i32vec2 repack(uint ib, uint iqs) {
    // Use 4-byte loads since a q5_1 block (24 bytes) is divisible by 4
    const uint32_t vui = data_a_packed32[ib].qs[iqs];
    const int32_t qh = int32_t(data_a_packed32[ib].qh >> (4 * iqs));
    const int32_t v0 = int32_t(vui & 0x0F0F0F0F)
                       | ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)

    const int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F)
                       | (((qh >> 16) & 0xF) * 0x02040810) & 0x10101010; // (16,17,18,19) -> (4,12,20,28)

    return i32vec2(v0, v1);
}

ACC_TYPE mul_q8_1(int32_t q_sum, vec2 dma, vec2 dsb) {
    return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y);
}
#endif

#if defined(DATA_A_Q8_0)
int32_t repack(uint ib, uint iqs) {
    // Use 2-byte loads since a q8_0 block (34 bytes) is not divisible by 4
    return pack32(i16vec2(data_a[ib].qs[iqs * 2    ],
                          data_a[ib].qs[iqs * 2 + 1]));
}

ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
    return ACC_TYPE(float(q_sum) * da * dsb.x);
}
#endif

#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
FLOAT_TYPE get_d(uint ib) {
    return FLOAT_TYPE(data_a[ib].d);
}
#endif

#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
FLOAT_TYPE_VEC2 get_dm(uint ib) {
    return FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
}
#endif
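The `* 0x02040810` in the q5 repack functions above moves the four qh bits into bit 4 of each packed byte (positions 4, 12, 20, 28) with a single multiply; a quick compile-time check:

#include <cstdint>

constexpr uint32_t spread_qh(uint32_t qh4) {   // low 4 bits of qh
    return (qh4 * 0x02040810u) & 0x10101010u;
}
static_assert(spread_qh(0x1) == 0x00000010u, "bit 0 -> bit 4  (byte 0)");
static_assert(spread_qh(0x2) == 0x00001000u, "bit 1 -> bit 12 (byte 1)");
static_assert(spread_qh(0xF) == 0x10101010u, "all four bits land in their bytes");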
77
ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp
Normal file
@@ -0,0 +1,77 @@
#version 450

#extension GL_EXT_control_flow_attributes : require
#extension GL_EXT_shader_16bit_storage : require

layout (push_constant) uniform parameter
{
    uint ne;
} p;

#include "types.comp"

layout(constant_id = 0) const uint GROUP_SIZE = 32;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {vec4 data_a[];};
layout (binding = 1) writeonly buffer D {block_q8_1_packed32 data_b[];};

shared float shmem[GROUP_SIZE];

void quantize() {
    const uint wgid = gl_WorkGroupID.x;
    const uint tid = gl_LocalInvocationID.x;

    // Each thread handles a vec4, so 8 threads handle a block
    const uint blocks_per_group = GROUP_SIZE / 8;

    const uint block_in_wg = tid / 8;

    const uint ib = wgid * blocks_per_group + block_in_wg;
    const uint iqs = tid % 8;

    if (ib >= gl_NumWorkGroups.x * blocks_per_group) {
        return;
    }

    const uint a_idx = ib * 8 + iqs;

    vec4 vals = a_idx < p.ne ? data_a[a_idx] : vec4(0.0f);
    const vec4 abs_vals = abs(vals);

    // Find absolute max for each block
    shmem[tid] = max(max(abs_vals.x, abs_vals.y), max(abs_vals.z, abs_vals.w));
    barrier();
    [[unroll]] for (uint s = 4; s > 0; s >>= 1) {
        if (iqs < s) {
            shmem[tid] = max(shmem[tid], shmem[tid + s]);
        }
        barrier();
    }

    const float amax = shmem[block_in_wg * 8];
    const float d = amax / 127.0;
    const float d_inv = d != 0.0 ? 1.0 / d : 0.0;
    vals = round(vals * d_inv);
    data_b[ib].qs[iqs] = pack32(i8vec4(round(vals)));
    barrier();

    // Calculate the sum for each block
    shmem[tid] = vals.x + vals.y + vals.z + vals.w;
    barrier();
    [[unroll]] for (uint s = 4; s > 0; s >>= 1) {
        if (iqs < s) {
            shmem[tid] += shmem[tid + s];
        }
        barrier();
    }
    if (iqs == 0) {
        const float sum = shmem[tid];

        data_b[ib].ds = f16vec2(vec2(d, sum * d));
    }
}

void main() {
    quantize();
}
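A scalar reference for the shader above, useful for checking one block by hand: d = amax/127, quants are round(x/d), and ds additionally carries d * sum(quants):

#include <cmath>
#include <cstdint>

struct q8_1_ref { float d, s; int8_t qs[32]; }; // f32 stand-in for the f16vec2 ds

q8_1_ref quantize_block_q8_1(const float * x) {
    q8_1_ref b{};
    float amax = 0.0f;
    for (int i = 0; i < 32; ++i) amax = std::fmax(amax, std::fabs(x[i]));
    const float d = amax / 127.0f;
    const float d_inv = d != 0.0f ? 1.0f / d : 0.0f;
    float sum = 0.0f;
    for (int i = 0; i < 32; ++i) {
        const float q = std::round(x[i] * d_inv);
        b.qs[i] = (int8_t)q;
        sum += q;
    }
    b.d = d;
    b.s = sum * d;
    return b;
}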
@ -0,0 +1,7 @@
#version 460

#extension GL_EXT_integer_dot_product : require

void main()
{
}
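This deliberately empty shader looks like a build-time probe: if the installed glslc can compile it, the build can define `GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT`, which gates the mul_mmq shader variants in the generator below. A rough sketch of such a probe (an assumption about the build logic — the real mechanism and file name may differ, and `glslc_supports_integer_dot` is hypothetical):

```cpp
#include <cstdlib>

// Hypothetical configure-time check: compile the 7-line stub with glslc;
// exit status 0 means the compiler understands GL_EXT_integer_dot_product.
bool glslc_supports_integer_dot() {
    return std::system("glslc -fshader-stage=compute "
                       "test_integer_dot.comp -o /dev/null") == 0;
}
```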
@ -1,4 +1,3 @@

#if !defined(GGML_TYPES_COMP)
#define GGML_TYPES_COMP

@ -51,6 +50,7 @@ struct block_q4_0_packed16
#if defined(DATA_A_Q4_0)
#define QUANT_K QUANT_K_Q4_0
#define QUANT_R QUANT_R_Q4_0
#define QUANT_AUXF 1
#define A_TYPE block_q4_0
#define A_TYPE_PACKED16 block_q4_0_packed16
#endif
@ -72,11 +72,19 @@ struct block_q4_1_packed16
    uint16_t qs[16/2];
};

struct block_q4_1_packed32
{
    f16vec2 dm;
    uint32_t qs[16/4];
};

#if defined(DATA_A_Q4_1)
#define QUANT_K QUANT_K_Q4_1
#define QUANT_R QUANT_R_Q4_1
#define QUANT_AUXF 2
#define A_TYPE block_q4_1
#define A_TYPE_PACKED16 block_q4_1_packed16
#define A_TYPE_PACKED32 block_q4_1_packed32
#endif

#define QUANT_K_Q5_0 32
@ -99,6 +107,7 @@ struct block_q5_0_packed16
#if defined(DATA_A_Q5_0)
#define QUANT_K QUANT_K_Q5_0
#define QUANT_R QUANT_R_Q5_0
#define QUANT_AUXF 1
#define A_TYPE block_q5_0
#define A_TYPE_PACKED16 block_q5_0_packed16
#endif
@ -122,11 +131,20 @@ struct block_q5_1_packed16
    uint16_t qs[16/2];
};

struct block_q5_1_packed32
{
    f16vec2 dm;
    uint qh;
    uint32_t qs[16/4];
};

#if defined(DATA_A_Q5_1)
#define QUANT_K QUANT_K_Q5_1
#define QUANT_R QUANT_R_Q5_1
#define QUANT_AUXF 2
#define A_TYPE block_q5_1
#define A_TYPE_PACKED16 block_q5_1_packed16
#define A_TYPE_PACKED32 block_q5_1_packed32
#endif

#define QUANT_K_Q8_0 32
@ -142,14 +160,40 @@ struct block_q8_0_packed16
    float16_t d;
    int16_t qs[32/2];
};
struct block_q8_0_packed32
{
    float16_t d;
    int32_t qs[32/4];
};

#if defined(DATA_A_Q8_0)
#define QUANT_K QUANT_K_Q8_0
#define QUANT_R QUANT_R_Q8_0
#define QUANT_AUXF 1
#define A_TYPE block_q8_0
#define A_TYPE_PACKED16 block_q8_0_packed16
#define A_TYPE_PACKED32 block_q8_0_packed32
#endif

#define QUANT_K_Q8_1 32
#define QUANT_R_Q8_1 1

struct block_q8_1
{
    f16vec2 ds;
    int8_t qs[32];
};
struct block_q8_1_packed16
{
    f16vec2 ds;
    int16_t qs[16];
};
struct block_q8_1_packed32
{
    f16vec2 ds;
    int32_t qs[8];
};

// K-quants
#define QUANT_K_Q2_K 256
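The three q8_1 declarations above are the same 36-byte block exposed at 8-, 16- and 32-bit access widths. A quick host-side sanity check of the layout (a sketch assuming `f16vec2` maps to two 16-bit halves; plain C++ stand-ins, not part of the shaders):

```cpp
#include <cstdint>

// Host-side mirrors of the GLSL q8_1 block layouts (f16vec2 -> two uint16_t).
struct q8_1_bytes  { uint16_t ds[2]; int8_t  qs[32]; };
struct q8_1_pack16 { uint16_t ds[2]; int16_t qs[16]; };
struct q8_1_pack32 { uint16_t ds[2]; int32_t qs[8];  };

static_assert(sizeof(q8_1_bytes)  == 36, "4-byte ds + 32 quants");
static_assert(sizeof(q8_1_pack16) == 36, "same bytes, 16-bit view");
static_assert(sizeof(q8_1_pack32) == 36, "same bytes, 32-bit view");
```

Contrast with q8_0: its 2 + 32 = 34-byte block is not divisible by 4 — the constraint the `repack` comment in the mul_mmq functions works around with 2-byte loads — whereas q8_1's extra 2-byte `sum*d` field rounds the block up to 36 bytes and permits the 32-bit stores used by `quantize_q8_1.comp`.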
@ -295,7 +295,10 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
    std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
    std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";

    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}};
    std::map<std::string, std::string> base_dict = {
        {"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"},
        {"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2"},
    };
    std::string shader_name = "matmul";

    if (matmul_id) {
@ -313,9 +316,7 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
        base_dict["COOPMAT"] = "1";
    }

    base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";

    std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";
    const std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";

    // Shaders with f16 B_TYPE
    string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
@ -339,14 +340,20 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool

        // don't generate f32 variants for coopmat2
        if (!coopmat2) {
            string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
            string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
        }

        if (tname != "f16" && tname != "f32") {
            string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
            string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
        }

#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (!coopmat && !coopmat2 && !matmul_id && (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "q8_0")) {
            string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
        }
#endif
    }
}

@ -458,6 +465,7 @@ void process_shaders() {
    string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});

    string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
    string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {});

    string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
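Each `string_to_spv` call above hands `merge_maps(base_dict, {...})` to the compiler as the macro set for one shader variant. A minimal sketch of that composition (assuming `merge_maps` copies the first map and inserts the second, so keys already present in `base_dict` win on collision — the generator's actual helper may differ):

```cpp
#include <map>
#include <string>

using dict = std::map<std::string, std::string>;

// Combine shared defines with per-variant ones; std::map::insert keeps
// existing keys, so entries from `base` take precedence on collisions.
dict merge_maps(const dict & base, const dict & extra) {
    dict result = base;
    result.insert(extra.begin(), extra.end());
    return result;
}
```

Under that assumption, the q8_1 MMQ variant registered above is compiled with the shared `FLOAT_TYPE`/`FLOAT_TYPE_VEC2`/`ACC_TYPE` defines plus its own `DATA_A_*` and `D_TYPE` entries.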
@ -932,6 +932,7 @@ static void gguf_check_reserved_keys(const std::string & key, const T val) {
        if constexpr (std::is_same<T, uint32_t>::value) {
            GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
        } else {
            GGML_UNUSED(val);
            GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
        }
    }
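The new assert leans on the classic bit trick: for `val > 0`, `val & (val - 1)` clears the lowest set bit, so the expression is zero exactly when a single bit is set. A tiny standalone illustration (hypothetical, not from the commit):

```cpp
#include <cstdint>

constexpr bool is_pow2(uint32_t val) {
    return val > 0 && (val & (val - 1)) == 0;
}

static_assert( is_pow2(32));  // a typical GGUF alignment
static_assert(!is_pow2(24));  // 24 & 23 = 0b11000 & 0b10111 = 16
static_assert(!is_pow2(0));
```

The trailing `&& GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2"` is the usual assert-message idiom: the concatenated string literal is non-null and therefore truthy, so it never changes the condition and only surfaces the text when the assertion fires.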
@ -12,6 +12,15 @@ from coremltools.models.neural_network.quantization_utils import quantize_weight
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
from whisper import load_model

# Disable PyTorch Scaled Dot-Product Attention (SDPA) to avoid compatibility issues.
# The Whisper implementation expects a specific behavior from
# torch.nn.functional.scaled_dot_product_attention that differs between PyTorch
# versions. Setting use_sdpa=False forces Whisper to use its manual attention
# implementation instead, which is more stable across different PyTorch versions
# (2.5.0 required by coremltools vs newer versions).
import whisper.model
whisper.model.MultiHeadAttention.use_sdpa = False

# Use for changing dim of input in encoder and decoder embeddings
def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
                         missing_keys, unexpected_keys, error_msgs):
@ -260,10 +269,11 @@ def convert_decoder(hparams, model, quantize=False):
    model.eval()

    tokens_shape = (1, 1)
    audio_shape = (1, hparams.n_audio_state, 1, 1500)
    audio_shape = (1, hparams.n_audio_ctx, hparams.n_audio_state)

    audio_data = torch.randn(audio_shape)
    token_data = torch.randint(50257, tokens_shape).long()
    token_data = torch.randint(hparams.n_vocab, tokens_shape).long()

    traced_model = torch.jit.trace(model, (token_data, audio_data))

    model = ct.convert(
|
@ -5,6 +5,8 @@
|
||||
# - src/coreml/whisper-decoder-impl.h and src/coreml/whisper-decoder-impl.m
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
wd=$(dirname "$0")
|
||||
cd "$wd/../" || exit
|
||||
|
||||
|
@ -1,4 +1,4 @@
## M1 Pro
## M1 Pro (old 22c96b4)

make -j && ./scripts/bench-all.sh 8

@ -67,202 +67,184 @@ make -j && ./scripts/bench-all.sh 8

Running memcpy benchmark

memcpy: 46.58 GB/s (heat-up)
memcpy: 54.16 GB/s ( 1 thread)
memcpy: 54.23 GB/s ( 1 thread)
memcpy: 99.63 GB/s ( 2 thread)
memcpy: 140.59 GB/s ( 3 thread)
memcpy: 176.52 GB/s ( 4 thread)
memcpy: 158.90 GB/s ( 5 thread)
memcpy: 163.00 GB/s ( 6 thread)
memcpy: 189.69 GB/s ( 7 thread)
memcpy: 197.15 GB/s ( 8 thread)
sum:    -5120002007.000000
memcpy: 48.01 GB/s (heat-up)
memcpy: 56.00 GB/s ( 1 thread)
memcpy: 56.20 GB/s ( 1 thread)
memcpy: 102.69 GB/s ( 2 thread)
memcpy: 140.32 GB/s ( 3 thread)
memcpy: 179.04 GB/s ( 4 thread)
memcpy: 159.61 GB/s ( 5 thread)
memcpy: 159.02 GB/s ( 6 thread)
memcpy: 180.29 GB/s ( 7 thread)
memcpy: 198.10 GB/s ( 8 thread)
sum:    -5119999345.000000


make -j && ./scripts/bench-all.sh 1

Running ggml_mul_mat benchmark with 1 threads

  64 x   64: Q4_0   245.8 GFLOPS (128 runs) | Q4_1   168.6 GFLOPS (128 runs)
  64 x   64: Q5_0   115.7 GFLOPS (128 runs) | Q5_1   125.9 GFLOPS (128 runs) | Q8_0   215.8 GFLOPS (128 runs)
  64 x   64: F16    139.5 GFLOPS (128 runs) | F32    337.2 GFLOPS (128 runs)
 128 x  128: Q4_0   494.8 GFLOPS (128 runs) | Q4_1   350.4 GFLOPS (128 runs)
 128 x  128: Q5_0   257.1 GFLOPS (128 runs) | Q5_1   261.4 GFLOPS (128 runs) | Q8_0   509.4 GFLOPS (128 runs)
 128 x  128: F16    302.3 GFLOPS (128 runs) | F32    672.8 GFLOPS (128 runs)
 256 x  256: Q4_0   795.7 GFLOPS (128 runs) | Q4_1   663.7 GFLOPS (128 runs)
 256 x  256: Q5_0   737.8 GFLOPS (128 runs) | Q5_1   757.6 GFLOPS (128 runs) | Q8_0   827.7 GFLOPS (128 runs)
 256 x  256: F16    872.6 GFLOPS (128 runs) | F32    956.3 GFLOPS (128 runs)
 512 x  512: Q4_0  1188.0 GFLOPS (128 runs) | Q4_1  1085.0 GFLOPS (128 runs)
 512 x  512: Q5_0  1421.1 GFLOPS (128 runs) | Q5_1  1454.9 GFLOPS (128 runs) | Q8_0  1191.4 GFLOPS (128 runs)
 512 x  512: F16   1577.4 GFLOPS (128 runs) | F32   1982.0 GFLOPS (128 runs)
1024 x 1024: Q4_0  2342.6 GFLOPS (128 runs) | Q4_1  1955.8 GFLOPS (128 runs)
1024 x 1024: Q5_0  2306.7 GFLOPS (128 runs) | Q5_1  2217.0 GFLOPS (128 runs) | Q8_0  2230.7 GFLOPS (128 runs)
1024 x 1024: F16   2593.8 GFLOPS (128 runs) | F32   3269.0 GFLOPS (128 runs)
2048 x 2048: Q4_0  3735.7 GFLOPS (128 runs) | Q4_1  3205.3 GFLOPS (128 runs)
2048 x 2048: Q5_0  3584.5 GFLOPS (128 runs) | Q5_1  3621.7 GFLOPS (128 runs) | Q8_0  3622.3 GFLOPS (128 runs)
2048 x 2048: F16   3763.6 GFLOPS (128 runs) | F32   4153.3 GFLOPS (128 runs)
4096 x 4096: Q4_0  3891.1 GFLOPS ( 29 runs) | Q4_1  3554.0 GFLOPS ( 26 runs)
4096 x 4096: Q5_0  3753.1 GFLOPS ( 28 runs) | Q5_1  3750.1 GFLOPS ( 28 runs) | Q8_0  3768.5 GFLOPS ( 28 runs)
4096 x 4096: F16   3864.2 GFLOPS ( 29 runs) | F32   3970.5 GFLOPS ( 29 runs)
  64 x   64: Q4_0    37.7 GFLOPS (128 runs) | Q4_1    36.0 GFLOPS (128 runs)
  64 x   64: Q5_0    20.1 GFLOPS (128 runs) | Q5_1    19.8 GFLOPS (128 runs) | Q8_0    39.5 GFLOPS (128 runs)
  64 x   64: F16     29.9 GFLOPS (128 runs) | F32     22.6 GFLOPS (128 runs)
 128 x  128: Q4_0    71.0 GFLOPS (128 runs) | Q4_1    62.2 GFLOPS (128 runs)
 128 x  128: Q5_0    33.4 GFLOPS (128 runs) | Q5_1    31.6 GFLOPS (128 runs) | Q8_0    79.8 GFLOPS (128 runs)
 128 x  128: F16     52.4 GFLOPS (128 runs) | F32     32.7 GFLOPS (128 runs)
 256 x  256: Q4_0    88.6 GFLOPS (128 runs) | Q4_1    77.2 GFLOPS (128 runs)
 256 x  256: Q5_0    40.3 GFLOPS (128 runs) | Q5_1    36.8 GFLOPS (128 runs) | Q8_0   102.5 GFLOPS (128 runs)
 256 x  256: F16     64.6 GFLOPS (128 runs) | F32     36.4 GFLOPS (128 runs)
 512 x  512: Q4_0    94.7 GFLOPS (128 runs) | Q4_1    83.6 GFLOPS (128 runs)
 512 x  512: Q5_0    45.9 GFLOPS (128 runs) | Q5_1    41.3 GFLOPS (128 runs) | Q8_0   112.8 GFLOPS (128 runs)
 512 x  512: F16     72.3 GFLOPS (128 runs) | F32     37.7 GFLOPS (128 runs)
1024 x 1024: Q4_0    98.9 GFLOPS ( 47 runs) | Q4_1    88.2 GFLOPS ( 42 runs)
1024 x 1024: Q5_0    49.0 GFLOPS ( 23 runs) | Q5_1    43.9 GFLOPS ( 21 runs) | Q8_0   121.0 GFLOPS ( 57 runs)
1024 x 1024: F16     72.6 GFLOPS ( 34 runs) | F32     36.0 GFLOPS ( 17 runs)
2048 x 2048: Q4_0   101.3 GFLOPS (  6 runs) | Q4_1    90.0 GFLOPS (  6 runs)
2048 x 2048: Q5_0    50.8 GFLOPS (  3 runs) | Q5_1    45.3 GFLOPS (  3 runs) | Q8_0   124.1 GFLOPS (  8 runs)
2048 x 2048: F16     70.7 GFLOPS (  5 runs) | F32     30.4 GFLOPS (  3 runs)
4096 x 4096: Q4_0   101.7 GFLOPS (  3 runs) | Q4_1    90.3 GFLOPS (  3 runs)
4096 x 4096: Q5_0    52.2 GFLOPS (  3 runs) | Q5_1    45.7 GFLOPS (  3 runs) | Q8_0   123.0 GFLOPS (  3 runs)
4096 x 4096: F16     60.3 GFLOPS (  3 runs) | F32     29.8 GFLOPS (  3 runs)

make -j && ./scripts/bench-all.sh 1 1 0

| CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| M2 ULTRA | METAL | tiny | 1 | 0 | 12.32 | 1.35 | 0.49 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | tiny-q5_0 | 1 | 0 | 11.65 | 1.30 | 0.51 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | tiny-q5_1 | 1 | 0 | 12.08 | 1.30 | 0.51 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | base | 1 | 0 | 17.58 | 1.90 | 0.76 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | base-q5_0 | 1 | 0 | 18.89 | 1.86 | 0.79 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | base-q5_1 | 1 | 0 | 20.69 | 1.88 | 0.79 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | small | 1 | 0 | 49.32 | 3.85 | 1.71 | 0.05 | 22c96b4 |
| M2 ULTRA | METAL | small-q5_0 | 1 | 0 | 54.91 | 3.81 | 1.82 | 0.06 | 22c96b4 |
| M2 ULTRA | METAL | small-q5_1 | 1 | 0 | 54.92 | 3.81 | 1.79 | 0.06 | 22c96b4 |
| M2 ULTRA | METAL | medium | 1 | 0 | 134.34 | 8.04 | 3.82 | 0.13 | 22c96b4 |
| M2 ULTRA | METAL | medium-q5_0 | 1 | 0 | 151.68 | 7.59 | 4.07 | 0.14 | 22c96b4 |
| M2 ULTRA | METAL | medium-q5_1 | 1 | 0 | 151.58 | 7.67 | 4.07 | 0.14 | 22c96b4 |
| M2 ULTRA | METAL | medium-dis | 1 | 0 | 120.82 | 1.07 | 0.41 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | large-v2 | 1 | 0 | 235.63 | 12.27 | 5.85 | 0.22 | 22c96b4 |
| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 0 | 273.38 | 11.17 | 6.40 | 0.26 | 22c96b4 |
| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 0 | 272.44 | 11.32 | 6.29 | 0.26 | 22c96b4 |
| M2 ULTRA | METAL | large-v2-dis | 1 | 0 | 212.51 | 1.20 | 0.47 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | tiny | 1 | 0 | 8.74 | 1.20 | 0.36 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | tiny-q5_0 | 1 | 0 | 10.30 | 1.15 | 0.38 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | tiny-q5_1 | 1 | 0 | 10.71 | 1.13 | 0.38 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | tiny-q8_0 | 1 | 0 | 9.97 | 1.12 | 0.37 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | base | 1 | 0 | 16.77 | 1.71 | 0.44 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | base-q5_0 | 1 | 0 | 16.92 | 1.63 | 0.44 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | base-q5_1 | 1 | 0 | 16.84 | 1.63 | 0.44 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | base-q8_0 | 1 | 0 | 16.12 | 1.63 | 0.44 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | small | 1 | 0 | 45.29 | 3.44 | 0.92 | 0.05 | ad4e350 |
| M2 ULTRA | METAL | small-q5_0 | 1 | 0 | 50.43 | 3.34 | 0.94 | 0.06 | ad4e350 |
| M2 ULTRA | METAL | small-q5_1 | 1 | 0 | 50.49 | 3.35 | 0.93 | 0.06 | ad4e350 |
| M2 ULTRA | METAL | small-q8_0 | 1 | 0 | 47.37 | 3.20 | 0.91 | 0.05 | ad4e350 |
| M2 ULTRA | METAL | medium | 1 | 0 | 122.81 | 7.39 | 1.99 | 0.12 | ad4e350 |
| M2 ULTRA | METAL | medium-q5_0 | 1 | 0 | 140.62 | 6.73 | 2.03 | 0.14 | ad4e350 |
| M2 ULTRA | METAL | medium-q5_1 | 1 | 0 | 140.44 | 6.74 | 2.04 | 0.14 | ad4e350 |
| M2 ULTRA | METAL | medium-q8_0 | 1 | 0 | 131.05 | 6.54 | 1.95 | 0.13 | ad4e350 |
| M2 ULTRA | METAL | medium-dis | 1 | 0 | 110.95 | 0.99 | 0.24 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | large-v2 | 1 | 0 | 222.19 | 10.93 | 3.01 | 0.21 | ad4e350 |
| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 0 | 258.47 | 9.75 | 3.01 | 0.25 | ad4e350 |
| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 0 | 258.40 | 9.85 | 3.01 | 0.24 | ad4e350 |
| M2 ULTRA | METAL | large-v2-q8_0 | 1 | 0 | 236.68 | 9.61 | 2.85 | 0.23 | ad4e350 |
| M2 ULTRA | METAL | large-v2-dis | 1 | 0 | 199.28 | 1.12 | 0.27 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | large-v3-turbo | 1 | 0 | 201.49 | 1.76 | 0.45 | 0.03 | ad4e350 |
| M2 ULTRA | METAL | large-v3-turbo-q5_0 | 1 | 0 | 233.70 | 1.55 | 0.46 | 0.04 | ad4e350 |
| M2 ULTRA | METAL | large-v3-turbo-q8_0 | 1 | 0 | 214.20 | 1.51 | 0.44 | 0.04 | ad4e350 |


make -j && ./scripts/bench-all.sh 1 1 1

| CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| M2 ULTRA | METAL | tiny | 1 | 1 | 9.07 | 1.33 | 0.45 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | tiny-q5_0 | 1 | 1 | 9.74 | 1.33 | 0.47 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | tiny-q5_1 | 1 | 1 | 8.93 | 1.31 | 0.46 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | base | 1 | 1 | 15.75 | 1.87 | 0.71 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | base-q5_0 | 1 | 1 | 17.04 | 1.83 | 0.74 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | base-q5_1 | 1 | 1 | 17.17 | 1.83 | 0.74 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | small | 1 | 1 | 42.33 | 3.64 | 1.60 | 0.05 | 22c96b4 |
| M2 ULTRA | METAL | small-q5_0 | 1 | 1 | 47.61 | 3.63 | 1.70 | 0.05 | 22c96b4 |
| M2 ULTRA | METAL | small-q5_1 | 1 | 1 | 47.70 | 3.66 | 1.68 | 0.05 | 22c96b4 |
| M2 ULTRA | METAL | medium | 1 | 1 | 114.42 | 7.53 | 3.55 | 0.11 | 22c96b4 |
| M2 ULTRA | METAL | medium-q5_0 | 1 | 1 | 132.63 | 7.02 | 3.77 | 0.13 | 22c96b4 |
| M2 ULTRA | METAL | medium-q5_1 | 1 | 1 | 132.28 | 7.10 | 3.76 | 0.13 | 22c96b4 |
| M2 ULTRA | METAL | medium-dis | 1 | 1 | 102.34 | 1.01 | 0.42 | 0.01 | 22c96b4 |
| M2 ULTRA | METAL | large-v2 | 1 | 1 | 203.01 | 11.03 | 5.45 | 0.20 | 22c96b4 |
| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 1 | 240.05 | 10.18 | 5.98 | 0.23 | 22c96b4 |
| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 1 | 239.22 | 10.23 | 5.87 | 0.23 | 22c96b4 |
| M2 ULTRA | METAL | large-v2-dis | 1 | 1 | 181.14 | 1.14 | 0.48 | 0.02 | 22c96b4 |
| M2 ULTRA | METAL | tiny | 1 | 1 | 7.82 | 1.31 | 0.35 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | tiny-q5_0 | 1 | 1 | 8.32 | 1.28 | 0.37 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | tiny-q5_1 | 1 | 1 | 8.21 | 1.28 | 0.37 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | tiny-q8_0 | 1 | 1 | 7.97 | 1.23 | 0.36 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | base | 1 | 1 | 13.96 | 1.80 | 0.42 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | base-q5_0 | 1 | 1 | 15.19 | 1.75 | 0.42 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | base-q5_1 | 1 | 1 | 15.09 | 1.75 | 0.42 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | base-q8_0 | 1 | 1 | 14.45 | 1.70 | 0.41 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | small | 1 | 1 | 40.08 | 3.54 | 0.86 | 0.05 | ad4e350 |
| M2 ULTRA | METAL | small-q5_0 | 1 | 1 | 45.07 | 3.51 | 0.88 | 0.05 | ad4e350 |
| M2 ULTRA | METAL | small-q5_1 | 1 | 1 | 45.05 | 3.52 | 0.88 | 0.05 | ad4e350 |
| M2 ULTRA | METAL | small-q8_0 | 1 | 1 | 42.04 | 3.34 | 0.85 | 0.05 | ad4e350 |
| M2 ULTRA | METAL | medium | 1 | 1 | 107.20 | 7.28 | 1.79 | 0.11 | ad4e350 |
| M2 ULTRA | METAL | medium-q5_0 | 1 | 1 | 125.02 | 6.67 | 1.83 | 0.12 | ad4e350 |
| M2 ULTRA | METAL | medium-q5_1 | 1 | 1 | 124.83 | 6.70 | 1.84 | 0.12 | ad4e350 |
| M2 ULTRA | METAL | medium-q8_0 | 1 | 1 | 114.56 | 6.53 | 1.79 | 0.11 | ad4e350 |
| M2 ULTRA | METAL | medium-dis | 1 | 1 | 95.96 | 1.01 | 0.23 | 0.01 | ad4e350 |
| M2 ULTRA | METAL | large-v2 | 1 | 1 | 194.29 | 10.57 | 2.67 | 0.20 | ad4e350 |
| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 1 | 230.74 | 9.57 | 2.73 | 0.23 | ad4e350 |
| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 1 | 229.97 | 9.69 | 2.74 | 0.23 | ad4e350 |
| M2 ULTRA | METAL | large-v2-q8_0 | 1 | 1 | 208.11 | 9.37 | 2.60 | 0.21 | ad4e350 |
| M2 ULTRA | METAL | large-v2-dis | 1 | 1 | 172.72 | 1.12 | 0.26 | 0.02 | ad4e350 |
| M2 ULTRA | METAL | large-v3-turbo | 1 | 1 | 174.46 | 1.74 | 0.42 | 0.03 | ad4e350 |
| M2 ULTRA | METAL | large-v3-turbo-q5_0 | 1 | 1 | 205.78 | 1.54 | 0.42 | 0.04 | ad4e350 |
| M2 ULTRA | METAL | large-v3-turbo-q8_0 | 1 | 1 | 186.33 | 1.50 | 0.40 | 0.03 | ad4e350 |

## M4 Max
## Ryzen 9 5950X + RTX 2060

make -j && ./scripts/bench-all.sh 8 0 0
make -j && ./scripts/bench-all.sh 8

Running memcpy benchmark

memcpy: 12.36 GB/s (heat-up)
memcpy: 12.33 GB/s ( 1 thread)
memcpy: 12.38 GB/s ( 1 thread)
memcpy: 14.48 GB/s ( 2 thread)
memcpy: 15.00 GB/s ( 3 thread)
memcpy: 14.77 GB/s ( 4 thread)
memcpy: 14.60 GB/s ( 5 thread)
memcpy: 14.57 GB/s ( 6 thread)
memcpy: 14.34 GB/s ( 7 thread)
memcpy: 14.40 GB/s ( 8 thread)
sum:    -5119998076.000000

Running ggml_mul_mat benchmark with 8 threads

  64 x   64: Q4_0     3.1 GFLOPS (128 runs) | Q4_1     3.1 GFLOPS (128 runs)
  64 x   64: Q5_0     3.0 GFLOPS (128 runs) | Q5_1     2.9 GFLOPS (128 runs) | Q8_0     3.1 GFLOPS (128 runs)
  64 x   64: F16      3.0 GFLOPS (128 runs) | F32      3.0 GFLOPS (128 runs)
 128 x  128: Q4_0    21.1 GFLOPS (128 runs) | Q4_1    20.3 GFLOPS (128 runs)
 128 x  128: Q5_0    20.6 GFLOPS (128 runs) | Q5_1    20.4 GFLOPS (128 runs) | Q8_0    22.1 GFLOPS (128 runs)
 128 x  128: F16     21.7 GFLOPS (128 runs) | F32     21.7 GFLOPS (128 runs)
 256 x  256: Q4_0   105.7 GFLOPS (128 runs) | Q4_1    94.4 GFLOPS (128 runs)
 256 x  256: Q5_0    94.8 GFLOPS (128 runs) | Q5_1    87.5 GFLOPS (128 runs) | Q8_0   107.2 GFLOPS (128 runs)
 256 x  256: F16     95.1 GFLOPS (128 runs) | F32     94.3 GFLOPS (128 runs)
 512 x  512: Q4_0   214.7 GFLOPS (128 runs) | Q4_1   189.8 GFLOPS (128 runs)
 512 x  512: Q5_0   187.7 GFLOPS (128 runs) | Q5_1   176.2 GFLOPS (128 runs) | Q8_0   252.2 GFLOPS (128 runs)
 512 x  512: F16    220.8 GFLOPS (128 runs) | F32    218.3 GFLOPS (128 runs)
1024 x 1024: Q4_0   333.7 GFLOPS (128 runs) | Q4_1   305.8 GFLOPS (128 runs)
1024 x 1024: Q5_0   283.2 GFLOPS (128 runs) | Q5_1   268.2 GFLOPS (125 runs) | Q8_0   394.1 GFLOPS (128 runs)
1024 x 1024: F16    355.0 GFLOPS (128 runs) | F32    313.0 GFLOPS (128 runs)
2048 x 2048: Q4_0   395.0 GFLOPS ( 23 runs) | Q4_1   380.6 GFLOPS ( 23 runs)
2048 x 2048: Q5_0   336.6 GFLOPS ( 20 runs) | Q5_1   318.4 GFLOPS ( 19 runs) | Q8_0   482.6 GFLOPS ( 29 runs)
2048 x 2048: F16    424.5 GFLOPS ( 25 runs) | F32    337.7 GFLOPS ( 20 runs)
4096 x 4096: Q4_0   412.8 GFLOPS (  4 runs) | Q4_1   405.1 GFLOPS (  3 runs)
4096 x 4096: Q5_0   346.0 GFLOPS (  3 runs) | Q5_1   334.6 GFLOPS (  3 runs) | Q8_0   502.6 GFLOPS (  4 runs)
4096 x 4096: F16    412.5 GFLOPS (  4 runs) | F32    274.0 GFLOPS (  3 runs)

| CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Ryzen 9 5950X | AVX2 | tiny | 8 | 0 | 195.29 | 1.57 | 0.51 | 0.26 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | tiny-q5_0 | 8 | 0 | 213.33 | 1.10 | 0.50 | 0.30 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | tiny-q5_1 | 8 | 0 | 219.38 | 1.18 | 0.53 | 0.32 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | base | 8 | 0 | 424.85 | 3.71 | 1.03 | 0.46 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | base-q5_0 | 8 | 0 | 473.61 | 1.81 | 0.82 | 0.52 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | base-q5_1 | 8 | 0 | 484.14 | 1.92 | 0.85 | 0.56 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | small | 8 | 0 | 1458.32 | 12.66 | 3.09 | 1.26 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | small-q5_0 | 8 | 0 | 1673.22 | 6.42 | 2.18 | 1.45 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | small-q5_1 | 8 | 0 | 1724.78 | 6.72 | 2.32 | 1.52 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | medium | 8 | 0 | 4333.87 | 36.80 | 8.56 | 3.37 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | medium-q5_0 | 8 | 0 | 5194.09 | 19.21 | 5.71 | 3.97 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | medium-q5_1 | 8 | 0 | 5450.39 | 20.01 | 5.99 | 4.17 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | medium-dis | 8 | 0 | 3995.19 | 5.08 | 1.21 | 0.55 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | large-v2 | 8 | 0 | 8056.16 | 69.74 | 16.11 | 6.13 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | large-v2-q5_0 | 8 | 0 | 9799.58 | 35.16 | 10.49 | 7.28 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | large-v2-q5_1 | 8 | 0 | ms | 36.74 | 11.02 | 7.65 | 22c96b4 |
| Ryzen 9 5950X | AVX2 | large-v2-dis | 8 | 0 | 7490.03 | 7.40 | 1.70 | 0.72 | 22c96b4 |
memcpy: 57.23 GB/s (heat-up)
memcpy: 68.85 GB/s ( 1 thread)
memcpy: 70.00 GB/s ( 1 thread)
memcpy: 104.83 GB/s ( 2 thread)
memcpy: 124.54 GB/s ( 3 thread)
memcpy: 144.30 GB/s ( 4 thread)
memcpy: 141.24 GB/s ( 5 thread)
memcpy: 147.03 GB/s ( 6 thread)
memcpy: 147.18 GB/s ( 7 thread)
memcpy: 149.83 GB/s ( 8 thread)
sum:    -5120001475.000000


WHISPER_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 0
make -j && ./scripts/bench-all.sh 1

| GPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| RTX 2060 | AVX2 CUDA | tiny | 8 | 0 | 12.54 | 0.93 | 0.29 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | tiny-q5_0 | 8 | 0 | 12.73 | 0.98 | 0.24 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | tiny-q5_1 | 8 | 0 | 12.72 | 0.99 | 0.24 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | base | 8 | 0 | 24.14 | 1.28 | 0.41 | 0.03 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | base-q5_0 | 8 | 0 | 24.58 | 1.38 | 0.35 | 0.03 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | base-q5_1 | 8 | 0 | 24.58 | 1.37 | 0.35 | 0.03 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | small | 8 | 0 | 74.70 | 2.91 | 0.84 | 0.07 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | small-q5_0 | 8 | 0 | 76.12 | 2.84 | 0.77 | 0.08 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | small-q5_1 | 8 | 0 | 76.14 | 2.84 | 0.76 | 0.08 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium | 8 | 0 | 200.69 | 6.46 | 1.83 | 0.17 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium-q5_0 | 8 | 0 | 204.80 | 5.90 | 1.65 | 0.19 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium-q5_1 | 8 | 0 | 205.61 | 5.85 | 1.61 | 0.19 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium-dis | 8 | 0 | 186.17 | 0.86 | 0.24 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2 | 8 | 0 | 347.22 | 10.36 | 2.82 | 0.29 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2-q5_0 | 8 | 0 | 357.06 | 8.81 | 2.58 | 0.34 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2-q5_1 | 8 | 0 | 356.97 | 8.62 | 2.49 | 0.33 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2-dis | 8 | 0 | 318.05 | 1.03 | 0.34 | 0.04 | 22c96b4 |
Running ggml_mul_mat benchmark with 1 threads

  64 x   64: Q4_0    49.6 GFLOPS (128 runs) | Q4_1    46.8 GFLOPS (128 runs)
  64 x   64: Q5_0    28.1 GFLOPS (128 runs) | Q5_1    26.8 GFLOPS (128 runs) | Q8_0    52.3 GFLOPS (128 runs)
  64 x   64: F16     38.1 GFLOPS (128 runs) | F32     26.0 GFLOPS (128 runs)
 128 x  128: Q4_0    87.6 GFLOPS (128 runs) | Q4_1    79.9 GFLOPS (128 runs)
 128 x  128: Q5_0    44.7 GFLOPS (128 runs) | Q5_1    41.6 GFLOPS (128 runs) | Q8_0    98.9 GFLOPS (128 runs)
 128 x  128: F16     64.1 GFLOPS (128 runs) | F32     35.4 GFLOPS (128 runs)
 256 x  256: Q4_0   104.2 GFLOPS (128 runs) | Q4_1    92.3 GFLOPS (128 runs)
 256 x  256: Q5_0    57.3 GFLOPS (128 runs) | Q5_1    51.5 GFLOPS (128 runs) | Q8_0   127.7 GFLOPS (128 runs)
 256 x  256: F16     71.4 GFLOPS (128 runs) | F32     40.6 GFLOPS (128 runs)
 512 x  512: Q4_0   109.5 GFLOPS (128 runs) | Q4_1    98.0 GFLOPS (128 runs)
 512 x  512: Q5_0    62.4 GFLOPS (128 runs) | Q5_1    54.6 GFLOPS (128 runs) | Q8_0   135.0 GFLOPS (128 runs)
 512 x  512: F16     82.6 GFLOPS (128 runs) | F32     44.6 GFLOPS (128 runs)
1024 x 1024: Q4_0   112.1 GFLOPS ( 53 runs) | Q4_1   100.9 GFLOPS ( 47 runs)
1024 x 1024: Q5_0    65.4 GFLOPS ( 31 runs) | Q5_1    56.7 GFLOPS ( 27 runs) | Q8_0   140.9 GFLOPS ( 66 runs)
1024 x 1024: F16     88.0 GFLOPS ( 41 runs) | F32     43.4 GFLOPS ( 21 runs)
2048 x 2048: Q4_0   113.4 GFLOPS (  7 runs) | Q4_1   102.0 GFLOPS (  6 runs)
2048 x 2048: Q5_0    67.1 GFLOPS (  4 runs) | Q5_1    57.7 GFLOPS (  4 runs) | Q8_0   142.7 GFLOPS (  9 runs)
2048 x 2048: F16     84.6 GFLOPS (  5 runs) | F32     37.5 GFLOPS (  3 runs)
4096 x 4096: Q4_0   113.8 GFLOPS (  3 runs) | Q4_1   102.0 GFLOPS (  3 runs)
4096 x 4096: Q5_0    67.7 GFLOPS (  3 runs) | Q5_1    58.0 GFLOPS (  3 runs) | Q8_0   142.9 GFLOPS (  3 runs)
4096 x 4096: F16     73.7 GFLOPS (  3 runs) | F32     36.1 GFLOPS (  3 runs)


WHISPER_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 1
make -j && ./scripts/bench-all.sh 1 1 0

| GPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| RTX 2060 | AVX2 CUDA | tiny | 8 | 1 | 7.21 | 0.76 | 0.29 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | tiny-q5_0 | 8 | 1 | 7.42 | 0.82 | 0.18 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | tiny-q5_1 | 8 | 1 | 7.38 | 0.82 | 0.18 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | base | 8 | 1 | 13.49 | 1.04 | 0.36 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | base-q5_0 | 8 | 1 | 13.94 | 1.13 | 0.26 | 0.03 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | base-q5_1 | 8 | 1 | 13.94 | 1.14 | 0.26 | 0.03 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | small | 8 | 1 | 42.81 | 2.33 | 0.69 | 0.05 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | small-q5_0 | 8 | 1 | 44.43 | 2.25 | 0.59 | 0.06 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | small-q5_1 | 8 | 1 | 44.11 | 2.24 | 0.58 | 0.06 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium | 8 | 1 | 115.47 | 5.17 | 1.45 | 0.11 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium-q5_0 | 8 | 1 | 120.37 | 4.63 | 1.25 | 0.13 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium-q5_1 | 8 | 1 | 120.28 | 4.55 | 1.21 | 0.13 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | medium-dis | 8 | 1 | 101.69 | 0.75 | 0.20 | 0.02 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2 | 8 | 1 | 205.67 | 8.49 | 2.19 | 0.18 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2-q5_0 | 8 | 1 | 214.07 | 6.88 | 1.94 | 0.22 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2-q5_1 | 8 | 1 | 213.98 | 6.70 | 1.86 | 0.22 | 22c96b4 |
| RTX 2060 | AVX2 CUDA | large-v2-dis | 8 | 1 | 176.71 | 0.91 | 0.31 | 0.03 | 22c96b4 |
| CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| M4 Max | METAL | tiny | 1 | 0 | 13.12 | 0.87 | 0.29 | 0.01 | ad4e3509 |
| M4 Max | METAL | tiny-q8_0 | 1 | 0 | 15.90 | 0.88 | 0.31 | 0.01 | ad4e3509 |
| M4 Max | METAL | base | 1 | 0 | 23.10 | 1.42 | 0.34 | 0.02 | ad4e3509 |
| M4 Max | METAL | base-q8_0 | 1 | 0 | 27.25 | 1.31 | 0.34 | 0.02 | ad4e3509 |
| M4 Max | METAL | small | 1 | 0 | 71.76 | 3.02 | 0.70 | 0.06 | ad4e3509 |
| M4 Max | METAL | small-q8_0 | 1 | 0 | 73.88 | 2.60 | 0.71 | 0.06 | ad4e3509 |
| M4 Max | METAL | medium | 1 | 0 | 208.22 | 6.94 | 1.55 | 0.16 | ad4e3509 |
| M4 Max | METAL | medium-q8_0 | 1 | 0 | 214.65 | 5.90 | 1.57 | 0.17 | ad4e3509 |
| M4 Max | METAL | large-v2 | 1 | 0 | 381.72 | 11.28 | 2.51 | 0.29 | ad4e3509 |
| M4 Max | METAL | large-v2-q8_0 | 1 | 0 | 394.97 | 8.90 | 2.45 | 0.30 | ad4e3509 |


make -j && ./scripts/bench-all.sh 1 1 1

| CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| M4 Max | METAL | tiny | 1 | 1 | 15.22 | 0.89 | 0.26 | 0.01 | ad4e3509 |
| M4 Max | METAL | tiny-q8_0 | 1 | 1 | 14.70 | 0.86 | 0.26 | 0.01 | ad4e3509 |
| M4 Max | METAL | base | 1 | 1 | 25.33 | 1.36 | 0.30 | 0.02 | ad4e3509 |
| M4 Max | METAL | base-q8_0 | 1 | 1 | 21.27 | 1.31 | 0.30 | 0.02 | ad4e3509 |
| M4 Max | METAL | small | 1 | 1 | 58.43 | 2.78 | 0.60 | 0.05 | ad4e3509 |
| M4 Max | METAL | small-q8_0 | 1 | 1 | 60.26 | 2.39 | 0.60 | 0.05 | ad4e3509 |
| M4 Max | METAL | medium | 1 | 1 | 169.73 | 6.03 | 1.31 | 0.14 | ad4e3509 |
| M4 Max | METAL | medium-q8_0 | 1 | 1 | 176.61 | 4.99 | 1.31 | 0.14 | ad4e3509 |
| M4 Max | METAL | large-v2 | 1 | 1 | 316.18 | 9.60 | 2.08 | 0.24 | ad4e3509 |
| M4 Max | METAL | large-v2-q8_0 | 1 | 1 | 329.59 | 7.55 | 2.08 | 0.25 | ad4e3509 |

# V100

@ -271,28 +253,33 @@ WHISPER_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 0

| GPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| V100 | AVX2 CUDA | tiny | 1 | 0 | 6.21 | 1.11 | 0.30 | 0.02 | 22c96b4 |
| V100 | AVX2 CUDA | tiny-q5_1 | 1 | 0 | 5.97 | 1.10 | 0.26 | 0.02 | 22c96b4 |
| V100 | AVX2 CUDA | base | 1 | 0 | 10.95 | 1.47 | 0.42 | 0.03 | 22c96b4 |
| V100 | AVX2 CUDA | base-q5_1 | 1 | 0 | 11.13 | 1.53 | 0.36 | 0.03 | 22c96b4 |
| V100 | AVX2 CUDA | small | 1 | 0 | 31.57 | 2.96 | 0.84 | 0.05 | 22c96b4 |
| V100 | AVX2 CUDA | small-q5_1 | 1 | 0 | 32.19 | 3.14 | 0.75 | 0.05 | 22c96b4 |
| V100 | AVX2 CUDA | medium | 1 | 0 | 85.88 | 6.49 | 1.80 | 0.10 | 22c96b4 |
| V100 | AVX2 CUDA | medium-q5_0 | 1 | 0 | 87.53 | 5.82 | 1.37 | 0.10 | 22c96b4 |
| V100 | AVX2 CUDA | large-v2 | 1 | 0 | 142.23 | 8.92 | 2.62 | 0.15 | 22c96b4 |
| V100 | AVX2 CUDA | tiny | 8 | 0 | 6.15 | 1.02 | 0.30 | 0.01 | ad4e3509 |
| V100 | AVX2 CUDA | tiny-q5_1 | 8 | 0 | 5.92 | 0.96 | 0.25 | 0.01 | ad4e3509 |
| V100 | AVX2 CUDA | base | 8 | 0 | 10.60 | 1.43 | 0.43 | 0.02 | ad4e3509 |
| V100 | AVX2 CUDA | base-q5_1 | 8 | 0 | 10.80 | 1.37 | 0.36 | 0.02 | ad4e3509 |
| V100 | AVX2 CUDA | small | 8 | 0 | 31.83 | 2.82 | 0.87 | 0.04 | ad4e3509 |
| V100 | AVX2 CUDA | small-q5_1 | 8 | 0 | 31.88 | 2.68 | 0.72 | 0.04 | ad4e3509 |
| V100 | AVX2 CUDA | medium | 8 | 0 | 81.30 | 6.02 | 1.81 | 0.09 | ad4e3509 |
| V100 | AVX2 CUDA | medium-q5_0 | 8 | 0 | 83.21 | 5.44 | 1.41 | 0.10 | ad4e3509 |
| V100 | AVX2 CUDA | large-v2 | 8 | 0 | 134.81 | 8.64 | 2.69 | 0.14 | ad4e3509 |
| V100 | AVX2 CUDA | large-v2-q5_0 | 8 | 0 | 138.95 | 7.57 | 2.04 | 0.15 | ad4e3509 |
| V100 | AVX2 CUDA | large-v3-turbo | 8 | 0 | 124.42 | 1.37 | 0.43 | 0.02 | ad4e3509 |
| V100 | AVX2 CUDA | large-v3-turbo-q5_0 | 8 | 0 | 127.81 | 1.13 | 0.32 | 0.03 | ad4e3509 |


WHISPER_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 1

| GPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| V100 | AVX2 CUDA | tiny | 1 | 1 | 3.96 | 0.82 | 0.24 | 0.02 | 22c96b4 |
| V100 | AVX2 CUDA | tiny-q5_1 | 1 | 1 | 4.05 | 0.85 | 0.18 | 0.02 | 22c96b4 |
| V100 | AVX2 CUDA | base | 1 | 1 | 7.21 | 1.16 | 0.36 | 0.02 | 22c96b4 |
| V100 | AVX2 CUDA | base-q5_1 | 1 | 1 | 7.39 | 1.21 | 0.26 | 0.02 | 22c96b4 |
| V100 | AVX2 CUDA | small | 1 | 1 | 19.81 | 2.41 | 0.71 | 0.04 | 22c96b4 |
| V100 | AVX2 CUDA | small-q5_1 | 1 | 1 | 20.50 | 2.31 | 0.51 | 0.04 | 22c96b4 |
| V100 | AVX2 CUDA | medium | 1 | 1 | 56.02 | 4.89 | 1.44 | 0.07 | 22c96b4 |
| V100 | AVX2 CUDA | medium-q5_0 | 1 | 1 | 57.85 | 4.73 | 1.09 | 0.08 | 22c96b4 |
| V100 | AVX2 CUDA | large-v2 | 1 | 1 | 92.73 | 7.18 | 2.14 | 0.10 | 22c96b4 |

| V100 | AVX2 CUDA | tiny | 8 | 1 | 4.01 | 0.90 | 0.25 | 0.01 | ad4e3509 |
| V100 | AVX2 CUDA | tiny-q5_1 | 8 | 1 | 4.12 | 0.88 | 0.18 | 0.01 | ad4e3509 |
| V100 | AVX2 CUDA | base | 8 | 1 | 7.00 | 1.30 | 0.35 | 0.01 | ad4e3509 |
| V100 | AVX2 CUDA | base-q5_1 | 8 | 1 | 7.22 | 1.21 | 0.26 | 0.02 | ad4e3509 |
| V100 | AVX2 CUDA | small | 8 | 1 | 18.68 | 2.39 | 0.69 | 0.03 | ad4e3509 |
| V100 | AVX2 CUDA | small-q5_1 | 8 | 1 | 19.38 | 2.32 | 0.51 | 0.03 | ad4e3509 |
| V100 | AVX2 CUDA | medium | 8 | 1 | 53.17 | 5.15 | 1.45 | 0.06 | ad4e3509 |
| V100 | AVX2 CUDA | medium-q5_0 | 8 | 1 | 55.09 | 4.64 | 1.05 | 0.07 | ad4e3509 |
| V100 | AVX2 CUDA | large-v2 | 8 | 1 | 85.77 | 7.57 | 2.19 | 0.10 | ad4e3509 |
| V100 | AVX2 CUDA | large-v2-q5_0 | 8 | 1 | 89.24 | 6.48 | 1.48 | 0.11 | ad4e3509 |
| V100 | AVX2 CUDA | large-v3-turbo | 8 | 1 | 75.56 | 1.25 | 0.37 | 0.02 | ad4e3509 |
| V100 | AVX2 CUDA | large-v3-turbo-q5_0 | 8 | 1 | 78.48 | 1.01 | 0.24 | 0.02 | ad4e3509 |
@ -1 +1 @@
ba8dccd2fd53fc9cac9afdb2f1f45deedb33c1ee
7d7aa2dee2eb55dc683af80b769b81a0642226a1
@ -11,36 +11,33 @@

NS_ASSUME_NONNULL_BEGIN


/// Model Prediction Input Type
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface whisper_decoder_implInput : NSObject<MLFeatureProvider>

/// token_data as 1 by 1 matrix of 32-bit integers
/// token_data as 1 by 1 matrix of floats
@property (readwrite, nonatomic, strong) MLMultiArray * token_data;

/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats
/// audio_data as 1 × 1500 × 384 3-dimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * audio_data;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER;

@end


/// Model Prediction Output Type
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>

/// var_1346 as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
/// cast_76 as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * cast_76;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
- (instancetype)initWithCast_76:(MLMultiArray *)cast_76 NS_DESIGNATED_INITIALIZER;

@end


/// Class for model loading and prediction
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface whisper_decoder_impl : NSObject
@property (readonly, nonatomic, nullable) MLModel * model;

@ -94,7 +91,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));

/**
    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
@ -105,7 +102,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));

/**
    Make a prediction using the standard interface
@ -124,10 +121,25 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
*/
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;

/**
    Make an asynchronous prediction using the standard interface
    @param input an instance of whisper_decoder_implInput to predict from
    @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
*/
- (void)predictionFromFeatures:(whisper_decoder_implInput *)input completionHandler:(void (^)(whisper_decoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));

/**
    Make an asynchronous prediction using the standard interface
    @param input an instance of whisper_decoder_implInput to predict from
    @param options prediction options
    @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
*/
- (void)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(whisper_decoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));

/**
    Make a prediction using the convenience interface
    @param token_data as 1 by 1 matrix of 32-bit integers:
    @param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats:
    @param token_data 1 by 1 matrix of floats
    @param audio_data 1 × 1500 × 384 3-dimensional array of floats
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
    @return the prediction as whisper_decoder_implOutput
*/
@ -39,21 +39,21 @@

@implementation whisper_decoder_implOutput

- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
- (instancetype)initWithCast_76:(MLMultiArray *)cast_76 {
    self = [super init];
    if (self) {
        _var_1346 = var_1346;
        _cast_76 = cast_76;
    }
    return self;
}

- (NSSet<NSString *> *)featureNames {
    return [NSSet setWithArray:@[@"var_1346"]];
    return [NSSet setWithArray:@[@"cast_76"]];
}

- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
    if ([featureName isEqualToString:@"var_1346"]) {
        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
    if ([featureName isEqualToString:@"cast_76"]) {
        return [MLFeatureValue featureValueWithMultiArray:self.cast_76];
    }
    return nil;
}
@ -80,10 +80,13 @@
    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model {
    if (model == nil) {
        return nil;
    }
    self = [super init];
    if (!self) { return nil; }
    _model = model;
    if (_model == nil) { return nil; }
    if (self != nil) {
        _model = model;
    }
    return self;
}

@ -177,7 +180,29 @@
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
    if (!outFeatures) { return nil; }
    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
    return [[whisper_decoder_implOutput alloc] initWithCast_76:(MLMultiArray *)[outFeatures featureValueForName:@"cast_76"].multiArrayValue];
}

- (void)predictionFromFeatures:(whisper_decoder_implInput *)input completionHandler:(void (^)(whisper_decoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler {
    [self.model predictionFromFeatures:input completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
        if (prediction != nil) {
            whisper_decoder_implOutput *output = [[whisper_decoder_implOutput alloc] initWithCast_76:(MLMultiArray *)[prediction featureValueForName:@"cast_76"].multiArrayValue];
            completionHandler(output, predictionError);
        } else {
            completionHandler(nil, predictionError);
        }
    }];
}

- (void)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(whisper_decoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler {
    [self.model predictionFromFeatures:input options:options completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
        if (prediction != nil) {
            whisper_decoder_implOutput *output = [[whisper_decoder_implOutput alloc] initWithCast_76:(MLMultiArray *)[prediction featureValueForName:@"cast_76"].multiArrayValue];
            completionHandler(output, predictionError);
        } else {
            completionHandler(nil, predictionError);
        }
    }];
}

- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@ -192,7 +217,7 @@
    NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
    for (NSInteger i = 0; i < outBatch.count; i++) {
        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithCast_76:(MLMultiArray *)[resultProvider featureValueForName:@"cast_76"].multiArrayValue];
        [results addObject:result];
    }
    return results;
@ -11,9 +11,8 @@

NS_ASSUME_NONNULL_BEGIN


/// Model Prediction Input Type
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface whisper_encoder_implInput : NSObject<MLFeatureProvider>

/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats
@ -23,9 +22,8 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v

@end


/// Model Prediction Output Type
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface whisper_encoder_implOutput : NSObject<MLFeatureProvider>

/// output as multidimensional array of floats
@ -35,9 +33,8 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v

@end


/// Class for model loading and prediction
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface whisper_encoder_impl : NSObject
@property (readonly, nonatomic, nullable) MLModel * model;

@ -91,7 +88,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));

/**
    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
@ -102,7 +99,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));

/**
    Make a prediction using the standard interface
@ -121,9 +118,24 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
*/
- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;

/**
    Make an asynchronous prediction using the standard interface
    @param input an instance of whisper_encoder_implInput to predict from
    @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
*/
- (void)predictionFromFeatures:(whisper_encoder_implInput *)input completionHandler:(void (^)(whisper_encoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));

/**
    Make an asynchronous prediction using the standard interface
    @param input an instance of whisper_encoder_implInput to predict from
    @param options prediction options
    @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
*/
- (void)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(whisper_encoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));

/**
    Make a prediction using the convenience interface
    @param logmel_data as 1 × n_mel × 3000 3-dimensional array of floats:
    @param logmel_data 1 × 80 × 3000 3-dimensional array of floats
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
    @return the prediction as whisper_encoder_implOutput
*/
@ -76,10 +76,13 @@
|
||||
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
|
||||
*/
|
||||
- (instancetype)initWithMLModel:(MLModel *)model {
|
||||
if (model == nil) {
|
||||
return nil;
|
||||
}
|
||||
self = [super init];
|
||||
if (!self) { return nil; }
|
||||
_model = model;
|
||||
if (_model == nil) { return nil; }
|
||||
if (self != nil) {
|
||||
_model = model;
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
@ -176,6 +179,28 @@
    return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
}

- (void)predictionFromFeatures:(whisper_encoder_implInput *)input completionHandler:(void (^)(whisper_encoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler {
    [self.model predictionFromFeatures:input completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
        if (prediction != nil) {
            whisper_encoder_implOutput *output = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
            completionHandler(output, predictionError);
        } else {
            completionHandler(nil, predictionError);
        }
    }];
}

- (void)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(whisper_encoder_implOutput * _Nullable output, NSError * _Nullable error))completionHandler {
    [self.model predictionFromFeatures:input options:options completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
        if (prediction != nil) {
            whisper_encoder_implOutput *output = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
            completionHandler(output, predictionError);
        } else {
            completionHandler(nil, predictionError);
        }
    }];
}

- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data];
    return [self predictionFromFeatures:input_ error:error];
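A hedged sketch of driving the new asynchronous wrappers added above; `encoder` and `input` are assumed to exist, and the availability guard mirrors the API_AVAILABLE annotation in the header:

    // Sketch under the assumptions above; the completion handler may run on an
    // arbitrary queue, so hop back to the main queue before touching UI state.
    if (@available(macOS 14.0, iOS 17.0, watchOS 10.0, tvOS 17.0, *)) {
        [encoder predictionFromFeatures:input completionHandler:^(whisper_encoder_implOutput * _Nullable output, NSError * _Nullable error) {
            if (output == nil) {
                NSLog(@"async encoder prediction failed: %@", error);
                return;
            }
            // output.output holds the encoder features as an MLMultiArray
            // (property name assumed from the initWithOutput: call above).
        }];
    }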
@ -12,74 +12,74 @@ if (EMSCRIPTEN)
return()
endif()

set(TEST_TARGET test-main-tiny)
set(TEST_TARGET test-whisper-cli-tiny)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.bin -l fr
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;gh")

set(TEST_TARGET test-main-tiny.en)
set(TEST_TARGET test-whisper-cli-tiny.en)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;en;gh")
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;en")

set(TEST_TARGET test-main-base)
set(TEST_TARGET test-whisper-cli-base)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.bin -l fr
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base")

set(TEST_TARGET test-main-base.en)
set(TEST_TARGET test-whisper-cli-base.en)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.en.bin
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base;en")

set(TEST_TARGET test-main-small)
set(TEST_TARGET test-whisper-cli-small)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.bin -l fr
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small")

set(TEST_TARGET test-main-small.en)
set(TEST_TARGET test-whisper-cli-small.en)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.en.bin
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small;en")

set(TEST_TARGET test-main-medium)
set(TEST_TARGET test-whisper-cli-medium)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.bin -l fr
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium")

set(TEST_TARGET test-main-medium.en)
set(TEST_TARGET test-whisper-cli-medium.en)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.en.bin
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium;en")

set(TEST_TARGET test-main-large)
set(TEST_TARGET test-whisper-cli-large)
add_test(NAME ${TEST_TARGET}
  COMMAND $<TARGET_FILE:main>
  COMMAND $<TARGET_FILE:whisper-cli>
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large")

if (WHISPER_FFMPEG)
  set(TEST_TARGET test-main-tiny-mp3)
  set(TEST_TARGET test-whisper-cli-tiny-mp3)
  # Check with reviewers: any way to check the output transcription via ctest (diff, ...)?
  add_test(NAME ${TEST_TARGET}
    COMMAND $<TARGET_FILE:main>
    COMMAND $<TARGET_FILE:whisper-cli>
    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin
    -f ${PROJECT_SOURCE_DIR}/samples/jfk.mp3)
  set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;mp3")