From d7a9346ab1bb6de2fb5235c48aae0829b2739210 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 2 Apr 2025 15:22:13 +0800 Subject: [PATCH] get_rows and dup optimization (llama/12671) * [CANN]get_rows and dup optimization. Co-authored-by: hipudding Signed-off-by: noemotiovon * [CANN]GET_ROWS and CPY/DUP optimization Co-authored-by: hipudding Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon --------- Signed-off-by: noemotiovon Co-authored-by: noemotiovon Co-authored-by: hipudding --- ggml/src/ggml-cann/CMakeLists.txt | 2 - ggml/src/ggml-cann/aclnn_ops.cpp | 479 +++++++++++++++--------------- ggml/src/ggml-cann/ggml-cann.cpp | 26 +- 3 files changed, 262 insertions(+), 245 deletions(-) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 05cf06bf..0d8e483b 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR) ${CANN_INSTALL_DIR}/acllib/include ) - add_subdirectory(kernels) list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler - ascendc_kernels ) file(GLOB GGML_SOURCES_CANN "*.cpp") diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 6bb5d083..8482bb53 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,6 @@ #include #include "ggml-impl.h" -#include "kernels/ascendc_kernels.h" #define GGML_COMMON_DECL_C @@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclDestroyIntArray(repeats)); } +/** + * @brief Casts the elements of a tensor to a specified data type using the CANN backend. + * + * @details This function performs a type conversion on the elements of the input tensor `acl_src` + * and stores the results in the destination tensor `acl_dst`. The conversion type is + * determined based on the `dst` tensor's data type. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be cast. + * @param acl_dst The destination tensor that will store the casted elements. + * @param dst The ggml tensor specifying the target data type. + */ +static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, + ggml_cann_type_mapping(dst->type), + acl_dst, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; GGML_ASSERT(ggml_can_repeat(src, dst)); @@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, } void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; + ggml_tensor* src0 = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_src = ggml_cann_create_tensor(src0); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - - ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - src->extra = src_extra_allocator.get(); - dst->extra = dst_extra_allocator.get(); - ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - - if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && - ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - // TODO: simplify - if (src->type == GGML_TYPE_F16) { - if (dst->type == GGML_TYPE_Q8_0) { - aclrtlaunch_ascendc_quantize_f16_q8_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_Q4_0) { - aclrtlaunch_ascendc_quantize_f16_to_q4_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - - aclrtlaunch_ascendc_dup_by_rows_fp16( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - GGML_ABORT("fatal error"); - } - if (dst->type == GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - GGML_ABORT("fatal error"); - } - // TODO - GGML_ABORT("fatal error"); - } else if (src->type == GGML_TYPE_F32) { - // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size - // && nb0 == type_size) - if (dst->type == GGML_TYPE_Q8_0) { - aclrtlaunch_ascendc_quantize_f32_q8_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_Q4_0) { - aclrtlaunch_ascendc_quantize_f32_to_q4_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp32( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } else { - // TODO: dst not contiguous - GGML_ABORT("fatal error"); - } - } - if (dst->type == GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - } - // TODO - GGML_ABORT("fatal error"); - } else { - if (ggml_are_same_shape(src, dst)) { + if (ggml_are_same_shape(src0, dst)) { + if (dst->type == src0->type) { cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; + } else { + aclnn_cast(ctx, acl_src, acl_dst, dst); + } + } else { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + if (dst->type == src0->type) { + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync( + dst->data, cpy_size, src0->data, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + return; + } else { + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), + ggml_nelements(dst) * ggml_type_size(dst->type)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), src0->ne, src_trans_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src, src_trans_tensor, dst); + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync( + dst->data, cpy_size, src_trans_buffer, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); + return; + } + } else if (ggml_is_contiguous(dst)) { + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), src0->ne, src_trans_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src, src_trans_tensor, dst); + + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer, + cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, + ctx.stream())); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); + return; + } else { + GGML_ABORT("Unsupport dst is not tontiguous."); } - GGML_ABORT("fatal error"); } + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); } #ifdef __cplusplus @@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(tmp_mask_tensor)); } -void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; - ggml_tensor* src1 = dst->src[1]; +/** + * @brief Performs embedding operation on a 4D tensor using the CANN backend. + * + * This function extracts slices from the source tensor (`src_buffer`), + * index tensor (`index`), and destination tensor (`dst`), and performs an + * embedding operation on them. The embedding operation is applied by iterating + * over the last two dimensions of the source tensor, creating the necessary + * tensors for the source, index, and output, and executing the embedding operation. + * + * @param ctx The context for CANN backend operations. + * @param src_buffer The source buffer holding the data for the source tensor. + * @param src_ne The dimensions of the source tensor. + * @param src_nb The strides (byte offsets) of the source tensor. + * @param index The index tensor used in the embedding operation. + * @param dst The destination tensor where the result will be stored. + */ +static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer, + int64_t* src_ne, size_t* src_nb, ggml_tensor* index, + ggml_tensor* dst) { + for (int64_t i = 0; i < src_ne[3]; i++) { + for (int64_t j = 0; j < src_ne[2]; j++) { + // src + int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]}; + size_t acl_src_nb[2] = {src_nb[0], src_nb[1]}; + aclTensor* acl_src_tensor = ggml_cann_create_tensor( + (char*)src_buffer + i * src_nb[3] + j * src_nb[2], + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), + acl_src_ne, acl_src_nb, 2); - ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - src0->extra = src0_extra_allocator.get(); - src1->extra = src1_extra_allocator.get(); - dst->extra = dst_extra_allocator.get(); - ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); + // index + int64_t acl_index_ne[1] = {index->ne[0]}; + size_t acl_index_nb[1] = {index->nb[0]}; + aclTensor* acl_index = ggml_cann_create_tensor( + (char*)index->data + i * index->nb[2] + j * index->nb[1], + ggml_cann_type_mapping(index->type), ggml_element_size(index), + acl_index_ne, acl_index_nb, 1); + + // out + int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]}; + size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]}; + aclTensor* acl_out = ggml_cann_create_tensor( + (char*)dst->data + i * dst->nb[3] + j * dst->nb[2], + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), + acl_out_ne, acl_out_nb, 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnEmbeddingGetWorkspaceSize( + acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), + workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor, + ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src_tensor)); + ACL_CHECK(aclDestroyTensor(acl_index)); + ACL_CHECK(aclDestroyTensor(acl_out)); + } + } +} + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // src + ggml_tensor* src1 = dst->src[1]; // index switch (src0->type) { case GGML_TYPE_F32: { -#ifdef ASCEND_310P - // Special operation for get_row_f32 kernel of 310P: clear the - // content of dest data buffer when row is not aligned to 32 bytes - if ((src0->ne[0] % 8) != 0) { - size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * - src0->ne[0] * ggml_type_size(GGML_TYPE_F32); - ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); - } -#endif - aclrtlaunch_ascendc_get_row_f32( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src0->extra)->nb, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1, + dst); break; } case GGML_TYPE_F16: { -#ifdef ASCEND_310P - // Special operation for get_row_f16 kernel of 310P: clear the - // content of dest data buffer when row is not aligned to 32 bytes - if ((src0->ne[0] % 16) != 0) { - size_t dst_len = - src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * - ggml_type_size( - GGML_TYPE_F32); // out is also f32, even input is f16 - ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + aclTensor* acl_src0 = ggml_cann_create_tensor(src0); + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; } -#endif - aclrtlaunch_ascendc_get_row_f16( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src0->extra)->nb, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type), + src0->ne, src_trans_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src0, src_trans_tensor, dst); + aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne, + src_trans_nb, src1, dst); + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); break; } - case GGML_TYPE_Q4_0: - aclrtlaunch_ascendc_get_row_q4_0( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - break; - case GGML_TYPE_Q8_0: - aclrtlaunch_ascendc_get_row_q8_0( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + case GGML_TYPE_Q8_0: { + // add 1 dim for bcast mul. + size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], + dequant_nb[GGML_MAX_DIMS + 1]; + int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], + *dequant_ne; + int64_t scale_offset = 0; + + // [3,4,5,64] -> [3,4,5,2,32] + weight_ne[0] = QK8_0; + weight_ne[1] = src0->ne[0] / QK8_0; + weight_nb[0] = sizeof(int8_t); + weight_nb[1] = weight_nb[0] * weight_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + weight_ne[i] = src0->ne[i - 1]; + weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1]; + } + + // [3,4,5,64] -> [3,4,5,2,1] + scale_ne[0] = 1; + scale_ne[1] = src0->ne[0] / QK8_0; + scale_nb[0] = sizeof(uint16_t); + scale_nb[1] = scale_nb[0] * scale_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + scale_ne[i] = src0->ne[i - 1]; + scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1]; + } + + // [3,4,5,64] -> [3,4,5,2,32] + dequant_ne = weight_ne; + dequant_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS + 1; i++) { + dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1]; + } + + scale_offset = ggml_nelements(src0) * sizeof(int8_t); + ggml_cann_pool_alloc dequant_buffer_allocator( + ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); + + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( + src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, + GGML_MAX_DIMS + 1); + aclTensor* acl_scale_tensor = ggml_cann_create_tensor( + src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb, + GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); + aclTensor* dequant_tensor = ggml_cann_create_tensor( + dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t), + dequant_ne, dequant_nb, GGML_MAX_DIMS + 1); + + aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); + dequant_nb[0] = sizeof(float_t); + dequant_ne = src0->ne; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1]; + } + + aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(), + dequant_ne, dequant_nb, src1, dst); + + ACL_CHECK(aclDestroyTensor(dequant_tensor)); break; + } default: - GGML_ABORT("fatal error"); + GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS"); break; } } @@ -2797,8 +2812,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor, - &workspaceSize, &executor)); + nullptr, nullptr, nullptr, antiquantGroupSize, + acl_output_tensor, &workspaceSize, &executor)); if (workspaceAddr == nullptr) { workspaceAddr = workspace_allocator.alloc(workspaceSize); } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 68cd9920..da75f77f 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1704,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, switch (op->src[0]->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; default: @@ -1712,16 +1711,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } } break; case GGML_OP_CPY: { - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - return true; - default: - return false; + ggml_tensor *src = op->src[0]; + if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) || + (src->type != GGML_TYPE_F32 && + src->type != GGML_TYPE_F16)) { + // only support F32 and F16. + return false; } - } + + if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) { + // unsupport dst is not contiguous. + return false; + } + + return true; + } break; case GGML_OP_CONT: { // TODO: support GGML_TYPE_BF16 switch (op->src[0]->type) { @@ -1762,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } return true; } + case GGML_OP_DUP: case GGML_OP_IM2COL: case GGML_OP_CONCAT: - case GGML_OP_DUP: case GGML_OP_REPEAT: case GGML_OP_NONE: case GGML_OP_RESHAPE: