CANN: Support Ascend310P to accelerate F32 and F16 Model (llama/10216)

* CANN Support Ascend310P to accelerate F32 and F16 Model

* Add compile option soc type macro ASCEND_310P to ggml-cann lib

* Remove unused code

* Remove the hard-coded Ascend soc_type compile option from CMakeLists.txt
This commit is contained in:
leo-pony 2024-11-22 14:07:20 +08:00 committed by Georgi Gerganov
parent 2a4b5c9d7e
commit 4b81335f75
7 changed files with 123 additions and 41 deletions

View File

@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif() endif()
# Auto-detect the SoC type and SoC version; if detection fails, the build is aborted.
set(SOC_VERSION "")

# detect_ascend_soc_type(<out-var>)
#
# Runs `npu-smi info`, takes the third whitespace-separated field of the 7th
# output line and stores "Ascend<field>" in the caller's variable whose name is
# passed as the first argument (returned via PARENT_SCOPE).
# Raises FATAL_ERROR when the command reports a non-zero result or yields no text.
#
# Example:
#   detect_ascend_soc_type(SOC_VERSION)
function(detect_ascend_soc_type SOC_VERSION)
    execute_process(
        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
        OUTPUT_VARIABLE npu_info
        RESULT_VARIABLE npu_result
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    # RESULT_VARIABLE is either the exit code or an error description string.
    # Compare it explicitly (and quoted) against "0": an unquoted truthiness
    # test would treat an error string as an undefined variable name (false)
    # and would be a syntax error if the value were empty.
    if("${npu_info}" STREQUAL "" OR NOT "${npu_result}" STREQUAL "0")
        message(FATAL_ERROR "Auto-detect ascend soc type failed, please specify manually or check ascend device working normally.")
    endif()
    # ${SOC_VERSION} expands to the name of the caller's output variable.
    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
endfunction()
# Resolve the SoC type: honour a user-supplied SOC_TYPE, otherwise probe the device.
if(NOT SOC_TYPE)
    detect_ascend_soc_type(SOC_VERSION)
    set(SOC_TYPE "${SOC_VERSION}")
    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
endif()
# SOC_VERSION is the lower-case form of SOC_TYPE, whether SOC_TYPE was given by
# the user or auto-detected, so both paths produce the same spelling.
string(TOLOWER "${SOC_TYPE}" SOC_VERSION)

# Construct the SoC-specific compile option ASCEND_<SoC major SN>, such as
# ASCEND_910B or ASCEND_310P. The option is upper-cased because SOC_VERSION is
# lower case and the sources test the upper-case macro (e.g. #ifdef ASCEND_310P);
# without this a build would define ASCEND_310p instead.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
string(TOUPPER "ASCEND_${SOC_TYPE_MAJOR_SN}" SOC_TYPE_COMPILE_OPTION)
if (CANN_INSTALL_DIR) if (CANN_INSTALL_DIR)
# Only Support Linux. # Only Support Linux.
if (NOT UNIX) if (NOT UNIX)
@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else() else()

View File

@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
{
#ifdef ASCEND_310P
// Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 8) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
aclrtlaunch_ascendc_get_row_f32( aclrtlaunch_ascendc_get_row_f32(
24, ctx.stream(), src0->data, src1->data, dst->data, 24, ctx.stream(), src0->data, src1->data, dst->data,
((ggml_tensor*)src0->extra)->ne, ((ggml_tensor*)src0->extra)->ne,
@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
break; break;
}
case GGML_TYPE_F16: case GGML_TYPE_F16:
{
#ifdef ASCEND_310P
// Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 16) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
aclrtlaunch_ascendc_get_row_f16( aclrtlaunch_ascendc_get_row_f16(
24, ctx.stream(), src0->data, src1->data, dst->data, 24, ctx.stream(), src0->data, src1->data, dst->data,
((ggml_tensor*)src0->extra)->ne, ((ggml_tensor*)src0->extra)->ne,
@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
break; break;
}
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
aclrtlaunch_ascendc_get_row_q4_0( aclrtlaunch_ascendc_get_row_q4_0(
24, ctx.stream(), src0->data, src1->data, dst->data, 24, ctx.stream(), src0->data, src1->data, dst->data,

View File

@ -1,7 +1,3 @@
if (NOT SOC_TYPE)
set (SOC_TYPE "Ascend910B3")
endif()
file(GLOB SRC_FILES file(GLOB SRC_FILES
get_row_f32.cpp get_row_f32.cpp
get_row_f16.cpp get_row_f16.cpp
@ -13,7 +9,6 @@ file(GLOB SRC_FILES
dup.cpp dup.cpp
) )
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES} ${SRC_FILES}
) )
message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

View File

@ -5,6 +5,7 @@
using namespace AscendC; using namespace AscendC;
#define BUFFER_NUM 2 #define BUFFER_NUM 2
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supported by the dup kernel is 65535
template <typename SRC_T, typename DST_T> template <typename SRC_T, typename DST_T>
class DupByRows { class DupByRows {
@ -19,6 +20,7 @@ class DupByRows {
// Input has four dims. // Input has four dims.
int64_t op_block_num = GetBlockNum(); int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx(); int64_t op_block_idx = GetBlockIdx();
assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
// param // param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@ -51,24 +53,36 @@ class DupByRows {
__aicore__ inline void copy_in() { __aicore__ inline void copy_in() {
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>(); LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
const size_t elem_per_block = 32 / sizeof(SRC_T);
DataCopyExtParams dataCopyParams; size_t tail = num_elem % elem_per_block;
dataCopyParams.blockCount = 1; size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
dataCopyParams.blockLen = num_elem * sizeof(SRC_T); DataCopy(src_local, src_gm, cpy_elements_len);
DataCopyPadExtParams<SRC_T> padParams;
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
src_queue.EnQue(src_local); src_queue.EnQue(src_local);
} }
__aicore__ inline void copy_out() { __aicore__ inline void copy_out() {
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>(); LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
#ifdef ASCEND_310P
const size_t elem_per_block = 32 / sizeof(DST_T);
size_t tail = num_elem % elem_per_block;
size_t len = num_elem & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(dst_gm, dst_local, len);
}
if(tail != 0) {
for (size_t i = tail; i < elem_per_block; i++) {
dst_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
SetAtomicNone();
}
#else
DataCopyExtParams dataCopyParams; DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1; dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(DST_T); dataCopyParams.blockLen = num_elem * sizeof(DST_T);
DataCopyPad(dst_gm, dst_local, dataCopyParams); DataCopyPad(dst_gm, dst_local, dataCopyParams);
#endif
dst_queue.FreeTensor(dst_local); dst_queue.FreeTensor(dst_local);
} }

View File

@ -14,7 +14,7 @@ class GET_ROW_F16 {
int64_t *output_ne_ub, size_t *output_nb_ub) { int64_t *output_ne_ub, size_t *output_nb_ub) {
// TODO, use template for F16/f32 // TODO, use template for F16/f32
int64_t op_block_num = GetBlockNum(); int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx(); op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i]; input_ne[i] = input_ne_ub[i];
@ -59,32 +59,42 @@ class GET_ROW_F16 {
} }
__aicore__ inline void copy_in(uint32_t offset, size_t len) { __aicore__ inline void copy_in(uint32_t offset, size_t len) {
size_t origin_len = len;
LocalTensor<half> input_local = input_queue.AllocTensor<half>(); LocalTensor<half> input_local = input_queue.AllocTensor<half>();
size_t tail = len % 32; const size_t elem_per_block = 32 / sizeof(half);
len = len & ~31; size_t tail = len % elem_per_block;
DataCopy(input_local, input_gm[offset], len); len = len & ~(elem_per_block - 1);
if(tail != 0) { if(tail != 0) {
DataCopyExtParams dataCopyParams; len += elem_per_block;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(half);
DataCopyPadExtParams<half> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
} }
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local); input_queue.EnQue(input_local);
} }
__aicore__ inline void copy_out(uint32_t offset, size_t len) { __aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>(); LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32; const size_t elem_per_block = 32 / sizeof(float);
len = len & ~31; size_t tail = len % elem_per_block;
DataCopy(output_gm[offset], output_local, len); len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
if(tail != 0) { if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams; DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1; dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float); dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len], DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams); dataCopyParams);
#endif
} }
output_queue.FreeTensor(output_local); output_queue.FreeTensor(output_local);
} }
@ -150,6 +160,7 @@ class GET_ROW_F16 {
GlobalTensor<float> output_gm; GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue; TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue; TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
}; };
template <typename T> template <typename T>

View File

@ -13,7 +13,7 @@ class GET_ROW_F32 {
int64_t *indices_ne_ub, size_t *indices_nb_ub, int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) { int64_t *output_ne_ub, size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum(); int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx(); op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i]; input_ne[i] = input_ne_ub[i];
@ -55,31 +55,40 @@ class GET_ROW_F32 {
__aicore__ inline void copy_in(uint32_t offset, size_t len) { __aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>(); LocalTensor<float> input_local = input_queue.AllocTensor<float>();
size_t tail = len % 32; const size_t elem_per_block = 32 / sizeof(float);
len = len & ~31; size_t tail = len % elem_per_block;
DataCopy(input_local, input_gm[offset], len); len = len & ~(elem_per_block - 1);
if(tail != 0) { if(tail != 0) {
DataCopyExtParams dataCopyParams; len += elem_per_block;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPadExtParams<float> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
} }
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local); input_queue.EnQue(input_local);
} }
__aicore__ inline void copy_out(uint32_t offset, size_t len) { __aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>(); LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32; const size_t elem_per_block = 32 / sizeof(float);
len = len & ~31; size_t tail = len % elem_per_block;
DataCopy(output_gm[offset], output_local, len); len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
if(tail != 0) { if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams; DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1; dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float); dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len], DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams); dataCopyParams);
#endif
} }
output_queue.FreeTensor(output_local); output_queue.FreeTensor(output_local);
} }
@ -144,6 +153,7 @@ class GET_ROW_F32 {
GlobalTensor<float> output_gm; GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue; TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue; TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
}; };
template <typename T> template <typename T>

View File

@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
LocalTensor<float> output_local = output_queue.AllocTensor<float>(); LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up. // TODO: cast more data to speed up.
#ifdef ASCEND_310P
// TODO: 310P support quantification
#else
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
#endif
// Only mul need compile by group. // Only mul need compile by group.
half scale = scale_gm.GetValue(scale_offset); half scale = scale_gm.GetValue(scale_offset);