From 81c999fe0a25c4ebbfef10ed8a1a96df9cfc10fd Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Fri, 9 Aug 2024 20:21:56 +0800
Subject: [PATCH] cann : add Ascend NPU support (#2336)

* enable Ascend NPU in src/whisper.cpp
* sync test-backend-ops with llama.cpp
---
 ggml/src/ggml-cann/Doxyfile |   4 +-
 src/whisper.cpp             |  22 ++++++-
 tests/test-backend-ops.cpp  | 123 +++++++++++++++++++++++++++++++-----
 3 files changed, 128 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile
index 2b009e8f..5b7595b5 100644
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME = "llama.cpp"
+PROJECT_NAME = "whisper.cpp"
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF = "llama inference engine"
+PROJECT_BRIEF = "Port of OpenAI's Whisper model in C/C++"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 8d836908..0744dba1 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -29,6 +29,10 @@
 #include "openvino/whisper-openvino-encoder.h"
 #endif
 
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -1283,6 +1287,16 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
     }
 #endif
 
+#ifdef GGML_USE_CANN
+    if (params.use_gpu) {
+        WHISPER_LOG_INFO("%s: using CANN backend\n", __func__);
+        result = ggml_backend_cann_init(params.gpu_device);
+        if (!result) {
+            WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__);
+        }
+    }
+#endif
+
     return result;
 }
 
@@ -1335,6 +1349,10 @@ static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_cont
     result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
 #endif
 
+#ifdef GGML_USE_CANN
+    result || (result = ggml_backend_cann_buffer_type(params.gpu_device));
+#endif
+
     result || (result = ggml_backend_cpu_buffer_type());
 
     return result;
@@ -4337,8 +4355,8 @@ const char * whisper_print_system_info(void) {
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | ";
     s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
-    s += "OPENVINO = " + std::to_string(whisper_has_openvino()) ;
-
+    s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
+    s += "CANN = " + std::to_string(ggml_cpu_has_cann()) ;
 
     return s.c_str();
 }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 1ed74e54..2f4117a6 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1,7 +1,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                 im = nullptr;
             }
         }
+
        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
+        // TODO: other cases
+        //#pragma omp parallel for
+        //for (int i = 0; i < tensor->ne[1]; i++) {
+        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+        //        i * tensor->ne[0], 1, tensor->ne[0], im);
+        //}
+
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
        // This is going to create some weird integers though.
        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
    } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
    }
 }
@@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     tt.to_float(&buf[i], vq.data(), bs);
                     tv.insert(tv.end(), vq.begin(), vq.end());
                 } else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
             }
         }
@@ -760,7 +767,7 @@ struct test_dup : public test_case {
     }
 
     test_dup(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne = {10, 10, 20, 1},
             std::array<int64_t, 4> permute = {0, 0, 0, 0})
         : type(type), ne(ne), permute(permute),
             _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
@@ -780,9 +787,11 @@ struct test_cpy : public test_case {
     const ggml_type type_src;
     const ggml_type type_dst;
     const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> permute;
+    bool _src_use_permute;
 
     std::string vars() override {
-        return VARS_TO_STR3(type_src, type_dst, ne);
+        return VARS_TO_STR4(type_src, type_dst, ne, permute);
     }
 
     double max_nmse_err() override {
@@ -794,12 +803,17 @@
     }
 
     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type_src(type_src), type_dst(type_dst), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> permute = {0, 0, 0, 0})
+        : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
+            _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
+        if (_src_use_permute) {
+            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+        }
+        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
         ggml_tensor * out = ggml_cpy(ctx, src, dst);
         return out;
     }
@@ -1175,6 +1189,7 @@ struct test_soft_max : public test_case {
     }
 };
 
+
 // GGML_OP_ROPE
 struct test_rope : public test_case {
     const ggml_type type;
@@ -1267,6 +1282,32 @@ struct test_pool2d : public test_case {
     }
 };
 
+// GGML_OP_CONV_TRANSPOSE_1D
+struct test_conv_transpose_1d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+
+    const int s0; // stride
+    const int p0; // padding
+    const int d0; // dilation
+
+    std::string vars() override {
+        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
+    }
+
+    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
+                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
+                           int s0 = 1, int p0 = 0, int d0 = 1)
+        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
+        ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
+        return out;
+    }
+};
+
 // GGML_OP_IM2COL
 struct test_im2col : public test_case {
     const ggml_type type_input;
@@ -1280,7 +1321,7 @@ struct test_im2col : public test_case {
     // padding
     const int p0;
     const int p1;
-    // dilatation
+    // dilation
     const int d0;
     const int d1;
     // mode
@@ -1393,7 +1434,7 @@ struct test_argsort : public test_case {
                 ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
             }
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 }
@@ -1470,6 +1511,7 @@ struct test_group_norm : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const int32_t num_groups;
+    const float eps;
 
     std::string vars() override {
         return VARS_TO_STR3(type, ne, num_groups);
@@ -1477,12 +1519,13 @@
     }
 
     test_group_norm(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {64, 64, 320, 1},
-            int32_t num_groups = 32)
-        : type(type), ne(ne), num_groups(num_groups) {}
+            int32_t num_groups = 32,
+            float eps = 1e-6f)
+        : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
+        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
         return out;
     }
 };
@@ -2053,6 +2096,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
         GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
         GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+        GGML_TYPE_BF16,
     };
 
     // unary ops
@@ -2097,6 +2141,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
 
+    // test cases for 1D im2col
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+
+    test_cases.emplace_back(new test_conv_transpose_1d());
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
+
     test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
     test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
@@ -2110,12 +2167,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
     test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
     test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
     test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
 
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : all_types) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+        }
+    }
+    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
         }
     }
 
@@ -2165,6 +2232,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
     }
 
+#if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
@@ -2184,10 +2252,31 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
         }
     }
+#else
+    // m = a rows
+    // n = b rows
+    // k = cols
+    std::uniform_int_distribution<> dist_m(1, 128);
+    std::uniform_int_distribution<> dist_n(16, 128);
+    std::uniform_int_distribution<> dist_k(1, 16);
+    for (int i = 0; i < 1000; i++) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                int m = dist_m(rng);
+                int n = dist_n(rng);
+                int k = dist_k(rng) * ggml_blck_size(type_a);
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
+            }
+        }
+    }
+#endif
 
     for (ggml_type type_a : other_types) {
         for (ggml_type type_b : {GGML_TYPE_F32}) {
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
+            if (ggml_blck_size(type_a) != 256) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
+            }
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
         }
     }
 
@@ -2247,7 +2336,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         for (int n = 0; n < 10; ++n) {
             int64_t ne0 = dist_ne0(rng);
             int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
+            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
         }
 
         exponent <<= 1;
@@ -2266,7 +2355,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             }
         }
     }
-
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
@@ -2380,7 +2469,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         return true;
     }
 
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
     return false;
 }
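
Usage note: the sketch below shows how the new CANN path is reached through the
public whisper API. It is a minimal example, not part of the patch: it assumes
whisper.cpp was built with the CANN backend enabled (e.g. cmake -B build
-DGGML_CANN=1 on a machine with the Ascend CANN toolkit installed), and the
model path is a placeholder.

    #include <stdio.h>
    #include "whisper.h"

    int main(void) {
        struct whisper_context_params cparams = whisper_context_default_params();
        cparams.use_gpu    = true; // in a GGML_USE_CANN build this reaches ggml_backend_cann_init()
        cparams.gpu_device = 0;    // Ascend NPU device index

        // placeholder model path - any ggml whisper model works
        struct whisper_context * ctx =
            whisper_init_from_file_with_params("models/ggml-base.en.bin", cparams);
        if (ctx == NULL) {
            fprintf(stderr, "failed to init whisper context\n");
            return 1;
        }

        // with this patch the feature list also reports CANN
        printf("%s\n", whisper_print_system_info());

        whisper_free(ctx);
        return 0;
    }

The synced test harness can then be pointed at a single backend and operator to
check coverage, e.g. "test-backend-ops test -b CANN0 -o CONV_TRANSPOSE_1D"
(the exact backend name depends on the build and device).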