From d61d55cd4b9fe77511c8eea28d0220ce552f7008 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 7 Jan 2023 16:11:41 +0200
Subject: [PATCH] ggml : speed-up soft max via Accelerate + unroll

---
 ggml.c      | 166 +++++++++++++++++++++++++++++++++++-----------------
 whisper.cpp |   2 +-
 2 files changed, 114 insertions(+), 54 deletions(-)

diff --git a/ggml.c b/ggml.c
index 38e79e76..cfc63704 100644
--- a/ggml.c
+++ b/ggml.c
@@ -81,6 +81,7 @@ typedef void* thread_ret_t;
 
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
+#define GGML_SOFT_MAX_UNROLL 4
 
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
@@ -310,6 +311,7 @@ int64_t ggml_cycles_per_ms(void) {
     return CLOCKS_PER_SEC/1000;
 }
 
+//#define GGML_PERF
 #ifdef GGML_PERF
 #define ggml_perf_time_ms() ggml_time_ms()
 #define ggml_perf_time_us() ggml_time_us()
@@ -1316,25 +1318,25 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
 
-bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_is_vector(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1343,7 +1345,7 @@ bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor *
         (t0->ne[3] == t1->ne[3]);
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1353,7 +1355,7 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1362,7 +1364,7 @@ bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1373,7 +1375,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
 }
 
 // check if t1 can be represented as a repeatition of t0
-bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1383,14 +1385,20 @@ bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t
         (t1->ne[3]%t0->ne[3] == 0);
 }
 
-int ggml_up32(int n) {
+static inline int ggml_up32(int n) {
     return (n + 31) & ~31;
 }
 
-int ggml_up64(int n) {
+static inline int ggml_up64(int n) {
     return (n + 63) & ~63;
 }
 
+static inline int ggml_up(int n, int m) {
+    // assert m is a power of 2
+    GGML_ASSERT((m & (m - 1)) == 0);
+    return (n + m - 1) & ~(m - 1);
+}
+
 // assert that pointer is aligned to GGML_MEM_ALIGN
 #define ggml_assert_aligned(ptr) \
     assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
@@ -5094,21 +5102,19 @@ static void ggml_compute_forward_soft_max_f32(
 #endif
 
         float max = -INFINITY;
-        for (int i = 0; i < nc; i++) {
-            max = MAX(max, p[i]);
-        }
+        ggml_vec_max_f32(nc, &max, p);
 
         ggml_float sum = 0.0;
 
-        uint16_t ss;
+        uint16_t scvt;
         for (int i = 0; i < nc; i++) {
             if (p[i] == -INFINITY) {
-                p[i] = 0.0;
+                p[i] = 0.0f;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
                 ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
-                memcpy(&ss, &s, sizeof(ss));
-                const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
+                memcpy(&scvt, &s, sizeof(scvt));
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                 sum += val;
                 p[i] = val;
             }
         }
@@ -5820,6 +5826,8 @@ static void ggml_compute_forward_flash_attn_f32(
     const int P = nek1 - N;
     const int M = P + N;
 
+    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne1 == N);
     GGML_ASSERT(P >= 0);
@@ -5872,7 +5880,11 @@ static void ggml_compute_forward_flash_attn_f32(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float * S = (float *) params->wdata + ith*(M + CACHE_LINE_SIZE_F32);
+        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
+
+        for (int i = M; i < Mup; ++i) {
+            S[i] = -INFINITY;
+        }
 
         for (int ic = 0; ic < nek1; ++ic) {
             // k indices
@@ -5903,30 +5915,50 @@ static void ggml_compute_forward_flash_attn_f32(
         // softmax
         {
             float max = -INFINITY;
-            for (int i = 0; i < M; i++) {
-                max = MAX(max, S[i]);
-            }
+            ggml_vec_max_f32(M, &max, S);
 
-            ggml_float sum = 0.0;
+            float sum = 0.0f;
+            {
+#ifndef GGML_USE_ACCELERATE
+                uint16_t scvt[GGML_SOFT_MAX_UNROLL];
+                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
-            uint16_t ss;
-            for (int i = 0; i < M; i++) {
-                if (S[i] == -INFINITY) {
-                    S[i] = 0.0;
-                } else {
-                    //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
-                    memcpy(&ss, &s, sizeof(ss));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
-                    sum += val;
-                    S[i] = val;
+                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
+                    float * SS = S + i;
+
+                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
+                        if (SS[j] == -INFINITY) {
+                            SS[j] = 0.0f;
+                        } else {
+                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
+                            memcpy(&scvt[j], &s, sizeof(uint16_t));
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            sump[j] += val;
+                            SS[j] = val;
+                        }
+                    }
                 }
+
+                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
+                    sum += sump[i];
+                }
+#else
+                vvexpf(S, S, &Mup);
+                ggml_vec_sum_f32(Mup, &sum, S);
+#endif
             }
 
             assert(sum > 0.0f);
 
             sum = 1.0/sum;
             ggml_vec_scale_f32(M, S, sum);
+
+#ifndef NDEBUG
+            for (int i = 0; i < M; ++i) {
+                assert(!isnan(S[i]));
+                assert(!isinf(S[i]));
+            }
+#endif
         }
 
         for (int ic = 0; ic < nev1; ++ic) {
@@ -6001,6 +6033,8 @@ static void ggml_compute_forward_flash_attn_f16(
     const int P = nek1 - N;
     const int M = P + N;
 
+    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne1 == N);
     GGML_ASSERT(P >= 0);
@@ -6053,7 +6087,11 @@ static void ggml_compute_forward_flash_attn_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
+        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
+
+        for (int i = M; i < Mup; ++i) {
+            S[i] = -INFINITY;
+        }
 
         for (int ic = 0; ic < nek1; ++ic) {
             // k indices
@@ -6084,30 +6122,50 @@ static void ggml_compute_forward_flash_attn_f16(
         // softmax
         {
             float max = -INFINITY;
-            for (int i = 0; i < M; i++) {
-                max = MAX(max, S[i]);
-            }
+            ggml_vec_max_f32(M, &max, S);
 
-            ggml_float sum = 0.0;
+            float sum = 0.0f;
+            {
+#ifndef GGML_USE_ACCELERATE
+                uint16_t scvt[GGML_SOFT_MAX_UNROLL];
+                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
-            uint16_t ss;
-            for (int i = 0; i < M; i++) {
-                if (S[i] == -INFINITY) {
-                    S[i] = 0.0;
-                } else {
-                    //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
-                    memcpy(&ss, &s, sizeof(ss));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
-                    sum += val;
-                    S[i] = val;
+                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
+                    float * SS = S + i;
+
+                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
+                        if (SS[j] == -INFINITY) {
+                            SS[j] = 0.0f;
+                        } else {
+                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
+                            memcpy(&scvt[j], &s, sizeof(uint16_t));
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            sump[j] += val;
+                            SS[j] = val;
+                        }
+                    }
                 }
+
+                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
+                    sum += sump[i];
+                }
+#else
+                vvexpf(S, S, &Mup);
+                ggml_vec_sum_f32(Mup, &sum, S);
+#endif
            }
 
             assert(sum > 0.0f);
 
             sum = 1.0/sum;
             ggml_vec_scale_f32(M, S, sum);
+
+#ifndef NDEBUG
+            for (int i = 0; i < M; ++i) {
+                assert(!isnan(S[i]));
+                assert(!isinf(S[i]));
+            }
+#endif
         }
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
@@ -7188,14 +7246,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
+                        const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
+
                         if (node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
+                            cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
                         }
 
                         if (node->src1->type == GGML_TYPE_F16) {
-                            cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
+                            cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
                         }
 
                         work_size = MAX(work_size, cur);
diff --git a/whisper.cpp b/whisper.cpp
index e8d9f0c9..8de4a157 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -131,7 +131,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "su", { 98, "sundanese", } },
 };
 
-static const size_t MB = 1024*1024;
+static const size_t MB = 3*1024*1024;
 
 static const std::map<e_model, size_t> MEM_REQ_MODEL = {
     { MODEL_TINY, 74ull*MB },