diff --git a/Makefile b/Makefile
index 605d8d48..7aaade97 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ ifndef NVCC_VERSION
 	endif
 endif
 
-CCV := $(shell $(CC) --version | head -n 1)
+CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
 
 # Mac OS + Arm can report x86_64
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 856a4cdb..78977a64 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -6,6 +6,26 @@
 #include <stdlib.h>
 #include <string.h>
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/types.h>
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,15 +119,28 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -131,14 +164,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-            return;
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -173,17 +206,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }
 
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@@ -277,17 +310,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// address and size of the buffer when measuring
-// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
-static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
-static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data          = */ MEASURE_BASE_ADDR,
-        /*.size          = */ MEASURE_MAX_SIZE,
+        /*.data          = */ base_addr,
+        /*.size          = */ size,
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -307,6 +391,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -316,11 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
-           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;
@@ -336,28 +418,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -365,7 +425,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -375,10 +434,8 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
        case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
-        case GGML_OP_ADD_REL_POS:
             return true;
 
         default:
@@ -390,24 +447,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            size_t offset;
-            switch(node->op) {
-                case GGML_OP_VIEW:
-                    memcpy(&offset, node->op_params, sizeof(size_t));
-                    node->data = (char *) node->src[0]->data + offset;
-                    break;
-                case GGML_OP_PERMUTE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_TRANSPOSE:
-                    node->data = node->src[0]->data;
-                    break;
-                case GGML_OP_CPY:
-                    node->data = node->src[1]->data;
-                    break;
-                default:
-                    GGML_ASSERT(!"unknown view op");
-                    break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -418,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 }
 
                 // if the node's data is external, then we cannot re-use it
-                if ((char *) parent->data < (char *) alloc->data ||
-                    (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                if (ggml_allocr_is_own(alloc, parent) == false) {
                     AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                     continue;
                 }
@@ -427,7 +467,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 struct hash_node * p_hn = hash_get(ht, parent);
                 if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                     if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
+                        struct ggml_tensor * view_src = parent->view_src;
                         struct hash_node * view_src_hn = hash_get(ht, view_src);
                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                             // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -453,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocator_alloc_graph_tensors_n(
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -469,7 +509,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             struct ggml_tensor * node = gf->nodes[i];
 
             if (ggml_is_view(node)) {
-                struct ggml_tensor * view_src = get_view_source(node);
+                struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
             }
 
@@ -531,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 AT_PRINTF("\n");
             }
 
-
             // update parents
             // update immediately if there is no parse_seq
             // update only at barriers if there is parse_seq
-            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+            if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
                 int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
                 int update_end   = alloc->parse_seq_len ? ind : ind + 1;
                 for (int i = update_start; i < update_end; i++) {
@@ -554,17 +593,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
                     if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                         if (ggml_is_view(parent)) {
-                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct ggml_tensor * view_src = parent->view_src;
                             struct hash_node * view_src_hn = hash_get(ht, view_src);
                             view_src_hn->n_views -= 1;
                             AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                             if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                                ggml_allocator_free_tensor(alloc, view_src);
+                                ggml_allocr_free_tensor(alloc, view_src);
                             }
                         }
                         else {
                             if (parent->data != node->data) {
-                                ggml_allocator_free_tensor(alloc, parent);
+                                ggml_allocr_free_tensor(alloc, parent);
                             }
                         }
                     }
@@ -581,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             for (int i = 0; outputs[g][i] != NULL; i++) {
                 struct ggml_tensor * output = outputs[g][i];
                 AT_PRINTF("output: %s\n", output->name);
-                ggml_allocator_free_tensor(alloc, output);
+                ggml_allocr_free_tensor(alloc, output);
             }
         }
     }
@@ -590,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
diff --git a/ggml.c b/ggml.c
index 3f72379c..f8cd58df 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18337,10 +18337,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
+        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                 i,
                 node->ne[0], node->ne[1],
-                ggml_op_name(node->op));
+                ggml_op_name(node->op),
+                ggml_get_name(node));
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
diff --git a/whisper.cpp b/whisper.cpp
index 6ceea676..52cc1c97 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1410,8 +1410,6 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
     const int n_mels = hparams.n_mels;
 
-    assert(mel_inp.n_mel == n_mels);
-
     struct ggml_init_params params = {
         /*.mem_size   =*/ wstate.buf_compute.size(),
         /*.mem_buffer =*/ wstate.buf_compute.data(),
@@ -1429,6 +1427,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
     assert(mel->type == GGML_TYPE_F32);
     if (!ggml_allocr_is_measure(alloc)) {
+        assert(mel_inp.n_mel == n_mels);
+
         float * dst = (float *) mel->data;
         memset(dst, 0, ggml_nbytes(mel));
 
@@ -1442,6 +1442,15 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
         }
     }
 
+    ggml_build_forward_expand(gf, mel);
+
+    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(alloc, KQscale);
+
+    if (!ggml_allocr_is_measure(alloc)) {
+        ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head));
+    }
+
     struct ggml_tensor * cur;
 
 #ifndef WHISPER_USE_COREML
@@ -1533,14 +1542,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         Qcur),
                     Qcur);
 
-            //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                     layer.attn_v_w,
@@ -1597,13 +1606,9 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-                        );
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
 
             struct ggml_tensor * V =
                 ggml_cpy(ctx0,
@@ -1698,28 +1703,34 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         cur),
                     ggml_repeat(ctx0, model.e_ln_b, cur));
         }
-
-        ggml_build_forward_expand (gf, cur);
     }
 #ifdef WHISPER_USE_COREML
     else if (use_coreml) {
         cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+        ggml_allocr_alloc(alloc, cur);
 
-        whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+        if (!ggml_allocr_is_measure(alloc)) {
+            whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+        }
     }
 #endif
 #ifdef WHISPER_USE_OPENVINO
     else if (use_openvino) {
         cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+        ggml_allocr_alloc(alloc, cur);
 
-        if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
-            return false;
+        if (!ggml_allocr_is_measure(alloc)) {
+            whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
         }
     }
 #endif
 
+    ggml_build_forward_expand(gf, cur);
+    wstate.embd_enc = cur;
+    //ggml_graph_print(gf);
+
     ////////////////////////////////////////////////////////////////////////////
 
     //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
@@ -1755,14 +1766,16 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    //ggml_allocr * alloc = wstate.alloc_encode_post;
+    ggml_allocr * alloc = wstate.alloc_encode_post;
 
-    struct ggml_tensor * cur = wstate.embd_enc;
+    struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
 
-    // TODO: hack to disconnect the encoded features from the previous graph
-    cur->op = GGML_OP_NONE;
-    cur->src[0] = nullptr;
-    cur->src[1] = nullptr;
+    struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(alloc, Kscale);
+
+    if (!ggml_allocr_is_measure(alloc)) {
+        ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25));
+    }
 
     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
@@ -1771,7 +1784,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
                 layer.cross_attn_k_w,
                 cur);
 
-        Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+        Kcross = ggml_scale(ctx0, Kcross, Kscale);
 
         struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
                 layer.cross_attn_v_w,
@@ -1794,6 +1807,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
         ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v));
     }
 
+    //ggml_graph_print(gf);
+
     ggml_free(ctx0);
 
     return gf;
@@ -1826,7 +1841,26 @@ static bool whisper_encode_internal(
 
         ggml_allocr_alloc_graph(alloc, gf);
 
+#ifdef WHISPER_USE_COREML
+#else
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
+#endif
+
+        //auto cur = wstate.embd_enc;
+        ////auto cur = gf->leafs[0];
+        //printf("cur name = '%s'\n", cur->name);
+
+        //float * res = (float *) cur->data;
+        //for (int i = 0; i < 10; ++i) {
+        //    printf("%f ", res[i]);
+        //}
+        //printf("\n");
+        //double sum = 0.0;
+        //for (int i = 0; i < ggml_nelements(cur); ++i) {
+        //    sum += res[i];
+        //}
+        //printf("sum: %f\n", sum);
+        //printf("n: %d\n", ggml_nelements(cur));
     }
 
     // encoder_post
@@ -1840,6 +1874,21 @@ static bool whisper_encode_internal(
         ggml_allocr_alloc_graph(alloc, gf);
 
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
+
+        //auto cur = gf->nodes[gf->n_nodes - 1];
+        //printf("cur name = '%s'\n", cur->name);
+
+        //ggml_fp16_t * res = (ggml_fp16_t *) cur->data;
+        //for (int i = 0; i < 10; ++i) {
+        //    printf("%f ", ggml_fp32_to_fp16(res[i]));
+        //}
+        //printf("\n");
+        //double sum = 0.0;
+        //for (int i = 0; i < ggml_nelements(cur); ++i) {
+        //    sum += ggml_fp32_to_fp16(res[i]);
+        //}
+        //printf("sum: %f\n", sum);
+        //printf("n: %d\n", ggml_nelements(cur));
     }
 
     // ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
@@ -1902,6 +1951,13 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         }
     }
 
+    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(alloc, KQscale);
+
+    if (!ggml_allocr_is_measure(alloc)) {
+        ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25));
+    }
+
     // token encoding + position encoding
     struct ggml_tensor * cur =
         ggml_add(ctx0,
@@ -1937,14 +1993,14 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale(ctx0, Qcur, KQscale);
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = ggml_scale(ctx0, Kcur, KQscale);
 
             // store key and value to memory
             {
@@ -1988,15 +2044,11 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
-            //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-            //            );
+            //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
 
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -2052,7 +2104,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale(ctx0, Qcur, KQscale);
 
             // Kcross is already scaled
             struct ggml_tensor * Kcross =
@@ -2092,15 +2144,15 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
+            //    ggml_scale(ctx0,
             //            KQ,
             //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );
 
             // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
@@ -2225,7 +2277,7 @@ static bool whisper_decode_internal(
 
     // decoder
     {
-        auto & alloc = wstate.alloc_encode;
+        auto & alloc = wstate.alloc_decode;
 
         ggml_allocr_reset(alloc);
 
@@ -2758,8 +2810,9 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     {
         const auto & hparams = ctx->model.hparams;
 
-        const int n_tokens = hparams.n_text_ctx/2;
-        const int n_past   = hparams.n_text_ctx/2; // TODO: double-check
+        // TODO: make sure this is the worst-case scenario
+        const int n_tokens = hparams.n_text_ctx;
+        const int n_past   = 0;
 
         ggml_cgraph * gf = whisper_build_graph_decoder(*ctx, *state, state->decoders[0], NULL, n_tokens, n_past);
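
Note: the heart of this patch is the "measure, then allocate" flow that the new ggml_allocr API enables: a measure allocator walks the graph over reserved-but-uncommitted virtual memory to find the worst-case buffer size, and a real allocator of exactly that size is then reset and refilled before every evaluation. The following is a minimal, illustrative sketch of that pattern, not part of the patch; the names build_graph_fn, g_buf_compute_alloc, and measure_then_allocate are hypothetical, error handling is omitted, and only the ggml_allocr calls shown in the diff above are assumed to exist.

// Illustrative sketch only -- assumes a user-provided graph-building callback.
#include "ggml.h"
#include "ggml-alloc.h"

#include <cstdint>
#include <vector>

// hypothetical callback: builds the compute graph and registers its input
// tensors with the allocator (whisper_build_graph_encoder/decoder play this role)
typedef ggml_cgraph * (*build_graph_fn)(ggml_allocr * alloc);

static std::vector<uint8_t> g_buf_compute_alloc; // illustrative backing buffer

static size_t measure_then_allocate(build_graph_fn build_graph, size_t alignment) {
    // pass 1: measure -- the allocator only reserves uncommitted virtual memory,
    // so the graph can be laid out without committing the worst-case buffer
    ggml_allocr * alloc_measure = ggml_allocr_new_measure(alignment);
    const size_t size = ggml_allocr_alloc_graph(alloc_measure, build_graph(alloc_measure)) + alignment;
    ggml_allocr_free(alloc_measure); // releases the reserved address range

    // pass 2: a real allocator over a buffer of exactly the measured size
    g_buf_compute_alloc.resize(size);
    ggml_allocr * alloc = ggml_allocr_new(g_buf_compute_alloc.data(), g_buf_compute_alloc.size(), alignment);

    // per evaluation: reset, rebuild the graph, allocate its tensors, then compute
    ggml_allocr_reset(alloc);
    ggml_cgraph * gf = build_graph(alloc);
    ggml_allocr_alloc_graph(alloc, gf);
    // ... ggml_graph_compute_helper(work_buffer, gf, n_threads) would run the graph here ...

    ggml_allocr_free(alloc);
    return size;
}

This mirrors what the patch does in whisper_init_state (a worst-case measure graph per encoder/decoder) and in whisper_encode_internal / whisper_decode_internal (reset the per-graph allocator, rebuild, allocate, compute), which is also why the scale factors become graph tensors (KQscale, Kscale) set only when the allocator is not in measure mode.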