whisper : CoreML support ggml-alloc

Georgi Gerganov 2023-09-10 21:55:20 +03:00
parent af6f67b251
commit fa672b46e6
4 changed files with 208 additions and 115 deletions

ggml-alloc.c

@ -6,6 +6,26 @@
#include <stdlib.h>
#include <string.h>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <memoryapi.h>
#endif
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@ -99,15 +119,28 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
}
#endif
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(alloc);
}
// check if a tensor is allocated by this buffer
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
void * ptr = tensor->data;
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}
static bool ggml_is_view(struct ggml_tensor * t) {
return t->view_src != NULL;
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
#ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@ -131,9 +164,9 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
@ -173,17 +206,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
if (ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it
return;
}
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@ -277,17 +310,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
return alloc;
}
// address and size of the buffer when measuring
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
// OS specific functions to allocate and free uncommitted virtual memory
static void * alloc_vmem(size_t size) {
#if defined(_WIN32)
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (ptr == MAP_FAILED) {
return NULL;
}
return ptr;
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
return (void *)base_addr;
#endif
}
static void free_vmem(void * base_addr, size_t size) {
#if defined(_WIN32)
VirtualFree(base_addr, 0, MEM_RELEASE);
UNUSED(size);
#elif defined(_POSIX_MAPPED_FILES)
munmap(base_addr, size);
#else
// nothing to do
UNUSED(base_addr);
UNUSED(size);
#endif
}
// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 1TB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
return;
}
// try again with half the size
*size /= 2;
} while (*size > 0);
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
}
static void free_measure_vmem(void * base_addr, size_t size) {
free_vmem(base_addr, size);
}
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
void * base_addr;
size_t size;
alloc_measure_vmem(&base_addr, &size);
*alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR,
/*.size = */ MEASURE_MAX_SIZE,
/*.data = */ base_addr,
/*.size = */ size,
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
@ -307,6 +391,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
}
void ggml_allocr_free(struct ggml_allocr * alloc) {
if (alloc->measure) {
free_measure_vmem(alloc->data, alloc->size);
}
free(alloc);
}
@ -316,11 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
//////////// compute graph allocator
static bool ggml_is_view(struct ggml_tensor * t) {
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
}
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
if (a->type != b->type) {
return false;
@ -336,28 +418,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
return true;
}
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
switch (t->op) {
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_VIEW:
return t->src[0];
case GGML_OP_CPY:
return t->src[1];
default:
return NULL;
}
}
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
struct ggml_tensor * parent = t;
do {
parent = get_view_parent(parent);
} while (ggml_is_view(parent));
return parent;
}
static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
@ -365,7 +425,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
@ -375,10 +434,8 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_UNARY:
case GGML_OP_ROPE:
case GGML_OP_RMS_NORM:
case GGML_OP_SET:
case GGML_OP_SOFT_MAX:
case GGML_OP_CONT:
case GGML_OP_ADD_REL_POS:
return true;
default:
@ -390,24 +447,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (ggml_is_view(node)) {
size_t offset;
switch(node->op) {
case GGML_OP_VIEW:
memcpy(&offset, node->op_params, sizeof(size_t));
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
assert(node->view_src->data != NULL);
node->data = (char *)node->view_src->data + node->view_offs;
} else {
// see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
@ -418,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
}
// if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data ||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
if (ggml_allocr_is_own(alloc, parent) == false) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue;
}
@ -427,7 +467,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@ -453,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
}
}
static size_t ggml_allocator_alloc_graph_tensors_n(
static size_t ggml_allocr_alloc_graph_tensors_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -469,7 +509,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) {
struct ggml_tensor * view_src = get_view_source(node);
struct ggml_tensor * view_src = node->view_src;
hash_get(ht, view_src)->n_views += 1;
}
@ -531,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
AT_PRINTF("\n");
}
// update parents
// update immediately if there is no parse_seq
// update only at barriers if there is parse_seq
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) {
@ -554,17 +593,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src);
ggml_allocr_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent);
ggml_allocr_free_tensor(alloc, parent);
}
}
}
@ -581,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output);
ggml_allocr_free_tensor(alloc, output);
}
}
}
@ -590,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
}
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}
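
For orientation, a minimal sketch (not part of this commit) of how the measure allocator above is typically driven: a measure pass sizes the graph against reserved, uncommitted virtual memory, and a second allocator is then created over a real buffer of that size. The build_graph() helper and the alignment value are illustrative; the ggml_allocr_* calls are the ones defined in this file.

#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"

// hypothetical: rebuilds the same compute graph for every pass
extern struct ggml_cgraph * build_graph(void);

void run_with_allocr(void) {
    const size_t alignment = 32; // illustrative tensor alignment

    // measure pass: tensor data points into the reserved virtual range,
    // so the graph is only sized here, never computed or written to
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    const size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph());
    ggml_allocr_free(measure);

    // real pass: back a new allocator with an actual buffer of the measured size
    void * buf = malloc(mem_size); // callers often add a little alignment headroom
    struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_size, alignment);

    // per inference: reset, rebuild the graph, let the allocator assign real addresses
    ggml_allocr_reset(alloc);
    struct ggml_cgraph * gf = build_graph();
    ggml_allocr_alloc_graph(alloc, gf);
    // ... compute gf and read back results ...

    ggml_allocr_free(alloc);
    free(buf);
}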

ggml.c

@ -18337,10 +18337,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i];
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
i,
node->ne[0], node->ne[1],
ggml_op_name(node->op));
ggml_op_name(node->op),
ggml_get_name(node));
}
for (int i = 0; i < GGML_OP_COUNT; i++) {
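
With the extra %16s, each leaf line now also shows the tensor name; illustrative output for a leaf holding the mel spectrogram (values made up):

 -   0: [    80,  3000]     NONE              mel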

whisper.cpp

@ -1410,8 +1410,6 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
@ -1429,6 +1427,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
assert(mel->type == GGML_TYPE_F32);
if (!ggml_allocr_is_measure(alloc)) {
assert(mel_inp.n_mel == n_mels);
float * dst = (float *) mel->data;
memset(dst, 0, ggml_nbytes(mel));
@ -1442,6 +1442,15 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
}
}
ggml_build_forward_expand(gf, mel);
struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, KQscale);
if (!ggml_allocr_is_measure(alloc)) {
ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head));
}
struct ggml_tensor * cur;
#ifndef WHISPER_USE_COREML
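
The KQscale hunk above is the pattern this commit uses for small constant tensors under ggml-alloc: create the tensor without data, hand it to the allocator explicitly, and write its value only when the allocator is not in measure mode, since a measured data pointer refers to reserved virtual memory that must not be dereferenced. A condensed sketch of the pattern (tensor name and value are illustrative):

// constant scalar created in the graph context, placed by the allocator
struct ggml_tensor * scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, scale);

// touch the data only when real memory backs it
if (!ggml_allocr_is_measure(alloc)) {
    ggml_set_f32(scale, 1.0f/sqrt(float(n_state)/n_head));
}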
@ -1533,14 +1542,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
Qcur),
Qcur);
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
layer.attn_v_w,
@ -1597,13 +1606,9 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
);
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
struct ggml_tensor * V =
ggml_cpy(ctx0,
@ -1698,28 +1703,34 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
cur),
ggml_repeat(ctx0, model.e_ln_b, cur));
}
ggml_build_forward_expand (gf, cur);
}
#ifdef WHISPER_USE_COREML
else if (use_coreml) {
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
ggml_allocr_alloc(alloc, cur);
if (!ggml_allocr_is_measure(alloc)) {
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
}
}
#endif
#ifdef WHISPER_USE_OPENVINO
else if (use_openvino) {
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
ggml_allocr_alloc(alloc, cur);
if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
return false;
if (!ggml_allocr_is_measure(alloc)) {
whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
}
}
#endif
ggml_build_forward_expand(gf, cur);
wstate.embd_enc = cur;
//ggml_graph_print(gf);
////////////////////////////////////////////////////////////////////////////
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
@ -1755,14 +1766,16 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
ggml_cgraph * gf = ggml_new_graph(ctx0);
//ggml_allocr * alloc = wstate.alloc_encode_post;
ggml_allocr * alloc = wstate.alloc_encode_post;
struct ggml_tensor * cur = wstate.embd_enc;
struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
// TODO: hack to disconnect the encoded features from the previous graph
cur->op = GGML_OP_NONE;
cur->src[0] = nullptr;
cur->src[1] = nullptr;
struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, Kscale);
if (!ggml_allocr_is_measure(alloc)) {
ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25));
}
for (int il = 0; il < model.hparams.n_text_layer; ++il) {
auto & layer = model.layers_decoder[il];
@ -1771,7 +1784,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
layer.cross_attn_k_w,
cur);
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
Kcross = ggml_scale(ctx0, Kcross, Kscale);
struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
layer.cross_attn_v_w,
@ -1794,6 +1807,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v));
}
//ggml_graph_print(gf);
ggml_free(ctx0);
return gf;
@ -1826,7 +1841,26 @@ static bool whisper_encode_internal(
ggml_allocr_alloc_graph(alloc, gf);
#ifdef WHISPER_USE_COREML
#else
ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
#endif
//auto cur = wstate.embd_enc;
////auto cur = gf->leafs[0];
//printf("cur name = '%s'\n", cur->name);
//float * res = (float *) cur->data;
//for (int i = 0; i < 10; ++i) {
// printf("%f ", res[i]);
//}
//printf("\n");
//double sum = 0.0;
//for (int i = 0; i < ggml_nelements(cur); ++i) {
// sum += res[i];
//}
//printf("sum: %f\n", sum);
//printf("n: %d\n", ggml_nelements(cur));
}
// encoder_post
@ -1840,6 +1874,21 @@ static bool whisper_encode_internal(
ggml_allocr_alloc_graph(alloc, gf);
ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
//auto cur = gf->nodes[gf->n_nodes - 1];
//printf("cur name = '%s'\n", cur->name);
//ggml_fp16_t * res = (ggml_fp16_t *) cur->data;
//for (int i = 0; i < 10; ++i) {
// printf("%f ", ggml_fp32_to_fp16(res[i]));
//}
//printf("\n");
//double sum = 0.0;
//for (int i = 0; i < ggml_nelements(cur); ++i) {
// sum += ggml_fp32_to_fp16(res[i]);
//}
//printf("sum: %f\n", sum);
//printf("n: %d\n", ggml_nelements(cur));
}
// ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
@ -1902,6 +1951,13 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
}
}
struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, KQscale);
if (!ggml_allocr_is_measure(alloc)) {
ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25));
}
// token encoding + position encoding
struct ggml_tensor * cur =
ggml_add(ctx0,
@ -1937,14 +1993,14 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
Qcur),
Qcur);
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale(ctx0, Qcur, KQscale);
// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Kcur = ggml_scale(ctx0, Kcur, KQscale);
// store key and value to memory
{
@ -1988,15 +2044,11 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
//struct ggml_tensor * KQ_scaled =
// ggml_scale_inplace(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );
//struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
@ -2052,7 +2104,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
Qcur),
Qcur);
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale(ctx0, Qcur, KQscale);
// Kcross is already scaled
struct ggml_tensor * Kcross =
@ -2092,15 +2144,15 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
//struct ggml_tensor * KQ_scaled =
// ggml_scale_inplace(ctx0,
// ggml_scale(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );
// no masking for cross-attention
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
@ -2225,7 +2277,7 @@ static bool whisper_decode_internal(
// decoder
{
auto & alloc = wstate.alloc_encode;
auto & alloc = wstate.alloc_decode;
ggml_allocr_reset(alloc);
@ -2758,8 +2810,9 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
{
const auto & hparams = ctx->model.hparams;
const int n_tokens = hparams.n_text_ctx/2;
const int n_past = hparams.n_text_ctx/2; // TODO: double-check
// TODO: make sure this is the worst-case scenario
const int n_tokens = hparams.n_text_ctx;
const int n_past = 0;
ggml_cgraph * gf = whisper_build_graph_decoder(*ctx, *state, state->decoders[0], NULL, n_tokens, n_past);