mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-27 14:49:55 +00:00
whisper : CoreML support ggml-alloc
This commit is contained in:
parent
af6f67b251
commit
fa672b46e6
185
ggml-alloc.c
185
ggml-alloc.c
@ -6,6 +6,26 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <memoryapi.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
||||
@ -99,15 +119,28 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor);
|
||||
|
||||
UNUSED(alloc);
|
||||
}
|
||||
|
||||
// check if a tensor is allocated by this buffer
|
||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||
#endif
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
@ -131,9 +164,9 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
if (best_fit_block == -1) {
|
||||
// the last block is our last resort
|
||||
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size) {
|
||||
best_fit_block = alloc->n_free_blocks - 1;
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
} else {
|
||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||
__func__, size, max_avail);
|
||||
@ -173,17 +206,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
|
||||
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
|
||||
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
||||
// the tensor was not allocated in this buffer
|
||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||
// the easiest way to deal with this is just to ignore it
|
||||
return;
|
||||
}
|
||||
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
||||
|
||||
@ -277,17 +310,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
return alloc;
|
||||
}
|
||||
|
||||
// address and size of the buffer when measuring
|
||||
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
|
||||
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
|
||||
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
|
||||
// OS specific functions to allocate and free uncommitted virtual memory
|
||||
static void * alloc_vmem(size_t size) {
|
||||
#if defined(_WIN32)
|
||||
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
||||
if (ptr == MAP_FAILED) {
|
||||
return NULL;
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
// use a fixed address for other platforms
|
||||
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
||||
return (void *)base_addr;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void free_vmem(void * base_addr, size_t size) {
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(base_addr, 0, MEM_RELEASE);
|
||||
UNUSED(size);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
munmap(base_addr, size);
|
||||
#else
|
||||
// nothing to do
|
||||
UNUSED(base_addr);
|
||||
UNUSED(size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 1TB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
||||
return;
|
||||
}
|
||||
// try again with half the size
|
||||
*size /= 2;
|
||||
} while (*size > 0);
|
||||
|
||||
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
||||
}
|
||||
|
||||
static void free_measure_vmem(void * base_addr, size_t size) {
|
||||
free_vmem(base_addr, size);
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
void * base_addr;
|
||||
size_t size;
|
||||
|
||||
alloc_measure_vmem(&base_addr, &size);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ MEASURE_BASE_ADDR,
|
||||
/*.size = */ MEASURE_MAX_SIZE,
|
||||
/*.data = */ base_addr,
|
||||
/*.size = */ size,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
@ -307,6 +391,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
}
|
||||
|
||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
||||
if (alloc->measure) {
|
||||
free_measure_vmem(alloc->data, alloc->size);
|
||||
}
|
||||
free(alloc);
|
||||
}
|
||||
|
||||
@ -316,11 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
||||
|
||||
//////////// compute graph allocator
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
@ -336,28 +418,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_VIEW:
|
||||
return t->src[0];
|
||||
case GGML_OP_CPY:
|
||||
return t->src[1];
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
||||
struct ggml_tensor * parent = t;
|
||||
do {
|
||||
parent = get_view_parent(parent);
|
||||
} while (ggml_is_view(parent));
|
||||
return parent;
|
||||
}
|
||||
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
@ -365,7 +425,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
@ -375,10 +434,8 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_ADD_REL_POS:
|
||||
return true;
|
||||
|
||||
default:
|
||||
@ -390,24 +447,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
size_t offset;
|
||||
switch(node->op) {
|
||||
case GGML_OP_VIEW:
|
||||
memcpy(&offset, node->op_params, sizeof(size_t));
|
||||
node->data = (char *) node->src[0]->data + offset;
|
||||
break;
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
node->data = node->src[0]->data;
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
node->data = node->src[1]->data;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(!"unknown view op");
|
||||
break;
|
||||
}
|
||||
assert(node->view_src->data != NULL);
|
||||
node->data = (char *)node->view_src->data + node->view_offs;
|
||||
} else {
|
||||
// see if we can reuse a parent's buffer (inplace)
|
||||
if (ggml_op_can_inplace(node->op)) {
|
||||
@ -418,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
}
|
||||
|
||||
// if the node's data is external, then we cannot re-use it
|
||||
if ((char *) parent->data < (char *) alloc->data ||
|
||||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
|
||||
if (ggml_allocr_is_own(alloc, parent) == false) {
|
||||
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
||||
continue;
|
||||
}
|
||||
@ -427,7 +467,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct ggml_tensor * view_src = parent->view_src;
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
||||
@ -453,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
}
|
||||
}
|
||||
|
||||
static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||
@ -469,7 +509,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * view_src = get_view_source(node);
|
||||
struct ggml_tensor * view_src = node->view_src;
|
||||
hash_get(ht, view_src)->n_views += 1;
|
||||
}
|
||||
|
||||
@ -531,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
AT_PRINTF("\n");
|
||||
}
|
||||
|
||||
|
||||
// update parents
|
||||
// update immediately if there is no parse_seq
|
||||
// update only at barriers if there is parse_seq
|
||||
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
|
||||
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
|
||||
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
||||
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
||||
for (int i = update_start; i < update_end; i++) {
|
||||
@ -554,17 +593,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
|
||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct ggml_tensor * view_src = parent->view_src;
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
view_src_hn->n_views -= 1;
|
||||
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, view_src);
|
||||
ggml_allocr_free_tensor(alloc, view_src);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (parent->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, parent);
|
||||
ggml_allocr_free_tensor(alloc, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -581,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
for (int i = 0; outputs[g][i] != NULL; i++) {
|
||||
struct ggml_tensor * output = outputs[g][i];
|
||||
AT_PRINTF("output: %s\n", output->name);
|
||||
ggml_allocator_free_tensor(alloc, output);
|
||||
ggml_allocr_free_tensor(alloc, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -590,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
}
|
||||
|
||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
}
|
||||
|
5
ggml.c
5
ggml.c
@ -18337,10 +18337,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
struct ggml_tensor * node = cgraph->leafs[i];
|
||||
|
||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
|
||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
|
||||
i,
|
||||
node->ne[0], node->ne[1],
|
||||
ggml_op_name(node->op));
|
||||
ggml_op_name(node->op),
|
||||
ggml_get_name(node));
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_OP_COUNT; i++) {
|
||||
|
127
whisper.cpp
127
whisper.cpp
@ -1410,8 +1410,6 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
||||
|
||||
const int n_mels = hparams.n_mels;
|
||||
|
||||
assert(mel_inp.n_mel == n_mels);
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ wstate.buf_compute.size(),
|
||||
/*.mem_buffer =*/ wstate.buf_compute.data(),
|
||||
@ -1429,6 +1427,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
||||
|
||||
assert(mel->type == GGML_TYPE_F32);
|
||||
if (!ggml_allocr_is_measure(alloc)) {
|
||||
assert(mel_inp.n_mel == n_mels);
|
||||
|
||||
float * dst = (float *) mel->data;
|
||||
memset(dst, 0, ggml_nbytes(mel));
|
||||
|
||||
@ -1442,6 +1442,15 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
||||
}
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, mel);
|
||||
|
||||
struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
ggml_allocr_alloc(alloc, KQscale);
|
||||
|
||||
if (!ggml_allocr_is_measure(alloc)) {
|
||||
ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head));
|
||||
}
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
#ifndef WHISPER_USE_COREML
|
||||
@ -1533,14 +1542,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
||||
Qcur),
|
||||
Qcur);
|
||||
|
||||
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
|
||||
// note: no bias for Key
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
||||
layer.attn_k_w,
|
||||
cur);
|
||||
|
||||
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
|
||||
layer.attn_v_w,
|
||||
@ -1597,13 +1606,9 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
||||
);
|
||||
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
|
||||
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_cpy(ctx0,
|
||||
@ -1698,28 +1703,34 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.e_ln_b, cur));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand (gf, cur);
|
||||
}
|
||||
#ifdef WHISPER_USE_COREML
|
||||
else if (use_coreml) {
|
||||
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
||||
ggml_allocr_alloc(alloc, cur);
|
||||
|
||||
if (!ggml_allocr_is_measure(alloc)) {
|
||||
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef WHISPER_USE_OPENVINO
|
||||
else if (use_openvino) {
|
||||
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
||||
ggml_allocr_alloc(alloc, cur);
|
||||
|
||||
if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
|
||||
return false;
|
||||
if (!ggml_allocr_is_measure(alloc)) {
|
||||
whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
wstate.embd_enc = cur;
|
||||
|
||||
//ggml_graph_print(gf);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
|
||||
@ -1755,14 +1766,16 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
//ggml_allocr * alloc = wstate.alloc_encode_post;
|
||||
ggml_allocr * alloc = wstate.alloc_encode_post;
|
||||
|
||||
struct ggml_tensor * cur = wstate.embd_enc;
|
||||
struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
|
||||
|
||||
// TODO: hack to disconnect the encoded features from the previous graph
|
||||
cur->op = GGML_OP_NONE;
|
||||
cur->src[0] = nullptr;
|
||||
cur->src[1] = nullptr;
|
||||
struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
ggml_allocr_alloc(alloc, Kscale);
|
||||
|
||||
if (!ggml_allocr_is_measure(alloc)) {
|
||||
ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25));
|
||||
}
|
||||
|
||||
for (int il = 0; il < model.hparams.n_text_layer; ++il) {
|
||||
auto & layer = model.layers_decoder[il];
|
||||
@ -1771,7 +1784,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
|
||||
layer.cross_attn_k_w,
|
||||
cur);
|
||||
|
||||
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
||||
Kcross = ggml_scale(ctx0, Kcross, Kscale);
|
||||
|
||||
struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
|
||||
layer.cross_attn_v_w,
|
||||
@ -1794,6 +1807,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v));
|
||||
}
|
||||
|
||||
//ggml_graph_print(gf);
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return gf;
|
||||
@ -1826,7 +1841,26 @@ static bool whisper_encode_internal(
|
||||
|
||||
ggml_allocr_alloc_graph(alloc, gf);
|
||||
|
||||
#ifdef WHISPER_USE_COREML
|
||||
#else
|
||||
ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
|
||||
#endif
|
||||
|
||||
//auto cur = wstate.embd_enc;
|
||||
////auto cur = gf->leafs[0];
|
||||
//printf("cur name = '%s'\n", cur->name);
|
||||
|
||||
//float * res = (float *) cur->data;
|
||||
//for (int i = 0; i < 10; ++i) {
|
||||
// printf("%f ", res[i]);
|
||||
//}
|
||||
//printf("\n");
|
||||
//double sum = 0.0;
|
||||
//for (int i = 0; i < ggml_nelements(cur); ++i) {
|
||||
// sum += res[i];
|
||||
//}
|
||||
//printf("sum: %f\n", sum);
|
||||
//printf("n: %d\n", ggml_nelements(cur));
|
||||
}
|
||||
|
||||
// encoder_post
|
||||
@ -1840,6 +1874,21 @@ static bool whisper_encode_internal(
|
||||
ggml_allocr_alloc_graph(alloc, gf);
|
||||
|
||||
ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
|
||||
|
||||
//auto cur = gf->nodes[gf->n_nodes - 1];
|
||||
//printf("cur name = '%s'\n", cur->name);
|
||||
|
||||
//ggml_fp16_t * res = (ggml_fp16_t *) cur->data;
|
||||
//for (int i = 0; i < 10; ++i) {
|
||||
// printf("%f ", ggml_fp32_to_fp16(res[i]));
|
||||
//}
|
||||
//printf("\n");
|
||||
//double sum = 0.0;
|
||||
//for (int i = 0; i < ggml_nelements(cur); ++i) {
|
||||
// sum += ggml_fp32_to_fp16(res[i]);
|
||||
//}
|
||||
//printf("sum: %f\n", sum);
|
||||
//printf("n: %d\n", ggml_nelements(cur));
|
||||
}
|
||||
|
||||
// ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||
@ -1902,6 +1951,13 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
ggml_allocr_alloc(alloc, KQscale);
|
||||
|
||||
if (!ggml_allocr_is_measure(alloc)) {
|
||||
ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25));
|
||||
}
|
||||
|
||||
// token encoding + position encoding
|
||||
struct ggml_tensor * cur =
|
||||
ggml_add(ctx0,
|
||||
@ -1937,14 +1993,14 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
||||
Qcur),
|
||||
Qcur);
|
||||
|
||||
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
Qcur = ggml_scale(ctx0, Qcur, KQscale);
|
||||
|
||||
// note: no bias for Key
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
||||
layer.attn_k_w,
|
||||
cur);
|
||||
|
||||
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
Kcur = ggml_scale(ctx0, Kcur, KQscale);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
@ -1988,15 +2044,11 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
//struct ggml_tensor * KQ_scaled =
|
||||
// ggml_scale_inplace(ctx0,
|
||||
// KQ,
|
||||
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
||||
// );
|
||||
//struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
||||
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
|
||||
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, kv_self.v,
|
||||
@ -2052,7 +2104,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
||||
Qcur),
|
||||
Qcur);
|
||||
|
||||
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
||||
Qcur = ggml_scale(ctx0, Qcur, KQscale);
|
||||
|
||||
// Kcross is already scaled
|
||||
struct ggml_tensor * Kcross =
|
||||
@ -2092,15 +2144,15 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
//struct ggml_tensor * KQ_scaled =
|
||||
// ggml_scale_inplace(ctx0,
|
||||
// ggml_scale(ctx0,
|
||||
// KQ,
|
||||
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
||||
// );
|
||||
|
||||
// no masking for cross-attention
|
||||
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
|
||||
@ -2225,7 +2277,7 @@ static bool whisper_decode_internal(
|
||||
|
||||
// decoder
|
||||
{
|
||||
auto & alloc = wstate.alloc_encode;
|
||||
auto & alloc = wstate.alloc_decode;
|
||||
|
||||
ggml_allocr_reset(alloc);
|
||||
|
||||
@ -2758,8 +2810,9 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
||||
{
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
|
||||
const int n_tokens = hparams.n_text_ctx/2;
|
||||
const int n_past = hparams.n_text_ctx/2; // TODO: double-check
|
||||
// TODO: make sure this is the worst-case scenario
|
||||
const int n_tokens = hparams.n_text_ctx;
|
||||
const int n_past = 0;
|
||||
|
||||
ggml_cgraph * gf = whisper_build_graph_decoder(*ctx, *state, state->decoders[0], NULL, n_tokens, n_past);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user