diff --git a/Makefile b/Makefile
index 605d8d48..7aaade97 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ ifndef NVCC_VERSION
 	endif
 endif
 
-CCV := $(shell $(CC) --version | head -n 1)
+CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
 
 # Mac OS + Arm can report x86_64
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 856a4cdb..78977a64 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -6,6 +6,26 @@
 #include <stdlib.h>
 #include <string.h>
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/types.h>
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,15 +119,28 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -131,14 +164,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-            return;
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -173,17 +206,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }
 
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@@ -277,17 +310,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// address and size of the buffer when measuring
-// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
-static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
-static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data          = */ MEASURE_BASE_ADDR,
-        /*.size          = */ MEASURE_MAX_SIZE,
+        /*.data          = */ base_addr,
+        /*.size          = */ size,
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -307,6 +391,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -316,11 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
-           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;
@@ -336,28 +418,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -365,7 +425,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -375,10 +434,8 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
        case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
-        case GGML_OP_ADD_REL_POS:
             return true;
 
         default:
@@ -390,24 +447,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            size_t offset;
-            switch(node->op) {
-                case GGML_OP_VIEW:
-                    memcpy(&offset, node->op_params, sizeof(size_t));
-                    node->data = (char *) node->src[0]->data + offset;
-                    break;
-                case GGML_OP_PERMUTE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_TRANSPOSE:
-                    node->data = node->src[0]->data;
-                    break;
-                case GGML_OP_CPY:
-                    node->data = node->src[1]->data;
-                    break;
-                default:
-                    GGML_ASSERT(!"unknown view op");
-                    break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -418,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 }
 
                 // if the node's data is external, then we cannot re-use it
-                if ((char *) parent->data < (char *) alloc->data ||
-                    (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                if (ggml_allocr_is_own(alloc, parent) == false) {
                     AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                     continue;
                 }
@@ -427,7 +467,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 struct hash_node * p_hn = hash_get(ht, parent);
                 if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                     if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
+                        struct ggml_tensor * view_src = parent->view_src;
                         struct hash_node * view_src_hn = hash_get(ht, view_src);
                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                             // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -453,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocator_alloc_graph_tensors_n(
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -469,7 +509,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             struct ggml_tensor * node = gf->nodes[i];
 
             if (ggml_is_view(node)) {
-                struct ggml_tensor * view_src = get_view_source(node);
+                struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
             }
 
@@ -531,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 AT_PRINTF("\n");
             }
 
-
             // update parents
             // update immediately if there is no parse_seq
             // update only at barriers if there is parse_seq
-            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+            if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
                 int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
                 int update_end   = alloc->parse_seq_len ? ind : ind + 1;
                 for (int i = update_start; i < update_end; i++) {
@@ -554,17 +593,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
                     if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                         if (ggml_is_view(parent)) {
-                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct ggml_tensor * view_src = parent->view_src;
                             struct hash_node * view_src_hn = hash_get(ht, view_src);
                             view_src_hn->n_views -= 1;
                             AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                             if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                                ggml_allocator_free_tensor(alloc, view_src);
+                                ggml_allocr_free_tensor(alloc, view_src);
                             }
                         }
                         else {
                             if (parent->data != node->data) {
-                                ggml_allocator_free_tensor(alloc, parent);
+                                ggml_allocr_free_tensor(alloc, parent);
                             }
                         }
                     }
@@ -581,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             for (int i = 0; outputs[g][i] != NULL; i++) {
                 struct ggml_tensor * output = outputs[g][i];
                 AT_PRINTF("output: %s\n", output->name);
-                ggml_allocator_free_tensor(alloc, output);
+                ggml_allocr_free_tensor(alloc, output);
             }
         }
     }
@@ -590,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
diff --git a/ggml.c b/ggml.c
index 3f72379c..f8cd58df 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18337,10 +18337,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
+        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                 i,
                 node->ne[0], node->ne[1],
-                ggml_op_name(node->op));
+                ggml_op_name(node->op),
+                ggml_get_name(node));
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
diff --git a/whisper.cpp b/whisper.cpp
index 6ceea676..52cc1c97 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1410,8 +1410,6 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
     const int n_mels = hparams.n_mels;
 
-    assert(mel_inp.n_mel == n_mels);
-
     struct ggml_init_params params = {
         /*.mem_size   =*/ wstate.buf_compute.size(),
         /*.mem_buffer =*/ wstate.buf_compute.data(),
@@ -1429,6 +1427,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
     assert(mel->type == GGML_TYPE_F32);
     if (!ggml_allocr_is_measure(alloc)) {
+        assert(mel_inp.n_mel == n_mels);
+
         float * dst = (float *) mel->data;
         memset(dst, 0, ggml_nbytes(mel));
 
@@ -1442,6 +1442,15 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
         }
     }
 
+    ggml_build_forward_expand(gf, mel);
+
+    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(alloc, KQscale);
+
+    if (!ggml_allocr_is_measure(alloc)) {
+        ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head));
+    }
+
     struct ggml_tensor * cur;
 
 #ifndef WHISPER_USE_COREML
@@ -1533,14 +1542,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         Qcur),
                     Qcur);
 
-            //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                     layer.attn_v_w,
@@ -1597,13 +1606,9 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-                        );
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
 
             struct ggml_tensor * V =
                 ggml_cpy(ctx0,
@@ -1698,28 +1703,34 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         cur),
                     ggml_repeat(ctx0, model.e_ln_b, cur));
         }
-
-        ggml_build_forward_expand (gf, cur);
     }
 #ifdef WHISPER_USE_COREML
     else if (use_coreml) {
         cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+        ggml_allocr_alloc(alloc, cur);
 
-        whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+        if (!ggml_allocr_is_measure(alloc)) {
+            whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+        }
     }
 #endif
 #ifdef WHISPER_USE_OPENVINO
     else if (use_openvino) {
         cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+        ggml_allocr_alloc(alloc, cur);
 
-        if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
-            return false;
+        if (!ggml_allocr_is_measure(alloc)) {
+            whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
         }
     }
 #endif
 
+    ggml_build_forward_expand(gf, cur);
+    wstate.embd_enc = cur;
+    //ggml_graph_print(gf);
+
     ////////////////////////////////////////////////////////////////////////////
 
     //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
@@ -1755,14 +1766,16 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    //ggml_allocr * alloc = wstate.alloc_encode_post;
+    ggml_allocr * alloc = wstate.alloc_encode_post;
 
-    struct ggml_tensor * cur = wstate.embd_enc;
+    struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
 
-    // TODO: hack to disconnect the encoded features from the previous graph
-    cur->op = GGML_OP_NONE;
-    cur->src[0] = nullptr;
-    cur->src[1] = nullptr;
+    struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(alloc, Kscale);
+
+    if (!ggml_allocr_is_measure(alloc)) {
+        ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25));
+    }
 
     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
@@ -1771,7 +1784,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
                 layer.cross_attn_k_w,
                 cur);
 
-        Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+        Kcross = ggml_scale(ctx0, Kcross, Kscale);
 
         struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
                 layer.cross_attn_v_w,
@@ -1794,6 +1807,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder_post(
         ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v));
     }
 
+    //ggml_graph_print(gf);
+
     ggml_free(ctx0);
 
     return gf;
@@ -1826,7 +1841,26 @@ static bool whisper_encode_internal(
 
         ggml_allocr_alloc_graph(alloc, gf);
 
+#ifdef WHISPER_USE_COREML
+#else
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
+#endif
+
+        //auto cur = wstate.embd_enc;
+        ////auto cur = gf->leafs[0];
+        //printf("cur name = '%s'\n", cur->name);
+
+        //float * res = (float *) cur->data;
+        //for (int i = 0; i < 10; ++i) {
+        //    printf("%f ", res[i]);
+        //}
+        //printf("\n");
+        //double sum = 0.0;
+        //for (int i = 0; i < ggml_nelements(cur); ++i) {
+        //    sum += res[i];
+        //}
+        //printf("sum: %f\n", sum);
+        //printf("n: %d\n", ggml_nelements(cur));
     }
 
     // encoder_post
@@ -1840,6 +1874,21 @@ static bool whisper_encode_internal(
         ggml_allocr_alloc_graph(alloc, gf);
 
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
+
+        //auto cur = gf->nodes[gf->n_nodes - 1];
+        //printf("cur name = '%s'\n", cur->name);
+
+        //ggml_fp16_t * res = (ggml_fp16_t *) cur->data;
+        //for (int i = 0; i < 10; ++i) {
+        //    printf("%f ", ggml_fp32_to_fp16(res[i]));
+        //}
+        //printf("\n");
+        //double sum = 0.0;
+        //for (int i = 0; i < ggml_nelements(cur); ++i) {
+        //    sum += ggml_fp32_to_fp16(res[i]);
+        //}
+        //printf("sum: %f\n", sum);
+        //printf("n: %d\n", ggml_nelements(cur));
     }
 
     // ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
@@ -1902,6 +1951,13 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         }
     }
 
+    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(alloc, KQscale);
+
+    if (!ggml_allocr_is_measure(alloc)) {
+        ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25));
+    }
+
     // token encoding + position encoding
     struct ggml_tensor * cur =
         ggml_add(ctx0,
@@ -1937,14 +1993,14 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale(ctx0, Qcur, KQscale);
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = ggml_scale(ctx0, Kcur, KQscale);
 
             // store key and value to memory
             {
@@ -1988,15 +2044,11 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
-            //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-            //            );
+            //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
 
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -2052,7 +2104,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale(ctx0, Qcur, KQscale);
 
             // Kcross is already scaled
             struct ggml_tensor * Kcross =
@@ -2092,15 +2144,15 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
+            //    ggml_scale(ctx0,
             //            KQ,
             //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );
 
             // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
@@ -2225,7 +2277,7 @@ static bool whisper_decode_internal(
 
     // decoder
     {
-        auto & alloc = wstate.alloc_encode;
+        auto & alloc = wstate.alloc_decode;
 
         ggml_allocr_reset(alloc);
 
@@ -2758,8 +2810,9 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     {
         const auto & hparams = ctx->model.hparams;
 
-        const int n_tokens = hparams.n_text_ctx/2;
-        const int n_past   = hparams.n_text_ctx/2; // TODO: double-check
+        // TODO: make sure this is the worst-case scenario
+        const int n_tokens = hparams.n_text_ctx;
+        const int n_past   = 0;
 
         ggml_cgraph * gf = whisper_build_graph_decoder(*ctx, *state, state->decoders[0], NULL, n_tokens, n_past);
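
Note: the heart of this patch is the "measure, then allocate" flow that the new ggml_allocr API enables: a measure allocator walks the graph over reserved-but-uncommitted virtual memory to find the worst-case buffer size, and a real allocator of exactly that size is then reset and refilled before every evaluation. The following is a minimal, illustrative sketch of that pattern, not part of the patch; the names build_graph_fn, g_buf_compute_alloc, and measure_then_allocate are hypothetical, error handling is omitted, and only the ggml_allocr calls shown in the diff above are assumed to exist.

// Illustrative sketch only -- assumes a user-provided graph-building callback.
#include "ggml.h"
#include "ggml-alloc.h"

#include <cstdint>
#include <vector>

// hypothetical callback: builds the compute graph and registers its input
// tensors with the allocator (whisper_build_graph_encoder/decoder play this role)
typedef ggml_cgraph * (*build_graph_fn)(ggml_allocr * alloc);

static std::vector<uint8_t> g_buf_compute_alloc; // illustrative backing buffer

static size_t measure_then_allocate(build_graph_fn build_graph, size_t alignment) {
    // pass 1: measure -- the allocator only reserves uncommitted virtual memory,
    // so the graph can be laid out without committing the worst-case buffer
    ggml_allocr * alloc_measure = ggml_allocr_new_measure(alignment);
    const size_t size = ggml_allocr_alloc_graph(alloc_measure, build_graph(alloc_measure)) + alignment;
    ggml_allocr_free(alloc_measure); // releases the reserved address range

    // pass 2: a real allocator over a buffer of exactly the measured size
    g_buf_compute_alloc.resize(size);
    ggml_allocr * alloc = ggml_allocr_new(g_buf_compute_alloc.data(), g_buf_compute_alloc.size(), alignment);

    // per evaluation: reset, rebuild the graph, allocate its tensors, then compute
    ggml_allocr_reset(alloc);
    ggml_cgraph * gf = build_graph(alloc);
    ggml_allocr_alloc_graph(alloc, gf);
    // ... ggml_graph_compute_helper(work_buffer, gf, n_threads) would run the graph here ...

    ggml_allocr_free(alloc);
    return size;
}

This mirrors what the patch does in whisper_init_state (a worst-case measure graph per encoder/decoder) and in whisper_encode_internal / whisper_decode_internal (reset the per-graph allocator, rebuild, allocate, compute), which is also why the scale factors become graph tensors (KQscale, Kscale) set only when the allocator is not in measure mode.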