llama podcast

Georgi Gerganov 2023-04-01 10:33:21 +03:00
parent 0a2d1210bc
commit c456ca476b
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
13 changed files with 1359 additions and 1074 deletions

llama.cpp

@ -12,6 +12,19 @@
#include <cassert>
#include <cstring>
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#else
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#endif
#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
@ -142,6 +155,10 @@ struct llama_model {
// the model memory buffer
std::vector<uint8_t> buf;
// model memory mapped file
void * mm_addr = NULL;
uint64_t mm_length = 0;
// tensors
int n_loaded;
std::unordered_map<std::string, struct ggml_tensor *> tensors;
@ -165,6 +182,7 @@ struct llama_context {
int64_t t_load_us = 0;
int64_t t_start_us = 0;
bool has_evaluated_once = false;
int64_t t_sample_us = 0;
int64_t t_eval_us = 0;
@ -206,7 +224,7 @@ struct llama_context {
}
if (buf_last >= 0) {
buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
}
buf_last = i;
@ -246,6 +264,7 @@ static bool kv_cache_init(
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
params.no_alloc = false;
cache.ctx = ggml_init(params);
@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
// model loading
//
static void *mmap_file(const char *fname, uint64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
HANDLE hFile = CreateFileA(fname,
GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
NULL);
if (hFile == INVALID_HANDLE_VALUE) return 0;
LARGE_INTEGER fileSize;
fileSize.QuadPart = -1;
GetFileSizeEx(hFile, &fileSize);
int64_t length = fileSize.QuadPart;
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
CloseHandle(hFile);
if (!hMapping) return 0;
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
CloseHandle(hMapping);
if (!addr) return 0;
#else
int fd = open(fname, O_RDONLY);
if (fd == -1) return 0;
int64_t length = lseek(fd, 0, SEEK_END);
void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
close(fd);
if (addr == MAP_FAILED) return 0;
#endif
*mm_length = length;
return addr;
}
static void munmap_file(void * addr, size_t length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
UnmapViewOfFile(addr);
#else
munmap(addr, length);
#endif
}
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
fprintf(stderr,
"%s: invalid model file (bad magic [got %#x want %#x])\n"
"\tyou most likely need to regenerate your ggml files\n"
"\tthe benefit is you'll get 10-100x faster load times\n"
"\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
"\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
"\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
path, got, want);
return false;
}
static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
@ -299,34 +370,35 @@ static bool llama_model_load(
void *progress_callback_user_data) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
const int64_t t_start_us = ggml_time_us();
lctx.t_start_us = t_start_us;
std::vector<char> f_buf(1024*1024);
lctx.t_start_us = ggml_time_us();
auto & model = lctx.model;
auto & vocab = lctx.vocab;
auto fin = std::ifstream(fname, std::ios::binary);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
std::vector<char> f_buf(1024*1024);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
fin.seekg(0);
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
__func__, fname.c_str());
return false;
}
if (magic != LLAMA_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
}
uint32_t format_version;
@ -449,43 +521,24 @@ static bool llama_model_load(
}
}
// map model into memory
char *mm_addr = NULL;
model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
if (model.mm_addr == NULL) {
fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
return false;
}
mm_addr = (char *)model.mm_addr;
fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const auto &hparams = model.hparams;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
ctx_size += (5 + 10*n_layer)*256; // object overhead
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
}
// print memory requirements
@ -495,6 +548,7 @@ static bool llama_model_load(
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
model.mm_length +
MEM_REQ_SCRATCH0.at(model.type) +
MEM_REQ_SCRATCH1.at(model.type) +
MEM_REQ_EVAL.at (model.type);
@ -514,6 +568,7 @@ static bool llama_model_load(
struct ggml_init_params params = {
/*.mem_size =*/ lctx.model.buf.size(),
/*.mem_buffer =*/ lctx.model.buf.data(),
/*.no_alloc =*/ true,
};
model.ctx = ggml_init(params);
@ -576,234 +631,106 @@ static bool llama_model_load(
}
}
const size_t file_offset = fin.tellg();
fin.close();
std::vector<uint8_t> tmp;
if (progress_callback) {
progress_callback(0.0, progress_callback_user_data);
}
for (int i = 0; i < n_parts; ++i) {
const int part_id = i;
//const int part_id = n_parts - i - 1;
fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
std::string fname_part = fname;
if (i > 0) {
fname_part += "." + std::to_string(i);
}
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin = std::ifstream(fname_part, std::ios::binary);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
fin.seekg(file_offset);
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
fprintf(stderr, "%s: ", __func__);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
// split_type = 0: split by columns
// split_type = 1: split by rows
int split_type = 0;
// split_type = 0:
// regex:
// - tok_embeddings.*
// - layers.*.attention.wo.weight
// - layers.*.feed_forward.w2.weight
// split_type = 1:
// regex:
// - output.*
// - layers.*.attention.wq.weight
// - layers.*.attention.wk.weight
// - layers.*.attention.wv.weight
// - layers.*.feed_forward.w1.weight
// - layers.*.feed_forward.w3.weight
if (name.find("tok_embeddings") != std::string::npos) {
split_type = 0;
} else if (name.find("layers") != std::string::npos) {
if (name.find("attention.wo.weight") != std::string::npos) {
split_type = 0;
} else if (name.find("feed_forward.w2.weight") != std::string::npos) {
split_type = 0;
} else {
split_type = 1;
}
} else if (name.find("output") != std::string::npos) {
split_type = 1;
}
auto tensor = model.tensors[name.data()];
if (n_dims == 1) {
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
} else {
if (ggml_nelements(tensor)/n_parts != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
}
if (n_dims == 1) {
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
} else {
if (split_type == 0) {
if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
return false;
}
} else {
if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
return false;
}
}
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
}
size_t bpe = 0;
switch (ftype) {
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
default:
{
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
}
};
if (n_dims == 1 || n_parts == 1) {
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
if (part_id == 0) {
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
} else {
fin.seekg(ggml_nbytes(tensor), std::ios::cur);
}
total_size += ggml_nbytes(tensor);
} else {
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
return false;
}
if (split_type == 0) {
const int np0 = ne[0];
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
assert(row_size == tensor->nb[1]);
for (int i1 = 0; i1 < ne[1]; ++i1) {
const size_t offset_row = i1*row_size;
const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
}
} else {
const int np1 = ne[1];
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
for (int i1 = 0; i1 < ne[1]; ++i1) {
const size_t offset_row = (i1 + part_id*np1)*row_size;
fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
}
}
total_size += ggml_nbytes(tensor)/n_parts;
}
//fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
model.n_loaded++;
// progress
if (progress_callback) {
double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset);
double current_progress = (double(i) + current_file_progress) / double(n_parts);
progress_callback(current_progress, progress_callback_user_data);
}
if (model.n_loaded % 8 == 0) {
fprintf(stderr, ".");
fflush(stderr);
}
if (fin.eof()) {
break;
}
fprintf(stderr, " done\n");
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
switch (ftype) {
case 0: // f32
case 1: // f16
break;
case 2: // q4_0
case 3: // q4_1
assert(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
};
// load the tensor data into memory without copying or reading it
size_t offset = fin.tellg();
size_t tensor_data_size = ggml_nbytes(tensor);
offset = (offset + 31) & -32;
tensor->data = mm_addr + offset;
fin.seekg(offset + tensor_data_size);
total_size += tensor_data_size;
model.n_loaded++;
// progress
if (progress_callback) {
double current_progress = size_t(fin.tellg()) / double(file_size);
progress_callback(current_progress, progress_callback_user_data);
}
}
fin.close();
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
return false;
}
}
lctx.t_load_us = ggml_time_us() - t_start_us;
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
if (progress_callback) {
progress_callback(1.0, progress_callback_user_data);
@ -849,6 +776,7 @@ static bool llama_eval_internal(
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size(),
/*.mem_buffer =*/ buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -856,7 +784,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, tokens, N*ggml_element_size(embd));
@ -922,7 +850,7 @@ static bool llama_eval_internal(
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
@ -1126,7 +1054,7 @@ struct llama_tokenizer {
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;
@ -1240,12 +1168,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
// sampling
//
static void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
// find the top k tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
[](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
return a.first > b.first;
});
@ -1256,9 +1184,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
llama_context & lctx,
const std::vector<llama_vocab::id> & last_n_tokens,
int top_k,
double top_p,
double temp,
double repeat_penalty) {
float top_p,
float temp,
float repeat_penalty) {
auto & rng = lctx.rng;
const int n_logits = lctx.model.hparams.n_vocab;
@ -1266,17 +1194,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
const auto & logits = lctx.logits;
const auto * plogits = logits.data() + logits.size() - n_logits;
std::vector<std::pair<double, llama_vocab::id>> logits_id;
std::vector<std::pair<float, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability
if (plogits[i] < 0.0) {
if (plogits[i] < 0.0f) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
@ -1289,18 +1217,18 @@ static llama_vocab::id llama_sample_top_p_top_k(
sample_top_k(logits_id, top_k);
double maxl = -std::numeric_limits<double>::infinity();
float maxl = -std::numeric_limits<float>::infinity();
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
maxl = Max(maxl, kv.first);
}
// compute probs for the top k tokens
std::vector<double> probs;
std::vector<float> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
const float p = expf(kv.first - maxl);
probs.push_back(p);
sum += p;
}
@ -1310,8 +1238,8 @@ static llama_vocab::id llama_sample_top_p_top_k(
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
if (top_p < 1.0) {
double cumsum = 0.0;
for (int i = 0; i < (int) probs.size(); i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
@ -1345,7 +1273,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
//
// TODO: reuse code from the llama_model_load() somehow
bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype, int qk) {
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
@ -1385,8 +1313,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
return false;
}
if (magic != LLAMA_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
}
fout.write((char *) &magic, sizeof(magic));
@ -1444,7 +1371,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
return false;
}
std::string word;
std::vector<char> word(32);
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
@ -1452,17 +1379,17 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);
finp.read ((char *) &word[0], len);
fout.write((char *) &word[0], len);
float score;
finp.read ((char *) &score, sizeof(score));
fout.write((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
vocab.token_to_id[word.data()] = i;
auto &tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.tok = word.data();
tok_score.score = score;
}
}
@ -1503,6 +1430,13 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
std::string name(length, 0);
finp.read (&name[0], length);
{
// ensure tensor data is aligned
uint64_t offset = finp.tellg();
offset = (offset + 31) & -32;
finp.seekg(offset);
}
{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@ -1558,6 +1492,13 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
}
fout.write(&name[0], length);
{
// ensure tensor data is aligned
uint64_t offset = fout.tellp();
offset = (offset + 31) & -32;
fout.seekp(offset);
}
if (quantize) {
printf("quantizing .. ");
work.resize(nelements); // for quantization
@ -1568,11 +1509,11 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
default:
{
@ -1590,7 +1531,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
}
for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
printf("%5.3f ", hist_cur[i] / float(nelements));
}
printf("\n");
} else {
@ -1613,7 +1554,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
printf("%s: hist: ", __func__);
for (int i = 0; i < (int) hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
printf("%5.3f ", hist_all[i] / float(sum_all));
}
printf("\n");
}
@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(
if (params.use_mlock) {
char *err;
if (!ggml_mlock(ctx->model.ctx, &err)) {
if (!ggml_mlock(ctx->model.ctx,
ctx->model.mm_addr,
ctx->model.mm_length,
&err)) {
fprintf(stderr, "%s\n", err);
free(err);
llama_free(ctx);
@ -1705,15 +1649,18 @@ void llama_free(struct llama_context * ctx) {
ggml_free(ctx->model.ctx);
}
if (ctx->model.mm_addr) {
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
}
delete ctx;
}
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype, qk)) {
int itype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
fprintf(stderr, "%s: failed to quantize\n", __func__);
return 1;
}
@ -1731,7 +1678,11 @@ int llama_eval(
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
// get a more accurate load time, upon first eval
if (!ctx->has_evaluated_once) {
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
ctx->has_evaluated_once = true;
}
return 0;
}
@ -1796,9 +1747,9 @@ llama_token llama_sample_top_p_top_k(
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty) {
float top_p,
float temp,
float repeat_penalty) {
const int64_t t_start_sample_us = ggml_time_us();
llama_token result = 0;
@ -1824,21 +1775,20 @@ llama_token llama_sample_top_p_top_k(
void llama_print_timings(struct llama_context * ctx) {
const int64_t t_end_us = ggml_time_us();
const int32_t n_sample = std::max(1, ctx->n_sample);
const int32_t n_eval = std::max(1, ctx->n_eval);
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
const int32_t n_sample = Max(1, ctx->n_sample);
const int32_t n_eval = Max(1, ctx->n_eval);
const int32_t n_p_eval = Max(1, ctx->n_p_eval);
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}
void llama_reset_timings(struct llama_context * ctx) {
ctx->t_start_us = ggml_time_us();
ctx->t_sample_us = ctx->n_sample = 0;
ctx->t_eval_us = ctx->n_eval = 0;
ctx->t_p_eval_us = ctx->n_p_eval = 0;
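
The llama.cpp changes above replace the per-part fread() copies with a single read-only mmap of the model file: tensor->data is pointed directly into the mapping at 32-byte-aligned offsets, so weights are paged in lazily on first use (which is also why the load time is recalculated after the first eval). A minimal, self-contained sketch of that pattern, POSIX only; the file name and the toy_tensor struct are stand-ins, not part of the diff:

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct toy_tensor { void * data; size_t nbytes; };   // stand-in for ggml_tensor

int main() {
    const char * fname = "model.bin";                 // hypothetical model file
    int fd = open(fname, O_RDONLY);
    if (fd == -1) { perror("open"); return 1; }

    const int64_t length = lseek(fd, 0, SEEK_END);
    void * addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
    if (addr == MAP_FAILED) { perror("mmap"); return 1; }

    // pretend a tensor's payload begins after a 100-byte header:
    // round the offset up to a 32-byte boundary, then point the tensor
    // into the mapping (no read, no copy)
    uint64_t offset = 100;
    offset = (offset + 31) & -32;
    toy_tensor t = { (char *) addr + offset, 4096 };

    printf("tensor data mapped at %p\n", t.data);

    munmap(addr, length);
    return 0;
}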

llama.h

@ -6,7 +6,7 @@
#include <stdbool.h>
#ifdef LLAMA_SHARED
# ifdef _WIN32
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
@ -20,7 +20,7 @@
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
@ -45,7 +45,7 @@ extern "C" {
} llama_token_data;
typedef void (*llama_progress_callback)(double progress, void *ctx);
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
@ -81,8 +81,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk);
int itype);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
@ -135,9 +134,9 @@ extern "C" {
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty);
float top_p,
float temp,
float repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
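
Two API changes land in this header: llama_model_quantize() loses the qk parameter (the quantization block size is now handled inside ggml), and the progress callback plus the sampling parameters switch from double to float. A hedged caller-side sketch of the new quantize entry point; the file paths are placeholders, and the itype convention is an assumption that it follows the same 2 = q4_0, 3 = q4_1 codes used for ftype in the loader above:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s model-f16.bin model-q4_0.bin\n", argv[0]);
        return 1;
    }

    // old call: llama_model_quantize(inp, out, itype, qk);
    // new call: the qk argument is gone, only itype selects the target type
    const int itype = 2;   // assumption: 2 -> q4_0, 3 -> q4_1

    if (llama_model_quantize(argv[1], argv[2], itype) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}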

speak.sh

@ -10,7 +10,15 @@
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# for Mac
say "$2"
if [ "$1" = "0" ]; then
say "$2"
elif [ "$1" = "1" ]; then
say -v "Samantha (Enhanced)" "$2"
elif [ "$1" = "2" ]; then
say -v "Daniel (Enhanced)" "$2"
elif [ "$1" = "3" ]; then
say -v "Veena (Enhanced)" "$2"
fi
# Eleven Labs
#

talk-llama.cpp

@ -6,6 +6,7 @@
#include "whisper.h"
#include "llama.h"
#include <map>
#include <cassert>
#include <cstdio>
#include <fstream>
@ -28,14 +29,15 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_id = 0;
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t max_tokens = 64;
int32_t audio_ctx = 0;
int32_t n_parts_llama = -1;
float vad_thold = 0.6f;
float vad_thold = 0.4f;
float freq_thold = 100.0f;
bool speed_up = false;
@ -45,7 +47,8 @@ struct whisper_params {
bool no_timestamps = true;
bool verbose_prompt = false;
std::string person = "Georgi";
std::string name_ni = "Georgi"; // natural intelligence
std::string name_ai = "LLaMA"; // artificial intelligence
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
@ -64,24 +67,26 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vid" || arg == "--voice-id") { params.voice_id = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "-nni" || arg == "--name-ni") { params.name_ni = argv[++i]; }
else if (arg == "-nai" || arg == "--name-ai") { params.name_ai = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
@ -107,6 +112,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vid N, --voice-id N [%-7d] voice ID\n", params.voice_id);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
@ -117,7 +123,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -nni NAME,--name-ni NAME [%-7s] natural intelligence name\n", params.name_ni.c_str());
fprintf(stderr, " -nai NAME,--name-ai NAME [%-7s] artificial intelligence name\n", params.name_ai.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
@ -157,7 +164,7 @@ std::string transcribe(
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.n_threads = 2;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
@ -165,6 +172,10 @@ std::string transcribe(
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
static int iter = params.voice_id;
std::this_thread::sleep_for(std::chrono::milliseconds(100*iter));
iter = (iter + 1) % 4;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
@ -197,25 +208,87 @@ std::string transcribe(
return result;
}
const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";
const std::vector<std::string> k_participants = {
"LLaMA",
"GGaMA",
"SSaMA",
"RRaMA",
};
const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
// homophones
const std::map<std::string, std::vector<std::string>> k_homophones = {
{ "LLaMA", { "llama", "Llama", "LLAMA", }, },
{ "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA", }, },
{ "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA", }, },
{ "RRaMA", { "rama", "Rama", "RAMA", "ramma", "Ramma", "RAMMA", }, },
};
const std::string k_prompt_whisper = R"(A conversation between {1}, {10}, {11}, {12} and {13}.)";
const std::map<std::string, std::string> k_prompt = {
{
k_participants.at(0),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
{10}, {11}, {12} and {13} respond with short and concise answers.
{10} is smart, objective, honest and kind. Never fails to give a meaningful and insightful answer and opinion.
{1} is leading the conversation and asking the questions.
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";
{1}{4} Hello {10}! What is your opinion on the current state of the world?
{10}{4} Great question {1}! I think we live in a very interesting time.
There are many things to be concerned about, but also many things to be optimistic about.
{1}{4} What advice would you give to a young person who is just starting out in life?
{10}{4} I would tell them to be patient and to not be afraid to fail.
It is important to learn from your mistakes and to keep trying.
{1}{4})"
},
{
k_participants.at(1),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{11} has critical thinking skills, is very knowledgeable and is a good listener. He is very humble and never arrogant.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {11}! What is your opinion on the current state of the world?
{11}{4} The world is about to experience a major change. We are on the verge of a new era.
{1}{4} What advice would you give to a young person who is just starting out in life?
{11}{4} My advice would be to be open minded and to be willing to learn from others.
{1}{4})"
},
{
k_participants.at(2),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{12} has strong leadership skills, strategic thinking, and innovative ideas. Has the ability to mentor and support young people.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {12}! What is your opinion on the current state of the world?
{12}{4} Our future is bright. We are living in a time of great opportunity.
{1}{4} What advice would you give to a young person who is just starting out in life?
{12}{4} I would tell them to be brave and to be willing to take risks.
{1}{4})"
},
{
k_participants.at(3),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{13} is rude, arrogant, and has a bad attitude. He is very opinionated and never listens to others.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {13}! What is your opinion on the current state of the world?
{13}{4} The world is a terrible place. It is full of evil and corruption.
{1}{4} What advice would you give to a young person who is just starting out in life?
{13}{4} I would tell them to be selfish and to never trust anyone.
{1}{4})"
},
};
int main(int argc, char ** argv) {
whisper_params params;
@ -286,21 +359,48 @@ int main(int argc, char ** argv) {
float prob0 = 0.0f;
const std::string chat_symb = ":";
const std::string bot_name = "LLaMA";
const std::string name_ni = params.name_ni;
const std::string name_ai = params.name_ai;
// the participant that was referenced last
std::string name_ref = name_ni;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);
std::string prompt_whisper = k_prompt_whisper;
prompt_whisper = ::replace(prompt_whisper, "{1}", name_ni);
prompt_whisper = ::replace(prompt_whisper, "{10}", k_participants.at(0));
prompt_whisper = ::replace(prompt_whisper, "{11}", k_participants.at(1));
prompt_whisper = ::replace(prompt_whisper, "{12}", k_participants.at(2));
prompt_whisper = ::replace(prompt_whisper, "{13}", k_participants.at(3));
// construct the initial prompt for LLaMA inference
std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
std::string prompt_llama = params.prompt.empty() ? k_prompt.find(name_ai)->second : params.prompt;
// need to have leading ' '
prompt_llama.insert(0, 1, ' ');
prompt_llama = ::replace(prompt_llama, "{0}", params.person);
prompt_llama = ::replace(prompt_llama, "{1}", bot_name);
prompt_llama = ::replace(prompt_llama, "{1}", name_ni);
prompt_llama = ::replace(prompt_llama, "{10}", k_participants.at(0));
prompt_llama = ::replace(prompt_llama, "{11}", k_participants.at(1));
prompt_llama = ::replace(prompt_llama, "{12}", k_participants.at(2));
prompt_llama = ::replace(prompt_llama, "{13}", k_participants.at(3));
{
// get date string
std::string date_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%d/%m/%Y", now);
date_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{1}", date_str);
}
{
// get time string
@ -343,21 +443,27 @@ int main(int argc, char ** argv) {
}
if (params.verbose_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_whisper.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_llama.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fflush(stdout);
}
printf("%s : done! start speaking in the microphone\n", __func__);
printf("\n");
printf("%s%s", params.person.c_str(), chat_symb.c_str());
printf("%s%s", name_ni.c_str(), chat_symb.c_str());
fflush(stdout);
// clear audio buffer
audio.clear();
// text inference variables
const int voice_id = 2;
const int voice_id = params.voice_id;
const int n_keep = embd_inp.size();
const int n_ctx = llama_n_ctx(ctx_llama);
@ -368,9 +474,15 @@ int main(int argc, char ** argv) {
// reverse prompts for detecting when it's time to stop speaking
std::vector<std::string> antiprompts = {
params.person + chat_symb,
name_ni + chat_symb,
};
for (const auto & p : k_participants) {
antiprompts.push_back(p + chat_symb);
}
std::string text_heard_all;
// main loop
while (is_running) {
// handle Ctrl + C
@ -386,7 +498,7 @@ int main(int argc, char ** argv) {
int64_t t_ms = 0;
{
audio.get(2000, pcmf32_cur);
audio.get(15000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
@ -432,104 +544,145 @@ int main(int argc, char ** argv) {
force_speak = false;
text_heard.insert(0, 1, ' ');
text_heard += "\n" + bot_name + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
if (text_heard[0] != ' ') {
text_heard.insert(0, 1, ' ');
}
embd = ::llama_tokenize(ctx_llama, text_heard, false);
// text inference
bool done = false;
std::string text_to_speak;
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
// replace homophones
for (const auto & homophone : k_homophones) {
for (const auto & word : homophone.second) {
text_heard = ::replace(text_heard, word, homophone.first);
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
// check which participant was mentioned
const auto name_ref_old = name_ref;
for (const auto & participant : k_participants) {
if (participant == name_ref) {
continue;
}
if (text_heard.find(participant) != std::string::npos) {
name_ref = participant;
break;
}
}
if (name_ref == name_ref_old && name_ref != name_ai) {
name_ref = name_ni;
}
text_heard += "\n" + name_ref + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
text_heard_all += text_heard;
// keep only last 100 characters
if (text_heard_all.size() > 100) {
text_heard_all = text_heard_all.substr(text_heard_all.size() - 100);
}
if (name_ref != name_ai) {
} else {
// text inference
bool done = false;
std::string text_to_speak;
embd = ::llama_tokenize(ctx_llama, text_heard_all, false);
text_heard_all.clear();
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.20f;
const float repeat_penalty = 1.0764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
}
// new line
if (id == 13) {
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (const std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
}
audio.clear();
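
talk-llama now hosts a round-table: the transcription is normalized through the k_homophones map so that misheard names like "gamma" resolve to the canonical participant "GGaMA", and the last-referenced participant decides whether the local LLaMA instance should answer at all. A small self-contained sketch of that normalization step; the replace() helper here is an assumption standing in for the ::replace used above, which is defined elsewhere in the example:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// assumption: an all-occurrences replace, standing in for the ::replace helper
static std::string replace(std::string s, const std::string & from, const std::string & to) {
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
        s.replace(pos, from.size(), to);
        pos += to.size();
    }
    return s;
}

int main() {
    // subset of the k_homophones table from the diff
    const std::map<std::string, std::vector<std::string>> k_homophones = {
        { "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA" } },
        { "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA" } },
    };

    std::string text_heard = "What do you think about that, gamma?";  // sample Whisper output

    // map every known mis-transcription back to the canonical participant name,
    // so the "which participant was mentioned" check can match it
    for (const auto & h : k_homophones) {
        for (const auto & word : h.second) {
            text_heard = replace(text_heard, word, h.first);
        }
    }

    printf("%s\n", text_heard.c_str());   // "What do you think about that, GGaMA?"
    return 0;
}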

gpt-2.cpp

@ -325,9 +325,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -528,9 +531,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);

gpt-2.cpp

@ -325,9 +325,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = nullptr;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -528,9 +530,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);

ggml.c (1324 changed lines)

File diff suppressed because it is too large

ggml.h (11 changed lines)

@ -316,6 +316,7 @@ struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
void ggml_time_init(void); // call this once at the beginning of the program
@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@ -748,8 +753,8 @@ enum ggml_opt_result ggml_opt(
// quantization
//
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info
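
The new no_alloc flag is what makes the mmap loading possible: a context created with no_alloc = true only lays out tensor metadata in its memory pool and leaves the data pointers for the caller to fill in, for example with 32-byte-aligned addresses inside the mapped model file. A short sketch of that usage, with a static array standing in for the mapping:

#include "ggml.h"
#include <cstdint>
#include <vector>

int main() {
    // metadata-only context: with no_alloc set, ggml does not reserve space
    // for tensor data inside mem_buffer, so a small pool holds the headers
    std::vector<uint8_t> buf(1024*1024);
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);

    // the caller supplies the storage: in llama.cpp this is an aligned offset
    // inside the mmap'd model file; here a static array stands in
    static float storage[8*8];
    w->data = storage;

    ggml_set_zero(w);   // safe now: the tensor points at real memory

    ggml_free(ctx);
    return 0;
}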

talk-ggama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "GGaMA" \
-t 8 -vid 1 --speak ./examples/talk-llama/speak.sh

talk-llama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "LLaMA" \
-t 8 -vid 0 --speak ./examples/talk-llama/speak.sh

talk-rrama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "RRaMA" \
-t 8 -vid 3 --speak ./examples/talk-llama/speak.sh

talk-ssama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "SSaMA" \
-t 8 -vid 2 --speak ./examples/talk-llama/speak.sh

whisper.cpp

@ -654,9 +654,11 @@ static bool kv_cache_init(
int n_ctx) {
cache.buf.resize(mem_bytes);
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
cache.ctx = ggml_init(params);
@ -688,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
cache.ctx = ggml_init(params);
@ -1028,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = wctx.model.buf->size();
params.mem_buffer = wctx.model.buf->data();
struct ggml_init_params params = {
/*.mem_size =*/ wctx.model.buf->size(),
/*.mem_buffer =*/ wctx.model.buf->data(),
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -1344,9 +1350,11 @@ static bool whisper_encode_internal(
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -1797,9 +1805,11 @@ static bool whisper_decode_internal(
//WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -4726,6 +4736,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(gparams);