mirror of https://github.com/ggerganov/whisper.cpp.git
build : fix after master merge
parent afc84b35b0
commit b0306cd5cf
@@ -23,6 +23,7 @@ add_library(${TARGET} STATIC
     common.cpp
     common-ggml.h
     common-ggml.cpp
+    grammar-parser.cpp
     )

 include(DefaultTargetOptions)
@@ -1164,7 +1164,7 @@ static bool llama_eval_internal(
         const llama_token * tokens,
         const int   n_tokens,
         const int   n_past,
-        const int   n_threads) {
+              int   n_threads) {

     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1190,6 +1190,8 @@ static bool llama_eval_internal(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;

+    const float eps = 5e-6f; // TODO: take from hparams
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;

@@ -1204,7 +1206,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
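This hunk is also why n_threads loses its const qualifier in the signature above: the effective thread count is now decided inside the function and handed to the compute call later, instead of being stored on the graph. A minimal sketch of the same heuristic (the helper name is hypothetical, the logic mirrors the diff):

    #include "ggml.h"

    // With a large batch and CPU BLAS, the BLAS library threads the matrix
    // multiplications itself; extra ggml threads would only spin-wait.
    static int effective_threads(int n_tokens, int n_threads) {
        const bool cpu_blas = ggml_cpu_has_blas() && !ggml_cpu_has_gpublas();
        return (n_tokens >= 32 && cpu_blas) ? 1 : n_threads;
    }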
@@ -1221,7 +1223,7 @@ static bool llama_eval_internal(

         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, eps);

             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@@ -1329,7 +1331,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, eps);

                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@@ -1367,7 +1369,7 @@ static bool llama_eval_internal(
     // norm
     {

-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, eps);

         // inpL = inpL*norm(broadcasted)
         inpL = ggml_mul(ctx0, inpL, model.norm);
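All three ggml_rms_norm call sites now pass the eps hoisted at the top of llama_eval_internal. For reference, a scalar sketch of RMSNorm semantics (not ggml's kernel) showing where eps enters; the ggml_mul that follows each call applies the learned norm weight:

    #include <cmath>
    #include <vector>

    // y[i] = x[i] / sqrt(mean(x^2) + eps); eps guards against division by
    // zero and matches the 5e-6f constant introduced in the diff above.
    static std::vector<float> rms_norm_ref(const std::vector<float> & x, float eps = 5e-6f) {
        double sum = 0.0;
        for (float v : x) sum += (double) v*v;
        const float scale = 1.0f/std::sqrt((float)(sum/x.size()) + eps);
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) y[i] = x[i]*scale;
        return y;
    }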
@@ -1384,8 +1386,8 @@ static bool llama_eval_internal(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);

     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_build_forward_expand  (&gf, inpL);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
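This is the core of the merge fix: upstream ggml dropped the n_threads field from ggml_cgraph, and ggml_graph_compute(ctx, &gf) became ggml_graph_compute_with_ctx(ctx, &gf, n_threads). A self-contained sketch of the new pattern under that assumption (sizes and values are arbitrary):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, 2.0f);
        ggml_set_f32(b, 3.0f);

        struct ggml_tensor * c = ggml_mul(ctx, a, b); // c = a*b, element-wise

        ggml_cgraph gf = {};                          // no gf.n_threads anymore
        ggml_build_forward_expand(&gf, c);
        ggml_graph_compute_with_ctx(ctx, &gf, 4);     // threads passed explicitly

        printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // 6.0
        ggml_free(ctx);
        return 0;
    }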
@@ -2488,8 +2490,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }

         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+        ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);

         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
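The same migration applied to the LoRA path: the two-line compute sequence collapses into one call. For context, r is the adapted weight (base plus low-rank delta); a hedged sketch of the surrounding pattern, with illustrative tensor names and the per-adapter scaling step omitted:

    #include "ggml.h"

    // W' = W + B*A; variable names here are illustrative, not the actual ones.
    static void apply_lora_delta(struct ggml_context * lora_ctx,
                                 struct ggml_tensor  * W,
                                 struct ggml_tensor  * loraA,
                                 struct ggml_tensor  * loraB,
                                 int n_threads) {
        struct ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
        struct ggml_tensor * r  = ggml_add_inplace(lora_ctx, W, BA);

        struct ggml_cgraph gf = ggml_build_forward(r);
        ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
    }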
@@ -2635,7 +2636,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -2655,7 +2655,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);

         ggml_free(cpy_ctx);
     }
@@ -2743,7 +2743,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -2763,7 +2762,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);

         ggml_free(cpy_ctx);
     }
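Both llama_copy_state_data and llama_set_state_data rely on the same trick: a tiny no_alloc context holds only tensor metadata, one tensor's data pointer is aimed at the caller's buffer, and a single-op ggml_cpy graph performs the copy on one thread. A condensed sketch of that pattern (helper name and 1-D shape are illustrative; the real code uses 3-D K/V views):

    #include "ggml.h"
    #include <cstdint>

    static void copy_tensor_to_buffer(struct ggml_tensor * src, void * dst) {
        static uint8_t buffer[4096]; // scratch for graph/tensor metadata only
        struct ggml_init_params params = { sizeof(buffer), buffer, /* no_alloc */ true };
        struct ggml_context * cpy_ctx = ggml_init(params);

        struct ggml_tensor * out = ggml_new_tensor_1d(cpy_ctx, src->type, ggml_nelements(src));
        out->data = dst; // view over the caller's buffer; nothing is allocated

        ggml_cgraph gf = {};
        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, src, out));
        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1); // single thread, as in the diff

        ggml_free(cpy_ctx);
    }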