move BLAS to a separate backend (cont) (llama/6210)

ggml-ci
2025-06-12 20:18:08 +00:00 · 2024-06-16 13:57:37 +03:00
parent 922971041b
commit de29b193f6
5 changed files with 773 additions and 1 deletion
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,7 +21,7 @@ struct gpt_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict    = 200;  // new tokens to predict
    int32_t n_parallel   = 1;    // number of parallel streams
-    int32_t n_batch      = 8;    // batch size for prompt processing
+    int32_t n_batch      = 32;   // batch size for prompt processing
    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
    int32_t n_gpu_layers = 0;    // number of layers to offload to the GPU