whisper : reduce memory usage during inference (#431)

* ggml : add "scratch" buffer support * ggml : support for scratch ring-buffer * ggml : bug fix in ggml_repeat() * ggml : error on scratch buffer overflow * whisper : use scratch buffers during inference (base model only) * whisper : update memory usage for all models * whisper : fix encoder memory usage * whisper : use whisper_context functions instead of macros * whisper : fix FF + remove it from README * ggml : reuse ggml_new_i32 * ggml : refactor the scratch buffer storage * whisper : reorder scratch buffers in the decoder * main : add option to disable temp fallback * Update README.md
2025-06-19 07:18:07 +00:00 · 2023-02-04 09:45:52 +02:00
parent c306a7fd89
commit f3ee4a9673
7 changed files with 702 additions and 472 deletions
--- a/ggml.h
+++ b/ggml.h
@ -301,6 +301,13 @@ struct ggml_cgraph {
    int64_t perf_time_us;
 };

+// scratch buffer
+struct ggml_scratch {
+    size_t offs;
+    size_t size;
+    void * data;
+};
+
 struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
@ -327,6 +334,8 @@ void ggml_free(struct ggml_context * ctx);

 size_t ggml_used_mem(const struct ggml_context * ctx);

+size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,