talk-llama : sync llama.cpp (#2709)

2025-06-12 20:18:08 +00:00 · 2025-01-13 08:55:48 +02:00
parent 45d3faf961
commit 35d0e02c72
36 changed files with 12960 additions and 11341 deletions
--- a/examples/talk-llama/llama-impl.h
+++ b/examples/talk-llama/llama-impl.h
@ -1,10 +1,9 @@
 #pragma once

-#include "llama.h"
+#include "ggml.h" // for ggml_log_level

 #include <string>
 #include <vector>
-#include <stdexcept>

 #ifdef __GNUC__
 #ifdef __MINGW32__
@ -35,147 +34,28 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 // helpers
 //

-struct time_meas {
-    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};

-    ~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
-    }
+struct time_meas {
+    time_meas(int64_t & t_acc, bool disable = false);
+    ~time_meas();

    const int64_t t_start_us;

    int64_t & t_acc;
 };

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    if (search.empty()) {
-        return;
-    }
-    std::string builder;
-    builder.reserve(s.length());
-    size_t pos = 0;
-    size_t last_pos = 0;
-    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        builder.append(s, last_pos, pos - last_pos);
-        builder.append(replace);
-        last_pos = pos + search.length();
-    }
-    builder.append(s, last_pos, std::string::npos);
-    s = std::move(builder);
-}
+void replace_all(std::string & s, const std::string & search, const std::string & replace);

-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
+// TODO: rename to llama_format ?
+LLAMA_ATTRIBUTE_FORMAT(1, 2)
+std::string format(const char * fmt, ...);

-// the ring buffer works similarly to std::deque, but with a fixed capacity
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
+std::string llama_format_tensor_shape(const struct ggml_tensor * t);

-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (capacity == 0) {
-            throw std::runtime_error("ring buffer: capacity is zero");
-        }
-
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    //T & operator[](size_t i) {
-    //    if (i >= sz) {
-    //        throw std::runtime_error("ring buffer: index out of bounds");
-    //    }
-    //    return data[(first + i) % capacity];
-    //}
-
-    //const T & at(size_t i) const {
-    //    if (i >= sz) {
-    //        throw std::runtime_error("ring buffer: index out of bounds");
-    //    }
-    //    return data[(first + i) % capacity];
-    //}
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-    std::vector<T> data;
-};
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);