mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-04-04 01:49:06 +00:00
talk-llama : sync llama.cpp
Some checks failed
Bindings Tests (Ruby) / ubuntu-latest (push) Has been cancelled
CI / ubuntu-latest (linux/amd64) (push) Has been cancelled
CI / ubuntu-latest (linux/ppc64le) (push) Has been cancelled
CI / ubuntu-latest-arm64 (linux/arm64) (push) Has been cancelled
CI / ubuntu-latest-arm-v7 (linux/arm/v7) (push) Has been cancelled
CI / macOS-latest (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm64 (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm64 (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm-v7 (linux/arm/v7, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm-v7 (linux/arm/v7, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, ADDRESS) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, THREAD) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, UNDEFINED) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Has been cancelled
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Has been cancelled
CI / windows (Win32, Release, win32-x86, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows (x64, Release, win32-x86-64, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (Win32, ON, Release, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (x64, ON, Release, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 11.8.0, ON, 2.28.5) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 12.2.0, ON, 2.28.5) (push) Has been cancelled
CI / emscripten (Release) (push) Has been cancelled
CI / ios-xcode-build (Release) (push) Has been cancelled
CI / android (push) Has been cancelled
CI / quantize (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main.Dockerfile platform:linux/amd64 tag:main]) (push) Has been cancelled
Some checks failed
Bindings Tests (Ruby) / ubuntu-latest (push) Has been cancelled
CI / ubuntu-latest (linux/amd64) (push) Has been cancelled
CI / ubuntu-latest (linux/ppc64le) (push) Has been cancelled
CI / ubuntu-latest-arm64 (linux/arm64) (push) Has been cancelled
CI / ubuntu-latest-arm-v7 (linux/arm/v7) (push) Has been cancelled
CI / macOS-latest (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm64 (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm64 (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm-v7 (linux/arm/v7, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm-v7 (linux/arm/v7, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, ADDRESS) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, THREAD) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, UNDEFINED) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Has been cancelled
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Has been cancelled
CI / windows (Win32, Release, win32-x86, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows (x64, Release, win32-x86-64, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (Win32, ON, Release, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (x64, ON, Release, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 11.8.0, ON, 2.28.5) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 12.2.0, ON, 2.28.5) (push) Has been cancelled
CI / emscripten (Release) (push) Has been cancelled
CI / ios-xcode-build (Release) (push) Has been cancelled
CI / android (push) Has been cancelled
CI / quantize (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main.Dockerfile platform:linux/amd64 tag:main]) (push) Has been cancelled
This commit is contained in:
parent
19d95f9f9a
commit
99b011a9f5
@ -1,5 +1,7 @@
|
||||
#include "llama-adapter.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-model.h"
|
||||
|
||||
#include <algorithm>
|
||||
@ -9,7 +11,7 @@
|
||||
|
||||
// vec
|
||||
|
||||
struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
|
||||
struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
|
||||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
@ -17,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
|
||||
return tensors[il];
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
|
||||
struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
|
||||
ggml_tensor * layer_dir = tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx, cur, layer_dir);
|
||||
@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
|
||||
return cur;
|
||||
}
|
||||
|
||||
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
|
||||
bool llama_adapter_cvec::init(const llama_model & model) {
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
GGML_ASSERT(cvec.tensors.empty());
|
||||
GGML_ASSERT(cvec.ctxs.empty());
|
||||
GGML_ASSERT(cvec.bufs.empty());
|
||||
GGML_ASSERT(tensors.empty());
|
||||
GGML_ASSERT(ctxs.empty());
|
||||
GGML_ASSERT(bufs.empty());
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
||||
}
|
||||
|
||||
ctx_map[buft] = ctx;
|
||||
cvec.ctxs.emplace_back(ctx);
|
||||
ctxs.emplace_back(ctx);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
||||
};
|
||||
|
||||
// make tensors
|
||||
cvec.tensors.reserve(hparams.n_layer);
|
||||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
tensors.reserve(hparams.n_layer);
|
||||
tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
for (size_t il = 1; il < hparams.n_layer; il++) {
|
||||
ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
|
||||
ggml_backend_buffer_type_t buft = model.select_buft(il);
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
|
||||
cvec.tensors.push_back(tensor);
|
||||
tensors.push_back(tensor);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
cvec.bufs.reserve(ctx_map.size());
|
||||
bufs.reserve(ctx_map.size());
|
||||
for (auto it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
cvec.bufs.emplace_back(buf);
|
||||
bufs.emplace_back(buf);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t llama_control_vector_apply(
|
||||
struct llama_control_vector & cvec,
|
||||
int32_t llama_adapter_cvec::apply(
|
||||
const llama_model & model,
|
||||
const float * data,
|
||||
size_t len,
|
||||
@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
|
||||
|
||||
if (data == nullptr) {
|
||||
// disable the current control vector (but leave allocated for later)
|
||||
cvec.layer_start = -1;
|
||||
cvec.layer_end = -1;
|
||||
layer_start = -1;
|
||||
layer_end = -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (cvec.tensors.empty()) {
|
||||
if (!llama_control_vector_init(cvec, model)) {
|
||||
if (tensors.empty()) {
|
||||
if (!init(model)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
cvec.layer_start = il_start;
|
||||
cvec.layer_end = il_end;
|
||||
layer_start = il_start;
|
||||
layer_end = il_end;
|
||||
|
||||
for (size_t il = 1; il < hparams.n_layer; il++) {
|
||||
assert(cvec.tensors[il] != nullptr);
|
||||
assert(tensors[il] != nullptr);
|
||||
|
||||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
||||
if (off + n_embd <= len) {
|
||||
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
|
||||
ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
|
||||
}
|
||||
}
|
||||
|
||||
@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
|
||||
|
||||
// lora
|
||||
|
||||
llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
|
||||
llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
|
||||
const std::string name(w->name);
|
||||
|
||||
const auto pos = ab_map.find(name);
|
||||
@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
|
||||
delete adapter;
|
||||
}
|
||||
|
||||
static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
|
||||
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
|
||||
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
|
||||
|
||||
ggml_context * ctx_init;
|
||||
@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
};
|
||||
|
||||
// bundle lora_a and lora_b into pairs
|
||||
std::map<std::string, llama_lora_weight> ab_map;
|
||||
std::map<std::string, llama_adapter_lora_weight> ab_map;
|
||||
auto str_endswith = [](const std::string & str, const std::string & suffix) {
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
};
|
||||
@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
if (str_endswith(name, ".lora_a")) {
|
||||
replace_all(name, ".lora_a", "");
|
||||
if (ab_map.find(name) == ab_map.end()) {
|
||||
ab_map[name] = llama_lora_weight(cur, nullptr);
|
||||
ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
|
||||
} else {
|
||||
ab_map[name].a = cur;
|
||||
}
|
||||
} else if (str_endswith(name, ".lora_b")) {
|
||||
replace_all(name, ".lora_b", "");
|
||||
if (ab_map.find(name) == ab_map.end()) {
|
||||
ab_map[name] = llama_lora_weight(nullptr, cur);
|
||||
ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
|
||||
} else {
|
||||
ab_map[name].b = cur;
|
||||
}
|
||||
} else if (str_endswith(name, "_norm.weight")) {
|
||||
// TODO: add support for norm vector
|
||||
// for now, we don't really care because most adapters still work fine without it
|
||||
continue;
|
||||
} else {
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
|
||||
}
|
||||
@ -250,25 +251,33 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
// add tensors
|
||||
for (auto & it : ab_map) {
|
||||
const std::string & name = it.first;
|
||||
llama_lora_weight & w = it.second;
|
||||
llama_adapter_lora_weight & w = it.second;
|
||||
bool is_token_embd = str_endswith(name, "token_embd.weight");
|
||||
|
||||
if (!w.a || !w.b) {
|
||||
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
|
||||
}
|
||||
|
||||
// device buft and device ctx
|
||||
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
|
||||
const auto * model_tensor = model.get_tensor(name.c_str());
|
||||
if (!model_tensor) {
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
|
||||
}
|
||||
|
||||
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
|
||||
// validate tensor shape
|
||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
|
||||
}
|
||||
if (w.a->ne[1] != w.b->ne[0]) {
|
||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||
if (is_token_embd) {
|
||||
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
|
||||
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
|
||||
}
|
||||
} else {
|
||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
|
||||
}
|
||||
if (w.a->ne[1] != w.b->ne[0]) {
|
||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||
}
|
||||
}
|
||||
|
||||
// save tensor to adapter
|
||||
@ -276,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
|
||||
ggml_set_name(tensor_a, w.a->name);
|
||||
ggml_set_name(tensor_b, w.b->name);
|
||||
adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
|
||||
adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
|
||||
}
|
||||
|
||||
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
|
||||
struct llama_lora_adapter * adapter = new llama_lora_adapter();
|
||||
struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
|
||||
struct llama_adapter_lora * adapter = new llama_adapter_lora();
|
||||
|
||||
try {
|
||||
llama_lora_adapter_init_impl(*model, path_lora, *adapter);
|
||||
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
||||
return adapter;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
|
||||
delete adapter;
|
||||
}
|
||||
|
@ -1,66 +1,74 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
// TODO: pimpl
|
||||
|
||||
//
|
||||
// llama_adapter_cvec
|
||||
//
|
||||
|
||||
// TODO: rename to llama_adapter_cvec
|
||||
struct llama_control_vector {
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
struct llama_adapter_cvec {
|
||||
struct ggml_tensor * tensor_for(int il) const;
|
||||
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
|
||||
|
||||
int32_t apply(
|
||||
const llama_model & model,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
private:
|
||||
bool init(const llama_model & model);
|
||||
|
||||
int32_t layer_start = -1;
|
||||
int32_t layer_end = -1;
|
||||
|
||||
struct ggml_tensor * tensor_for(int il) const;
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
};
|
||||
|
||||
int32_t llama_control_vector_apply(
|
||||
struct llama_control_vector & cvec,
|
||||
const llama_model & model,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
//
|
||||
// llama_adapter_lora
|
||||
//
|
||||
|
||||
// TODO: rename to llama_adapter_lora_weight
|
||||
struct llama_lora_weight {
|
||||
struct llama_adapter_lora_weight {
|
||||
struct ggml_tensor * a = nullptr;
|
||||
struct ggml_tensor * b = nullptr;
|
||||
|
||||
llama_lora_weight() = default;
|
||||
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
|
||||
// get actual scale based on rank and alpha
|
||||
float get_scale(float alpha, float adapter_scale) const {
|
||||
const float rank = (float) b->ne[0];
|
||||
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
|
||||
return scale;
|
||||
}
|
||||
|
||||
llama_adapter_lora_weight() = default;
|
||||
llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
|
||||
};
|
||||
|
||||
// TODO: rename to llama_adapter_lora
|
||||
struct llama_lora_adapter {
|
||||
struct llama_adapter_lora {
|
||||
// map tensor name to lora_a_b
|
||||
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
|
||||
std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
float alpha;
|
||||
|
||||
llama_lora_adapter() = default;
|
||||
~llama_lora_adapter() = default;
|
||||
llama_adapter_lora() = default;
|
||||
~llama_adapter_lora() = default;
|
||||
|
||||
llama_lora_weight * get_weight(struct ggml_tensor * w);
|
||||
llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
|
||||
};
|
||||
|
@ -27,6 +27,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PHIMOE, "phimoe" },
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
@ -56,6 +57,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
|
||||
{ LLM_ARCH_GRANITE, "granite" },
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@ -105,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
|
||||
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
|
||||
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
|
||||
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@ -175,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
||||
@ -584,6 +588,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PHIMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PLAMO,
|
||||
{
|
||||
@ -1144,6 +1169,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
|
||||
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
|
||||
@ -1161,6 +1187,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_RWKV6QWEN2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
|
||||
{ LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
|
||||
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
|
||||
{ LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
|
||||
{ LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
|
||||
{ LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
|
||||
{ LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
|
||||
{ LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GRANITE,
|
||||
{
|
||||
@ -1343,6 +1395,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
|
||||
{LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
|
@ -31,6 +31,7 @@ enum llm_arch {
|
||||
LLM_ARCH_QWEN2VL,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PHIMOE,
|
||||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
@ -60,6 +61,7 @@ enum llm_arch {
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_RWKV6,
|
||||
LLM_ARCH_RWKV6QWEN2,
|
||||
LLM_ARCH_GRANITE,
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@ -109,6 +111,7 @@ enum llm_kv {
|
||||
LLM_KV_TIME_DECAY_EXTRA_DIM,
|
||||
LLM_KV_RESIDUAL_SCALE,
|
||||
LLM_KV_EMBEDDING_SCALE,
|
||||
LLM_KV_TOKEN_SHIFT_COUNT,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@ -173,6 +176,7 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
|
||||
LLM_KV_TOKENIZER_FIM_PRE_ID,
|
||||
LLM_KV_TOKENIZER_FIM_SUF_ID,
|
||||
LLM_KV_TOKENIZER_FIM_MID_ID,
|
||||
@ -252,6 +256,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_TIME_MIX_LERP_V,
|
||||
LLM_TENSOR_TIME_MIX_LERP_R,
|
||||
LLM_TENSOR_TIME_MIX_LERP_G,
|
||||
LLM_TENSOR_TIME_MIX_LERP_FUSED,
|
||||
LLM_TENSOR_TIME_MIX_FIRST,
|
||||
LLM_TENSOR_TIME_MIX_DECAY,
|
||||
LLM_TENSOR_TIME_MIX_DECAY_W1,
|
||||
|
@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
||||
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
||||
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
||||
{ "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
|
||||
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
||||
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
||||
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
||||
@ -73,7 +74,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return tmpl.find(haystack) != std::string::npos;
|
||||
};
|
||||
if (tmpl_contains("<|im_start|>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATML;
|
||||
return tmpl_contains("<|im_sep|>")
|
||||
? LLM_CHAT_TEMPLATE_PHI_4
|
||||
: LLM_CHAT_TEMPLATE_CHATML;
|
||||
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
|
||||
if (tmpl_contains("[SYSTEM_PROMPT]")) {
|
||||
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
|
||||
@ -269,6 +272,14 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
|
||||
// chatml template
|
||||
for (auto message : chat) {
|
||||
ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|im_start|>assistant<|im_sep|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
|
||||
// Falcon 3
|
||||
for (auto message : chat) {
|
||||
|
@ -15,6 +15,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
||||
LLM_CHAT_TEMPLATE_PHI_3,
|
||||
LLM_CHAT_TEMPLATE_PHI_4,
|
||||
LLM_CHAT_TEMPLATE_FALCON_3,
|
||||
LLM_CHAT_TEMPLATE_ZEPHYR,
|
||||
LLM_CHAT_TEMPLATE_MONARCH,
|
||||
|
@ -1,5 +1,8 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
@ -467,11 +470,12 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
|
||||
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
|
||||
const auto & cparams = lctx.cparams;
|
||||
const auto & hparams = lctx.model.hparams;
|
||||
const auto & vocab = lctx.model.vocab;
|
||||
|
||||
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
|
||||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = hparams.n_vocab;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
@ -504,7 +508,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
|
||||
auto * output_dev = lctx.model.dev_output.dev;
|
||||
auto * output_dev = lctx.model.dev_output();
|
||||
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
|
||||
if (output_dev_host_buft) {
|
||||
buft = output_dev_host_buft;
|
||||
@ -538,7 +542,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
|
||||
void llama_output_reorder(struct llama_context & ctx) {
|
||||
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
const uint32_t n_vocab = ctx.model.hparams.n_vocab;
|
||||
const uint32_t n_vocab = ctx.model.vocab.n_tokens();
|
||||
const uint32_t n_embd = ctx.model.hparams.n_embd;
|
||||
|
||||
const int32_t n_outputs = ctx.n_outputs;
|
||||
@ -722,7 +726,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
|
||||
}
|
||||
|
||||
return ctx->logits + j*ctx->model.hparams.n_vocab;
|
||||
return ctx->logits + j*ctx->model.vocab.n_tokens();
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
@ -882,7 +886,7 @@ struct llama_data_write {
|
||||
}
|
||||
|
||||
void write_logits(const struct llama_context * ctx) {
|
||||
const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
|
||||
const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
|
||||
|
||||
write(&logits_size, sizeof(logits_size));
|
||||
|
||||
|
@ -22,12 +22,12 @@ struct llama_context {
|
||||
|
||||
const struct llama_model & model;
|
||||
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sbatch sbatch; // TODO: revisit if needed
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_control_vector cvec;
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sbatch sbatch; // TODO: revisit if needed
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_adapter_cvec cvec;
|
||||
|
||||
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
|
||||
std::unordered_map<struct llama_adapter_lora *, float> lora;
|
||||
|
||||
std::vector<ggml_backend_ptr> backends;
|
||||
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
||||
|
@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const llama_token id = cur_p->data[i].id;
|
||||
const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
|
||||
const std::string & piece = grammar.vocab->token_to_piece(id);
|
||||
|
||||
if (llama_token_is_eog_impl(*grammar.vocab, id)) {
|
||||
if (grammar.vocab->is_eog(id)) {
|
||||
if (!allow_eog) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
|
||||
GGML_ASSERT(grammar.vocab != nullptr);
|
||||
|
||||
if (llama_token_is_eog_impl(*grammar.vocab, token)) {
|
||||
if (grammar.vocab->is_eog(token)) {
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
|
||||
const std::string & piece = grammar.vocab->token_to_piece(token);
|
||||
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||
|
@ -52,7 +52,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
||||
uint32_t llama_hparams::n_embd_k_s() const {
|
||||
if (wkv_head_size != 0) {
|
||||
// for RWKV models
|
||||
return 2 * n_embd;
|
||||
return token_shift_count * n_embd;
|
||||
}
|
||||
|
||||
// TODO: maybe support other convolution strides than 1
|
||||
|
@ -30,7 +30,6 @@ struct llama_hparams {
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
|
||||
uint32_t n_vocab = 0;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_embd_features = 0;
|
||||
@ -41,7 +40,6 @@ struct llama_hparams {
|
||||
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_vocab_type = 0; // for BERT-style token types
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
|
||||
// for WavTokenizer
|
||||
@ -76,6 +74,7 @@ struct llama_hparams {
|
||||
uint32_t time_mix_extra_dim = 0;
|
||||
uint32_t time_decay_extra_dim = 0;
|
||||
uint32_t wkv_head_size = 0;
|
||||
uint32_t token_shift_count = 2;
|
||||
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cinttypes>
|
||||
@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
|
@ -79,7 +79,7 @@ bool llama_kv_cache_init(
|
||||
|
||||
ggml_backend_buffer_type_t buft;
|
||||
if (offload) {
|
||||
auto * dev = model.dev_layer.at(i).dev;
|
||||
auto * dev = model.dev_layer(i);
|
||||
buft = ggml_backend_dev_buffer_type(dev);
|
||||
} else {
|
||||
buft = ggml_backend_cpu_buffer_type();
|
||||
|
@ -35,7 +35,7 @@
|
||||
|
||||
// TODO: consider moving to llama-impl.h if needed in more places
|
||||
#if defined(_WIN32)
|
||||
std::string llama_format_win_err(DWORD err) {
|
||||
static std::string llama_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
|
@ -7,6 +7,10 @@
|
||||
#include <cstring>
|
||||
#include <future>
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
static const size_t MiB = 1024*kiB;
|
||||
static const size_t GiB = 1024*MiB;
|
||||
|
||||
const char * llama_file_version_name(llama_fver version) {
|
||||
switch (version) {
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
||||
@ -17,8 +21,51 @@ const char * llama_file_version_name(llama_fver version) {
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
if (ftype & LLAMA_FTYPE_GUESSED) {
|
||||
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
||||
}
|
||||
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
||||
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
|
||||
case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
}
|
||||
|
||||
namespace GGUFMeta {
|
||||
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
|
||||
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
|
||||
struct GKV_Base_Type {
|
||||
static constexpr gguf_type gt = gt_;
|
||||
|
||||
@ -60,10 +107,11 @@ namespace GGUFMeta {
|
||||
public:
|
||||
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
|
||||
static ArrayInfo getter(const gguf_context *ctx, const int k) {
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
|
||||
return ArrayInfo {
|
||||
gguf_get_arr_type(ctx, k),
|
||||
arr_type,
|
||||
size_t(gguf_get_arr_n(ctx, k)),
|
||||
gguf_get_arr_data(ctx, k),
|
||||
arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
|
||||
};
|
||||
}
|
||||
};
|
||||
@ -553,7 +601,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(meta.get(), i);
|
||||
@ -1008,3 +1056,17 @@ bool llama_model_loader::load_all_data(
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string llama_model_loader::ftype_name() const {
|
||||
return llama_model_ftype_name(ftype);
|
||||
}
|
||||
|
||||
void llama_model_loader::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
|
||||
LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
|
||||
if (n_bytes < GiB) {
|
||||
LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
|
||||
}
|
||||
}
|
||||
|
@ -155,4 +155,8 @@ struct llama_model_loader {
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data);
|
||||
|
||||
std::string ftype_name() const;
|
||||
|
||||
void print_info() const;
|
||||
};
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -4,78 +4,80 @@
|
||||
#include "llama-arch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
struct llama_model_loader;
|
||||
|
||||
// available models
|
||||
// TODO: this enum does not follow the enum naming convention
|
||||
enum llm_type {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_14M,
|
||||
MODEL_17M,
|
||||
MODEL_22M,
|
||||
MODEL_33M,
|
||||
MODEL_60M,
|
||||
MODEL_70M,
|
||||
MODEL_80M,
|
||||
MODEL_109M,
|
||||
MODEL_137M,
|
||||
MODEL_160M,
|
||||
MODEL_220M,
|
||||
MODEL_250M,
|
||||
MODEL_270M,
|
||||
MODEL_335M,
|
||||
MODEL_410M,
|
||||
MODEL_450M,
|
||||
MODEL_770M,
|
||||
MODEL_780M,
|
||||
MODEL_0_5B,
|
||||
MODEL_1B,
|
||||
MODEL_1_3B,
|
||||
MODEL_1_4B,
|
||||
MODEL_1_5B,
|
||||
MODEL_1_6B,
|
||||
MODEL_2B,
|
||||
MODEL_2_8B,
|
||||
MODEL_3B,
|
||||
MODEL_4B,
|
||||
MODEL_6B,
|
||||
MODEL_6_9B,
|
||||
MODEL_7B,
|
||||
MODEL_8B,
|
||||
MODEL_9B,
|
||||
MODEL_11B,
|
||||
MODEL_12B,
|
||||
MODEL_13B,
|
||||
MODEL_14B,
|
||||
MODEL_15B,
|
||||
MODEL_16B,
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_32B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
MODEL_236B,
|
||||
MODEL_314B,
|
||||
MODEL_671B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
MODEL_A1_7B,
|
||||
MODEL_A2_7B,
|
||||
MODEL_8x7B,
|
||||
MODEL_8x22B,
|
||||
MODEL_16x12B,
|
||||
MODEL_10B_128x3_66B,
|
||||
MODEL_57B_A14B,
|
||||
MODEL_27B,
|
||||
LLM_TYPE_UNKNOWN,
|
||||
LLM_TYPE_14M,
|
||||
LLM_TYPE_17M,
|
||||
LLM_TYPE_22M,
|
||||
LLM_TYPE_33M,
|
||||
LLM_TYPE_60M,
|
||||
LLM_TYPE_70M,
|
||||
LLM_TYPE_80M,
|
||||
LLM_TYPE_109M,
|
||||
LLM_TYPE_137M,
|
||||
LLM_TYPE_160M,
|
||||
LLM_TYPE_220M,
|
||||
LLM_TYPE_250M,
|
||||
LLM_TYPE_270M,
|
||||
LLM_TYPE_335M,
|
||||
LLM_TYPE_410M,
|
||||
LLM_TYPE_450M,
|
||||
LLM_TYPE_770M,
|
||||
LLM_TYPE_780M,
|
||||
LLM_TYPE_0_5B,
|
||||
LLM_TYPE_1B,
|
||||
LLM_TYPE_1_3B,
|
||||
LLM_TYPE_1_4B,
|
||||
LLM_TYPE_1_5B,
|
||||
LLM_TYPE_1_6B,
|
||||
LLM_TYPE_2B,
|
||||
LLM_TYPE_2_8B,
|
||||
LLM_TYPE_3B,
|
||||
LLM_TYPE_4B,
|
||||
LLM_TYPE_6B,
|
||||
LLM_TYPE_6_9B,
|
||||
LLM_TYPE_7B,
|
||||
LLM_TYPE_8B,
|
||||
LLM_TYPE_9B,
|
||||
LLM_TYPE_11B,
|
||||
LLM_TYPE_12B,
|
||||
LLM_TYPE_13B,
|
||||
LLM_TYPE_14B,
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
LLM_TYPE_34B,
|
||||
LLM_TYPE_35B,
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_671B,
|
||||
LLM_TYPE_SMALL,
|
||||
LLM_TYPE_MEDIUM,
|
||||
LLM_TYPE_LARGE,
|
||||
LLM_TYPE_XL,
|
||||
LLM_TYPE_A1_7B,
|
||||
LLM_TYPE_A2_7B,
|
||||
LLM_TYPE_8x7B,
|
||||
LLM_TYPE_8x22B,
|
||||
LLM_TYPE_16x12B,
|
||||
LLM_TYPE_16x3_8B,
|
||||
LLM_TYPE_10B_128x3_66B,
|
||||
LLM_TYPE_57B_A14B,
|
||||
LLM_TYPE_27B,
|
||||
};
|
||||
|
||||
struct llama_layer_posnet {
|
||||
@ -240,15 +242,19 @@ struct llama_layer {
|
||||
struct ggml_tensor * time_mix_lerp_v = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_r = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_g = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_fused = nullptr;
|
||||
|
||||
struct ggml_tensor * time_mix_first = nullptr;
|
||||
struct ggml_tensor * time_mix_decay = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w1 = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w2 = nullptr;
|
||||
struct ggml_tensor * time_mix_key = nullptr;
|
||||
struct ggml_tensor * time_mix_value = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance = nullptr;
|
||||
struct ggml_tensor * time_mix_gate = nullptr;
|
||||
struct ggml_tensor * time_mix_first = nullptr;
|
||||
struct ggml_tensor * time_mix_decay = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w1 = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w2 = nullptr;
|
||||
struct ggml_tensor * time_mix_key = nullptr;
|
||||
struct ggml_tensor * time_mix_key_b = nullptr;
|
||||
struct ggml_tensor * time_mix_value = nullptr;
|
||||
struct ggml_tensor * time_mix_value_b = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance_b = nullptr;
|
||||
struct ggml_tensor * time_mix_gate = nullptr;
|
||||
|
||||
struct ggml_tensor * time_mix_ln = nullptr;
|
||||
struct ggml_tensor * time_mix_ln_b = nullptr;
|
||||
@ -281,11 +287,9 @@ struct llama_layer {
|
||||
};
|
||||
|
||||
struct llama_model {
|
||||
llm_type type = MODEL_UNKNOWN;
|
||||
llm_type type = LLM_TYPE_UNKNOWN;
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
|
||||
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||
|
||||
std::string name = "n/a";
|
||||
|
||||
llama_hparams hparams = {};
|
||||
@ -314,78 +318,55 @@ struct llama_model {
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
llama_model_params params;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
int n_gpu_layers;
|
||||
|
||||
std::vector<std::string> rpc_servers;
|
||||
|
||||
// list of devices used in this model
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
buft_list_t cpu_buft_list;
|
||||
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
|
||||
|
||||
struct layer_dev {
|
||||
ggml_backend_dev_t dev;
|
||||
buft_list_t * buft_list;
|
||||
};
|
||||
|
||||
layer_dev dev_input = {};
|
||||
layer_dev dev_output = {};
|
||||
std::vector<layer_dev> dev_layer;
|
||||
|
||||
// contexts where the model tensors metadata is stored
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
|
||||
// the model memory buffers for the tensor data
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// model memory mapped files
|
||||
llama_mmaps mappings;
|
||||
|
||||
// objects representing data potentially being locked in memory
|
||||
llama_mlocks mlock_bufs;
|
||||
llama_mlocks mlock_mmaps;
|
||||
|
||||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements = 0;
|
||||
explicit llama_model(const struct llama_model_params & params);
|
||||
~llama_model();
|
||||
|
||||
// total size of all the tensors in the model in bytes
|
||||
size_t n_bytes = 0;
|
||||
void load_stats (llama_model_loader & ml);
|
||||
void load_arch (llama_model_loader & ml);
|
||||
void load_hparams(llama_model_loader & ml);
|
||||
void load_vocab (llama_model_loader & ml);
|
||||
bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
|
||||
|
||||
std::string arch_name() const;
|
||||
std::string type_name() const;
|
||||
|
||||
std::string desc() const;
|
||||
|
||||
size_t size() const;
|
||||
size_t max_nodes() const;
|
||||
size_t n_devices() const;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements() const;
|
||||
|
||||
void print_info() const;
|
||||
|
||||
ggml_backend_dev_t dev_layer(int il) const;
|
||||
ggml_backend_dev_t dev_output() const;
|
||||
|
||||
ggml_backend_buffer_type_t select_buft(int il) const;
|
||||
|
||||
const struct ggml_tensor * get_tensor(const char * name) const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
|
||||
const char * llm_type_name(llm_type type);
|
||||
|
||||
std::string llama_model_arch_name (const llama_model & model);
|
||||
std::string llama_model_type_name (const llama_model & model);
|
||||
std::string llama_model_ftype_name(const llama_model & model);
|
||||
|
||||
// used by llama_adapter_cvec
|
||||
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
|
||||
|
||||
// used by llama_adapter_lora
|
||||
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
|
||||
|
||||
size_t llama_model_max_nodes(const llama_model & model);
|
||||
|
||||
struct llama_model_loader;
|
||||
|
||||
// TODO: become llama_model methods
|
||||
void llm_load_stats (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_arch (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_hparams (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_vocab (llama_model_loader & ml, llama_model & model);
|
||||
void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
|
||||
|
@ -7,14 +7,12 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
// TODO: replace with ggml API call
|
||||
#define QK_K 256
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
char zero = 0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
||||
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->output_tensor_type;
|
||||
} else {
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
const int64_t nx = tensor->ne[0];
|
||||
const int64_t qk_k = ggml_blck_size(new_type);
|
||||
|
||||
if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
if (qs.model.type == MODEL_70B) {
|
||||
if (qs.model.type == LLM_TYPE_70B) {
|
||||
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||
@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
||||
//}
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
||||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
||||
new_type == GGML_TYPE_IQ1_M) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
|
||||
{
|
||||
const int64_t nx = tensor->ne[0];
|
||||
const int64_t ny = tensor->ne[1];
|
||||
const int64_t qk_k = ggml_blck_size(new_type);
|
||||
|
||||
if (nx % qk_k != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
|
||||
convert_incompatible_tensor = true;
|
||||
} else {
|
||||
++qs.n_k_quantized;
|
||||
}
|
||||
}
|
||||
|
||||
if (convert_incompatible_tensor) {
|
||||
switch (new_type) {
|
||||
case GGML_TYPE_TQ1_0:
|
||||
@ -526,18 +525,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
|
||||
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model;
|
||||
llm_load_arch (ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
llm_load_stats (ml, model);
|
||||
llama_model model(llama_model_default_params());
|
||||
|
||||
model.load_arch (ml);
|
||||
model.load_hparams(ml);
|
||||
model.load_stats (ml);
|
||||
|
||||
struct quantize_state_impl qs(model, params);
|
||||
|
||||
if (params->only_copy) {
|
||||
ftype = model.ftype;
|
||||
ftype = ml.ftype;
|
||||
}
|
||||
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
||||
if (params->imatrix) {
|
||||
@ -621,7 +622,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||
|
||||
// sanity checks
|
||||
// sanity checks for models that have attention layers
|
||||
if (qs.n_attention_wv != 0)
|
||||
{
|
||||
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
|
||||
// attention layers have a non-zero number of kv heads
|
||||
@ -759,6 +761,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
|
||||
|
||||
// do not quantize relative position bias (T5)
|
||||
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||
@ -875,7 +878,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
|
@ -371,7 +371,10 @@ void llama_sampler_free(struct llama_sampler * smpl) {
|
||||
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
// TODO: do not allocate each time
|
||||
std::vector<llama_token_data> cur;
|
||||
@ -1445,7 +1448,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
|
||||
|
||||
auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
|
||||
auto * result = llama_sampler_init_grammar(ctx->vocab, nullptr, nullptr);
|
||||
|
||||
// copy the state
|
||||
{
|
||||
@ -1481,19 +1484,19 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
|
||||
/* .free = */ llama_sampler_grammar_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
|
||||
struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
|
||||
auto * ctx = new llama_sampler_grammar;
|
||||
|
||||
if (grammar_str != nullptr && grammar_str[0] != '\0') {
|
||||
*ctx = {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
||||
/* .grammar = */ llama_grammar_init_impl(&vocab, grammar_str, grammar_root),
|
||||
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root),
|
||||
};
|
||||
} else {
|
||||
*ctx = {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ {},
|
||||
/* .grammar_root = */ {},
|
||||
/* .grammar = */ nullptr,
|
||||
@ -1663,8 +1666,8 @@ struct llama_sampler_dry {
|
||||
|
||||
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
|
||||
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
|
||||
for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
|
||||
std::string word = llama_detokenize(vocab, {token_id}, true);
|
||||
for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
|
||||
std::string word = vocab.detokenize({token_id}, true);
|
||||
if (word.find(str) != std::string::npos) {
|
||||
token_sequences.emplace(token_id, std::vector<llama_token>());
|
||||
} else {
|
||||
@ -1681,7 +1684,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
|
||||
}
|
||||
}
|
||||
if (match) {
|
||||
std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
|
||||
std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
|
||||
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
|
||||
tokenization.resize(max_tail_len);
|
||||
}
|
||||
@ -1937,7 +1940,7 @@ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler
|
||||
llama_vocab dummy_vocab;
|
||||
|
||||
// dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
|
||||
auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
|
||||
auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
|
||||
|
||||
// Copy the state, including the processed breakers
|
||||
{
|
||||
@ -1964,7 +1967,7 @@ static struct llama_sampler_i llama_sampler_dry_i = {
|
||||
/* .free = */ llama_sampler_dry_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
|
||||
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
|
||||
const int MAX_CHAR_LEN = 40;
|
||||
@ -1991,7 +1994,7 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
|
||||
sequence_break.resize(MAX_CHAR_LEN);
|
||||
}
|
||||
|
||||
get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
|
||||
get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2014,7 +2017,7 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
|
||||
// wrapper for test-sampling.cpp
|
||||
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
|
||||
llama_vocab dummy_vocab;
|
||||
auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
|
||||
auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
|
||||
auto * ctx = (llama_sampler_dry *) result->ctx;
|
||||
|
||||
// Process the token-based sequence breakers
|
||||
@ -2153,7 +2156,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
float p_eog_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
|
||||
p_eog_sum += cur_p->data[i].p;
|
||||
} else {
|
||||
p_txt_sum += cur_p->data[i].p;
|
||||
@ -2175,7 +2178,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
float p_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
|
||||
p_sum += cur_p->data[i].p;
|
||||
|
||||
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||
@ -2203,17 +2206,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
continue;
|
||||
}
|
||||
|
||||
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
if (len0 < 0) {
|
||||
ctx->buf0.resize(len0);
|
||||
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
assert(len0 > 0);
|
||||
}
|
||||
|
||||
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
if (len1 < 0) {
|
||||
ctx->buf1.resize(len1);
|
||||
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
assert(len1 > 0);
|
||||
}
|
||||
|
||||
@ -2248,7 +2251,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
|
||||
|
||||
if (cur_p->data[i].p < thold && !is_eog) {
|
||||
continue;
|
||||
@ -2269,7 +2272,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
|
||||
if (n_non_eog == 0) {
|
||||
cur_p->size = 1;
|
||||
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
|
||||
cur_p->data[0].id = ctx->vocab->token_eot();
|
||||
cur_p->data[0].logit = 1.0f;
|
||||
|
||||
return;
|
||||
@ -2291,7 +2294,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
|
||||
|
||||
if (cur_p->data[i].p < thold && !is_eog) {
|
||||
continue;
|
||||
@ -2314,7 +2317,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
|
||||
|
||||
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
|
||||
return llama_sampler_init_infill_impl(*ctx->vocab);
|
||||
return llama_sampler_init_infill(ctx->vocab);
|
||||
}
|
||||
|
||||
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
|
||||
@ -2330,14 +2333,13 @@ static struct llama_sampler_i llama_sampler_infill_i = {
|
||||
/* .free = */ llama_sampler_infill_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||
const struct llama_vocab & vocab) {
|
||||
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_infill_i,
|
||||
/* .ctx = */ new llama_sampler_infill {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .buf0 = */ std::vector<char>(512),
|
||||
/* .buf1 = */ std::vector<char>(512),
|
||||
/* .vocab = */ vocab,
|
||||
/* .buf0 = */ std::vector<char>(512),
|
||||
/* .buf1 = */ std::vector<char>(512),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
@ -2,7 +2,9 @@
|
||||
|
||||
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
|
||||
|
||||
#include "llama-grammar.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
struct llama_vocab;
|
||||
struct llama_grammar;
|
||||
@ -21,24 +23,6 @@ struct llama_sampler_chain {
|
||||
mutable int32_t n_sample;
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||
const struct llama_vocab & vocab);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
int32_t context_size,
|
||||
float dry_multiplier,
|
||||
float dry_base,
|
||||
int32_t dry_allowed_length,
|
||||
int32_t dry_penalty_last_n,
|
||||
const char ** seq_breakers,
|
||||
size_t num_breakers);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry_testing(
|
||||
int32_t context_size,
|
||||
float dry_multiplier,
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -4,179 +4,122 @@
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <memory>
|
||||
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
|
||||
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
struct llm_tokenizer;
|
||||
struct LLM_KV;
|
||||
struct llama_model_loader;
|
||||
|
||||
struct llama_vocab {
|
||||
using id = llama_token;
|
||||
using token = std::string;
|
||||
using tattr = llama_token_attr;
|
||||
|
||||
struct token_data {
|
||||
token text;
|
||||
float score;
|
||||
tattr attr;
|
||||
std::string text;
|
||||
float score;
|
||||
llama_token_attr attr;
|
||||
};
|
||||
|
||||
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
|
||||
int max_token_len = 0; // used for optimizing longest token search
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_data> id_to_token;
|
||||
|
||||
std::vector<id> cache_special_tokens;
|
||||
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
||||
|
||||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||
|
||||
// default LLaMA special tokens
|
||||
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
|
||||
id special_bos_id = 1;
|
||||
id special_eos_id = 2;
|
||||
id special_eot_id = LLAMA_TOKEN_NULL;
|
||||
id special_eom_id = LLAMA_TOKEN_NULL;
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = LLAMA_TOKEN_NULL;
|
||||
id special_pad_id = LLAMA_TOKEN_NULL;
|
||||
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
|
||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
id linefeed_id = 13;
|
||||
|
||||
// fim tokens
|
||||
id special_fim_pre_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_suf_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_mid_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_pad_id = LLAMA_TOKEN_NULL;
|
||||
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
|
||||
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
|
||||
|
||||
// set of all tokens that cause "end of generation"
|
||||
std::set<id> special_eog_ids;
|
||||
|
||||
// tokenizer flags
|
||||
bool tokenizer_add_space_prefix = false;
|
||||
bool tokenizer_add_bos = false;
|
||||
bool tokenizer_add_eos = false;
|
||||
bool tokenizer_ignore_merges = false;
|
||||
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
|
||||
bool tokenizer_remove_extra_whitespaces = false;
|
||||
bool tokenizer_escape_whitespaces = true;
|
||||
bool tokenizer_treat_whitespace_as_suffix = false;
|
||||
|
||||
std::vector<char> precompiled_charsmap;
|
||||
|
||||
llm_tokenizer * tokenizer = nullptr;
|
||||
|
||||
llama_vocab() = default;
|
||||
llama_vocab();
|
||||
~llama_vocab();
|
||||
|
||||
void load(llama_model_loader & ml, const LLM_KV & kv);
|
||||
|
||||
enum llama_vocab_type get_type() const;
|
||||
enum llama_vocab_pre_type get_pre_type() const;
|
||||
|
||||
uint32_t n_tokens() const;
|
||||
uint32_t n_token_types() const;
|
||||
|
||||
std::string type_name() const;
|
||||
|
||||
bool is_normal (llama_token id) const;
|
||||
bool is_unknown (llama_token id) const;
|
||||
bool is_control (llama_token id) const;
|
||||
bool is_byte (llama_token id) const;
|
||||
bool is_user_defined(llama_token id) const;
|
||||
bool is_unused (llama_token id) const;
|
||||
bool is_eog (llama_token id) const;
|
||||
|
||||
uint8_t token_to_byte(llama_token id) const;
|
||||
llama_token byte_to_token(uint8_t ch) const;
|
||||
|
||||
llama_token text_to_token(const std::string & text) const;
|
||||
|
||||
const token_data & get_token_data(llama_token id) const;
|
||||
|
||||
const char * token_get_text (llama_token id) const;
|
||||
float token_get_score(llama_token id) const;
|
||||
llama_token_attr token_get_attr (llama_token id) const;
|
||||
|
||||
llama_token token_bos() const;
|
||||
llama_token token_eos() const;
|
||||
llama_token token_eot() const;
|
||||
llama_token token_eom() const;
|
||||
llama_token token_unk() const;
|
||||
llama_token token_sep() const;
|
||||
llama_token token_nl () const;
|
||||
llama_token token_pad() const;
|
||||
|
||||
llama_token token_prefix() const;
|
||||
llama_token token_middle() const;
|
||||
llama_token token_suffix() const;
|
||||
|
||||
llama_token token_fim_pre() const;
|
||||
llama_token token_fim_suf() const;
|
||||
llama_token token_fim_mid() const;
|
||||
llama_token token_fim_pad() const;
|
||||
llama_token token_fim_rep() const;
|
||||
llama_token token_fim_sep() const;
|
||||
|
||||
bool get_add_space_prefix () const;
|
||||
bool get_add_bos () const;
|
||||
bool get_add_eos () const;
|
||||
bool get_ignore_merges () const;
|
||||
bool get_clean_spaces () const;
|
||||
bool get_remove_extra_whitespaces () const;
|
||||
bool get_escape_whitespaces () const;
|
||||
bool get_treat_whitespace_as_suffix() const;
|
||||
|
||||
int max_token_len() const;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
||||
|
||||
void init_tokenizer();
|
||||
int32_t tokenize(
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_special,
|
||||
bool parse_special) const;
|
||||
|
||||
std::vector<llama_token> tokenize(
|
||||
const std::string & raw_text,
|
||||
bool add_special,
|
||||
bool parse_special = false) const;
|
||||
|
||||
// does not write null-terminator to buf
|
||||
int32_t token_to_piece(
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length,
|
||||
int32_t lstrip,
|
||||
bool special) const;
|
||||
|
||||
// use cached data
|
||||
const std::string & token_to_piece(llama_token token) const;
|
||||
|
||||
int32_t detokenize(
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
int32_t text_len_max,
|
||||
bool remove_special,
|
||||
bool unparse_special) const;
|
||||
|
||||
std::string detokenize(
|
||||
const std::vector<llama_token> & tokens,
|
||||
bool special) const;
|
||||
|
||||
void print_info() const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
|
||||
//
|
||||
// internal API
|
||||
//
|
||||
|
||||
// TODO: rename to llama_tokenize_impl
|
||||
// TODO: This should probably be in llama.h
|
||||
std::vector<llama_vocab::id> llama_tokenize_internal(
|
||||
const llama_vocab & vocab,
|
||||
std::string raw_text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
// TODO: move the API below as member functions of llama_vocab
|
||||
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
|
||||
|
||||
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
||||
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
||||
|
||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
|
||||
|
||||
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
|
||||
|
||||
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
||||
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
||||
|
||||
int32_t llama_tokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_special,
|
||||
bool parse_special);
|
||||
|
||||
// does not write null-terminator to buf
|
||||
int32_t llama_token_to_piece_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length,
|
||||
int32_t lstrip,
|
||||
bool special);
|
||||
|
||||
// check if token0 is contained as a prefix in token1
|
||||
bool llama_token_is_prefix_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
llama_token token0,
|
||||
llama_token token1);
|
||||
|
||||
int32_t llama_detokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
int32_t text_len_max,
|
||||
bool remove_special,
|
||||
bool unparse_special);
|
||||
|
||||
std::string llama_detokenize(
|
||||
const struct llama_vocab & vocab,
|
||||
const std::vector<llama_token> & tokens,
|
||||
bool special);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -56,7 +56,7 @@ extern "C" {
|
||||
// TODO: show sample usage
|
||||
//
|
||||
|
||||
// struct llama_vocab; // TODO: add in the future
|
||||
struct llama_vocab;
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
struct llama_sampler;
|
||||
@ -385,8 +385,7 @@ extern "C" {
|
||||
} llama_chat_message;
|
||||
|
||||
// lora adapter
|
||||
// TODO: rename to llama_adapter_lora
|
||||
struct llama_lora_adapter;
|
||||
struct llama_adapter_lora;
|
||||
|
||||
// Helpers for getting default parameters
|
||||
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
|
||||
@ -400,18 +399,19 @@ extern "C" {
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(void);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
||||
//optional:
|
||||
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
||||
|
||||
// Optional: an auto threadpool gets created in ggml if not passed explicitly
|
||||
LLAMA_API void llama_attach_threadpool(
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch);
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
|
||||
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
@ -427,11 +427,15 @@ extern "C" {
|
||||
|
||||
LLAMA_API void llama_model_free(struct llama_model * model);
|
||||
|
||||
// TODO: rename to llama_init_from_model
|
||||
LLAMA_API struct llama_context * llama_new_context_with_model(
|
||||
LLAMA_API struct llama_context * llama_init_from_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
||||
DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params),
|
||||
"use llama_init_from_model instead");
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
@ -449,20 +453,30 @@ extern "C" {
|
||||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_head (const struct llama_model * model);
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead");
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead");
|
||||
|
||||
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
|
||||
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
|
||||
|
||||
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
|
||||
|
||||
// Functions to access the model's GGUF metadata scalar values
|
||||
// - The functions return the length of the string on success, or -1 on failure
|
||||
@ -488,6 +502,9 @@ extern "C" {
|
||||
// Returns the total size of all the tensors in the model in bytes
|
||||
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||
|
||||
// Get the default chat template. Returns nullptr if not available
|
||||
LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
|
||||
|
||||
// Returns the total number of parameters in the model
|
||||
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
||||
|
||||
@ -515,34 +532,31 @@ extern "C" {
|
||||
//
|
||||
|
||||
// Load a LoRA adapter from file
|
||||
// TODO: rename to llama_adapter_lora_init
|
||||
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
|
||||
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
|
||||
struct llama_model * model,
|
||||
const char * path_lora);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
|
||||
// The following functions operate on a llama_context, hence the naming: llama_verb_...
|
||||
|
||||
// Add a loaded LoRA adapter to given context
|
||||
// This will not modify model's weight
|
||||
// TODO: rename to llama_set_adapter_lora
|
||||
LLAMA_API int32_t llama_lora_adapter_set(
|
||||
LLAMA_API int32_t llama_set_adapter_lora(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter,
|
||||
struct llama_adapter_lora * adapter,
|
||||
float scale);
|
||||
|
||||
// Remove a specific LoRA adapter from given context
|
||||
// Return -1 if the adapter is not present in the context
|
||||
// TODO: rename to llama_rm_adapter_lora
|
||||
LLAMA_API int32_t llama_lora_adapter_remove(
|
||||
LLAMA_API int32_t llama_rm_adapter_lora(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter);
|
||||
struct llama_adapter_lora * adapter);
|
||||
|
||||
// Remove all LoRA adapters from given context
|
||||
// TODO: rename to llama_clear_adapter_lora
|
||||
LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
// TODO: rename to llama_adapter_lora_free
|
||||
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
|
||||
LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
|
||||
|
||||
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
||||
// the currently loaded vector.
|
||||
@ -550,9 +564,8 @@ extern "C" {
|
||||
// to an n_embd x n_layers buffer starting from layer 1.
|
||||
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
||||
// See llama_control_vector_load in common to load a control vector.
|
||||
// TODO: rename to llama_adapter_cvec_apply
|
||||
LLAMA_API int32_t llama_control_vector_apply(
|
||||
struct llama_context * lctx,
|
||||
LLAMA_API int32_t llama_apply_adapter_cvec(
|
||||
struct llama_context * ctx,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
@ -908,41 +921,60 @@ extern "C" {
|
||||
// Vocab
|
||||
//
|
||||
|
||||
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
|
||||
|
||||
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
|
||||
|
||||
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
|
||||
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
|
||||
|
||||
// Identify if Token Id is a control token or a render-able token
|
||||
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
|
||||
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
||||
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||
LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
|
||||
LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
|
||||
LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
|
||||
LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
|
||||
LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
|
||||
|
||||
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
|
||||
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
|
||||
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
||||
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
||||
|
||||
// infill tokens
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
|
||||
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
||||
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
||||
LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
|
||||
LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
|
||||
LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
|
||||
LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
|
||||
|
||||
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
|
||||
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
|
||||
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
|
||||
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
|
||||
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
|
||||
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
|
||||
DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocabable_get_text instead");
|
||||
DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
|
||||
DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
|
||||
DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
|
||||
DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
|
||||
DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
|
||||
DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
|
||||
DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
|
||||
|
||||
// CLS is equivalent to BOS
|
||||
DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
|
||||
"use llama_vocab_bos instead");
|
||||
|
||||
//
|
||||
// Tokenization
|
||||
@ -958,7 +990,7 @@ extern "C" {
|
||||
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
||||
/// as plaintext. Does not insert a leading space.
|
||||
LLAMA_API int32_t llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const struct llama_vocab * vocab,
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
@ -972,7 +1004,7 @@ extern "C" {
|
||||
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
|
||||
// @param special If true, special tokens are rendered in the output.
|
||||
LLAMA_API int32_t llama_token_to_piece(
|
||||
const struct llama_model * model,
|
||||
const struct llama_vocab * vocab,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length,
|
||||
@ -986,7 +1018,7 @@ extern "C" {
|
||||
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
|
||||
/// @param unparse_special If true, special tokens are rendered in the output.
|
||||
LLAMA_API int32_t llama_detokenize(
|
||||
const struct llama_model * model,
|
||||
const struct llama_vocab * vocab,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
@ -1009,7 +1041,6 @@ extern "C" {
|
||||
/// @param length The size of the allocated buffer
|
||||
/// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
|
||||
LLAMA_API int32_t llama_chat_apply_template(
|
||||
const struct llama_model * model,
|
||||
const char * tmpl,
|
||||
const struct llama_chat_message * chat,
|
||||
size_t n_msg,
|
||||
@ -1057,7 +1088,6 @@ extern "C" {
|
||||
// llama_sampler_free(smpl);
|
||||
//
|
||||
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
|
||||
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
|
||||
//
|
||||
|
||||
typedef void * llama_sampler_context_t;
|
||||
@ -1157,7 +1187,7 @@ extern "C" {
|
||||
float eta);
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
|
||||
const struct llama_model * model,
|
||||
const struct llama_vocab * vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
@ -1169,8 +1199,9 @@ extern "C" {
|
||||
float penalty_present); // 0.0 = disabled
|
||||
|
||||
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
|
||||
const struct llama_model * model,
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
|
||||
const struct llama_vocab * vocab,
|
||||
int32_t n_ctx_train,
|
||||
float dry_multiplier,
|
||||
float dry_base,
|
||||
int32_t dry_allowed_length,
|
||||
@ -1204,7 +1235,7 @@ extern "C" {
|
||||
// 3. discard non-EOG tokens with low prob
|
||||
// 4. if no tokens are left -> pick EOT
|
||||
//
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
|
||||
|
||||
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
||||
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
||||
|
@ -17,15 +17,16 @@
|
||||
#include <sstream>
|
||||
|
||||
static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
|
||||
auto * model = llama_get_model(ctx);
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + add_bos;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@ -34,11 +35,14 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
|
||||
}
|
||||
|
||||
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
|
||||
const int n_tokens = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
|
||||
int check = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@ -310,6 +314,8 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab_llama = llama_model_get_vocab(model_llama);
|
||||
|
||||
llama_context_params lcparams = llama_context_default_params();
|
||||
|
||||
// tune these to your liking
|
||||
@ -317,7 +323,7 @@ int main(int argc, char ** argv) {
|
||||
lcparams.n_threads = params.n_threads;
|
||||
lcparams.flash_attn = params.flash_attn;
|
||||
|
||||
struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
|
||||
struct llama_context * ctx_llama = llama_init_from_model(model_llama, lcparams);
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
@ -727,7 +733,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
|
||||
|
||||
if (id != llama_token_eos(model_llama)) {
|
||||
if (id != llama_vocab_eos(vocab_llama)) {
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user