talk-llama : sync latest llama.cpp (close #922, close #954)

Georgi Gerganov 2023-05-23 14:04:39 +03:00
parent 041be06d58
commit 77eab3fbfe
4 changed files with 287 additions and 167 deletions

llama-util.h

@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
+#include <stdexcept>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -100,17 +101,17 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }
@@ -126,14 +127,14 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }
@@ -171,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -180,12 +181,12 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();
 
         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);
 
         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
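Note: the POSIX llama_mmap constructor's prefetch parameter is now a byte count instead of a flag: 0 disables the hint entirely, and any positive value caps the MADV_WILLNEED region at min(file size, prefetch). A minimal sketch of the same pattern, assuming a POSIX system; the helper name below is illustrative and not part of the diff:

    // Sketch: ask the kernel to preload at most `prefetch` bytes of a mapping.
    #include <sys/mman.h>
    #include <algorithm>
    #include <cerrno>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static void hint_prefetch(void * addr, size_t file_size, size_t prefetch) {
        if (prefetch > 0) {
            // only advise the prefix of the mapping that the caller actually wants resident
            if (madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno));
            }
        }
    }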
@@ -265,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -338,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -361,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -373,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -382,11 +384,16 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;
 
-    void raw_lock(const void * addr, size_t size) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-    }
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    bool raw_lock(const void * addr, size_t len) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
@@ -395,36 +402,70 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
-    void resize(size_t size) {
+    llama_buffer() = default;
+
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
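Note: llama_ctx_buffer now prefers CUDA pinned (page-locked) host memory and falls back to an ordinary heap allocation when ggml_cuda_host_malloc returns NULL, remembering which allocator was used so the memory is released correctly. A minimal sketch of that allocation pattern; ggml_cuda_host_malloc/ggml_cuda_host_free are the real ggml calls used above, while the struct name and layout here are illustrative:

    // Sketch: pinned host memory with a pageable fallback.
    #include "ggml-cuda.h"   // ggml_cuda_host_malloc / ggml_cuda_host_free
    #include <cstddef>
    #include <cstdint>

    struct host_buffer {
        uint8_t * addr = nullptr;
        size_t    size = 0;
        bool      is_pinned = false;

        void resize(size_t n) {
            release();
            addr = (uint8_t *) ggml_cuda_host_malloc(n); // NULL if the pinned allocation fails
            is_pinned = (addr != nullptr);
            if (!addr) {
                addr = new uint8_t[n]; // fall back to pageable memory
            }
            size = n;
        }

        void release() {
            if (addr) {
                if (is_pinned) {
                    ggml_cuda_host_free(addr);
                } else {
                    delete[] addr;
                }
            }
            addr = nullptr;
            size = 0;
        }

        ~host_buffer() { release(); }

        host_buffer() = default;
        host_buffer(const host_buffer &) = delete;
        host_buffer & operator=(const host_buffer &) = delete;
    };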

llama.cpp

@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -45,6 +46,7 @@ enum e_model {
     MODEL_65B,
 };
 
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };
@@ -406,6 +408,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };
 
 struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;
 
-        if (magic != 'ggml') {
-            version = file.read_u32();
-        }
-
-        if (magic == 'ggml' && version == 0) {
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
             file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                magic, version);
+            return;
         }
+
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
+        }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -499,7 +508,7 @@ struct llama_file_loader {
             if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
                 // skip to the next multiple of 32 bytes
-                file.seek(-file.tell() & 31, SEEK_CUR);
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
             }
             shard.file_idx = file_idx;
             shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -641,7 +650,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@ struct llama_model_loader {
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
 
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +710,9 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -708,9 +725,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }
 
     void load_data_for(llama_load_tensor & lt) {
@@ -812,10 +826,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
-        /*.n_parts =*/ -1,
         /*.gpu_layers =*/ 0,
         /*.seed =*/ -1,
-        /*.f16_kv =*/ false,
+        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -836,6 +849,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
@@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -925,11 +954,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }
@@ -942,27 +979,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -984,7 +1001,14 @@ static void llama_model_load_internal(
         }
     }
 
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -992,33 +1016,87 @@ static void llama_model_load_internal(
 
         ml->ggml_ctx = ctx;
 
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
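Note: the offload split is decided purely by layer index: with i_gpu_start = n_layer - n_gpu_layers, layers below i_gpu_start stay on the CPU, the rest are assigned LLAMA_BACKEND_OFFLOAD, and the output tensor moves to the GPU only when n_gpu_layers exceeds n_layer. A small self-contained sketch of that arithmetic; the model sizes are illustrative:

    // Sketch: how n_gpu_layers maps layers to backends in the hunk above.
    #include <cstdio>

    int main() {
        const int n_layer      = 32; // e.g. a 7B model
        const int n_gpu_layers = 20; // user request

        const int i_gpu_start = n_layer - n_gpu_layers; // may be negative when everything is offloaded

        int n_cpu = 0, n_gpu = 0;
        for (int i = 0; i < n_layer; ++i) {
            if (i < i_gpu_start) {
                n_cpu++; // GGML_BACKEND_CPU
            } else {
                n_gpu++; // GGML_BACKEND_CUDA via LLAMA_BACKEND_OFFLOAD
            }
        }

        // the output tensor is offloaded only when n_gpu_layers > n_layer
        const bool offload_output = n_gpu_layers > n_layer;

        printf("cpu layers: %d, gpu layers: %d, output on gpu: %s\n",
               n_cpu, n_gpu, offload_output ? "yes" : "no");
        return 0;
    }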
@@ -1026,36 +1104,34 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
     {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
-#else
-    (void) n_gpu_layers;
-#endif
+#endif // GGML_USE_CUBLAS
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1154,10 +1230,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);
 
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
         // self-attention
@@ -1264,10 +1338,8 @@ static bool llama_eval_internal(
         {
            cur = ggml_rms_norm(ctx0, inpFF);
 
-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }
 
         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1304,10 +1376,8 @@ static bool llama_eval_internal(
         inpL = ggml_rms_norm(ctx0, inpL);
 
-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);
 
         embeddings = inpL;
     }
@@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
            fprintf(stderr, "%s: bad file magic\n", __func__);
            return 1;
        }
@@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
@@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;
@@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;
 
     // set rng
     {

llama.h

@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION 2
-#define LLAMA_FILE_MAGIC 'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION 3
+#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
@@ -40,9 +46,9 @@ extern "C" {
     typedef int llama_token;
 
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
         float p;        // probability of the token
     } llama_token_data;
 
     typedef struct llama_token_data_array {
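Note: with the magics spelled out as hex constants, a loader can validate a model header without relying on multi-character character literals. A minimal sketch, assuming the file begins with a little-endian magic word followed by a version word as in the ggjt layout; the helper function below is illustrative:

    // Sketch: check a ggjt model header against the constants defined above.
    #include "llama.h"   // LLAMA_FILE_MAGIC_GGJT, LLAMA_FILE_VERSION
    #include <cstdint>
    #include <cstdio>

    static bool check_ggjt_header(FILE * fp) {
        uint32_t magic   = 0;
        uint32_t version = 0;
        if (fread(&magic,   sizeof(magic),   1, fp) != 1) { return false; }
        if (fread(&version, sizeof(version), 1, fp) != 1) { return false; }
        return magic == LLAMA_FILE_MAGIC_GGJT && version == LLAMA_FILE_VERSION;
    }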
@ -55,7 +61,6 @@ extern "C" {
struct llama_context_params { struct llama_context_params {
int n_ctx; // text context int n_ctx; // text context
int n_parts; // -1 for default
int n_gpu_layers; // number of layers to store in VRAM int n_gpu_layers; // number of layers to store in VRAM
int seed; // RNG seed, -1 for random int seed; // RNG seed, -1 for random
@ -74,16 +79,16 @@ extern "C" {
// model file types // model file types
enum llama_ftype { enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0, LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
}; };
LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_context_params llama_context_default_params();
@ -91,6 +96,13 @@ extern "C" {
LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported(); LLAMA_API bool llama_mlock_supported();
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// Call once at the start of the program
LLAMA_API void llama_init_backend();
LLAMA_API int64_t llama_time_us();
// Various functions for loading a ggml llama model. // Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model. // Allocate (almost) all memory needed for the model.
// Return NULL on failure // Return NULL on failure
@ -139,7 +151,7 @@ extern "C" {
// Set the state reading from the specified address // Set the state reading from the specified address
// Returns the number of bytes read // Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
// Save/load session file // Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
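Note: llama_init_backend() is meant to be called once at program start, before any model is loaded (it initializes ggml's timer and F16 tables), and llama_time_us() exposes ggml's clock for simple measurements. A hedged usage sketch; the model path is illustrative:

    // Sketch: startup sequence with the new backend-init and timing calls.
    #include "llama.h"
    #include <cstdint>
    #include <cstdio>

    int main() {
        llama_init_backend(); // once, before loading any model

        auto lparams = llama_context_default_params();

        const int64_t t0 = llama_time_us();
        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q5_1.bin", lparams);
        const int64_t t1 = llama_time_us();

        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        fprintf(stderr, "model loaded in %.2f ms\n", (t1 - t0) / 1000.0);

        llama_free(ctx);
        return 0;
    }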

talk-llama.cpp

@@ -33,8 +33,6 @@ struct whisper_params {
     int32_t max_tokens = 32;
     int32_t audio_ctx = 0;
 
-    int32_t n_parts_llama = -1;
-
     float vad_thold = 0.6f;
     float freq_thold = 100.0f;
@@ -72,7 +70,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
-        else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
         else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
         else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
         else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -123,7 +120,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
     fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
     fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
-    fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
     fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
     fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
     fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
@@ -239,13 +235,14 @@ int main(int argc, char ** argv) {
     // llama init
 
+    llama_init_backend();
+
     auto lparams = llama_context_default_params();
 
     // tune these to your liking
     lparams.n_ctx = 2048;
     lparams.seed = 1;
     lparams.f16_kv = true;
-    lparams.n_parts = params.n_parts_llama;
 
     struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);