mirror of https://github.com/ggerganov/whisper.cpp.git
commit 77eab3fbfe (parent 041be06d58)
@@ -14,6 +14,7 @@

 #include <string>
 #include <vector>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -100,17 +101,17 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }

-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
        }
     }

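Note (annotation, not part of the diff): every error path above now throws std::runtime_error instead of a raw std::string or the value returned by format(), so call sites can catch a single standard exception type. A standalone sketch of the resulting pattern, with a placeholder path and helper that are not from the diff:

    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <stdexcept>
    #include <string>

    // Build the message first, then wrap it in std::runtime_error so callers
    // only need to catch one standard exception type.
    static void open_or_throw(const char * fname) {
        std::FILE * fp = std::fopen(fname, "rb");
        if (fp == NULL) {
            throw std::runtime_error(std::string("failed to open ") + fname + ": " + std::strerror(errno));
        }
        std::fclose(fp);
    }

    int main() {
        try {
            open_or_throw("does-not-exist.bin"); // placeholder path
        } catch (const std::runtime_error & err) {
            std::fprintf(stderr, "error: %s\n", err.what());
        }
        return 0;
    }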
@@ -126,14 +127,14 @@ struct llama_file {
         return std::string(chars.data(), len);
     }

-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }

@@ -171,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -180,12 +181,12 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
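Note (annotation, not part of the diff): the prefetch argument of llama_mmap changes from a bool to a byte count. (size_t) -1 keeps the old behaviour of advising the whole mapping, 0 disables the advice, and any other value caps the MADV_WILLNEED range. A standalone POSIX sketch of the same idea, with a hypothetical helper and placeholder file name:

    #include <algorithm>
    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    // Map a file read-only and ask the kernel to prefault at most `prefetch` bytes.
    static void * map_with_prefetch(const char * fname, size_t prefetch) {
        int fd = open(fname, O_RDONLY);
        if (fd < 0) return NULL;
        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return NULL; }
        const size_t size = (size_t) st.st_size;
        void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd); // the mapping stays valid after the descriptor is closed
        if (addr == MAP_FAILED) return NULL;
        if (prefetch > 0) {
            // advise only the first min(size, prefetch) bytes
            if (madvise(addr, std::min(size, prefetch), MADV_WILLNEED)) {
                std::fprintf(stderr, "warning: madvise failed: %s\n", std::strerror(errno));
            }
        }
        return addr;
    }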
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();

         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }

         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);

         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;

-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -265,9 +267,9 @@ struct llama_mlock {
         }
     }

-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }

     void grow_to(size_t target_size) {
@@ -338,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }

-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                        len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }

@@ -361,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -373,8 +375,8 @@ struct llama_mlock {
         }
     }

-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -382,11 +384,16 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;

-    void raw_lock(const void * addr, size_t size) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
+    size_t lock_granularity() {
+        return (size_t) 65536;
     }

-    void raw_unlock(const void * addr, size_t size) {}
+    bool raw_lock(const void * addr, size_t len) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };

@@ -395,36 +402,70 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;

-    void resize(size_t size) {
+    llama_buffer() = default;
+
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }

     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };

 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;

+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }

-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
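Usage note (annotation, not part of the diff): llama_ctx_buffer::resize() now falls back to pageable memory when pinned host allocation fails, and both buffer types delete their copy and move operations, so an accidental copy becomes a compile error rather than a double free. Sketch of statements inside a function, assuming the llama-util.h definitions above:

    llama_buffer buf;
    buf.resize(1024);            // allocates 1024 bytes, releasing any previous block
    buf.resize(4096);            // safe: resize() frees the old allocation first
    // llama_buffer copy = buf;  // no longer compiles: the copy constructor is deleted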
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -45,6 +46,7 @@ enum e_model {
     MODEL_65B,
 };

+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };

@@ -406,6 +408,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;

-        if (magic != 'ggml') {
-            version = file.read_u32();
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
         }

-        if (magic == 'ggml' && version == 0) {
-            file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
-        }
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
+        }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -499,7 +508,7 @@ struct llama_file_loader {

         if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
             // skip to the next multiple of 32 bytes
-            file.seek(-file.tell() & 31, SEEK_CUR);
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         }
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -641,7 +650,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@ struct llama_model_loader {
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +710,9 @@ struct llama_model_loader {

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -708,9 +725,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }

     void load_data_for(llama_load_tensor & lt) {
@@ -812,10 +826,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
-        /*.n_parts =*/ -1,
         /*.gpu_layers =*/ 0,
         /*.seed =*/ -1,
-        /*.f16_kv =*/ false,
+        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -836,6 +849,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
@@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }

     return "unknown";
@@ -925,11 +954,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }

@@ -942,27 +979,7 @@
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -984,7 +1001,14 @@
         }
     }

+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -992,33 +1016,87 @@

         ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];

             std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }

     ml->done_getting_tensors();

+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
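Note (annotation, not part of the diff): the backend selection added above follows a simple rule: the last n_gpu_layers transformer layers get the offload backend, and the output tensor is offloaded only when n_gpu_layers exceeds the layer count. A standalone sketch of the split, with illustrative values that are not from the diff:

    // Which layers end up on the GPU for a hypothetical 32-layer model?
    const int n_layer      = 32;
    const int n_gpu_layers = 20;                      // user-requested offload count
    const int i_gpu_start  = n_layer - n_gpu_layers;  // first offloaded layer index = 12

    for (int i = 0; i < n_layer; ++i) {
        // layers [i_gpu_start, n_layer) get LLAMA_BACKEND_OFFLOAD, the rest stay on GGML_BACKEND_CPU
        const bool on_gpu = i >= i_gpu_start;
        (void) on_gpu;
    }
    // the "output" tensor is offloaded only when n_gpu_layers > n_layer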
@@ -1026,36 +1104,34 @@

     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
     {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
-#else
-    (void) n_gpu_layers;
-#endif
+#endif // GGML_USE_CUBLAS
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml->mapping);

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1154,10 +1230,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);

-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                    cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

         // self-attention
@@ -1264,10 +1338,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpFF);

-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }

         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1304,10 +1376,8 @@ static bool llama_eval_internal(

         inpL = ggml_rms_norm(ctx0, inpL);

-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);

         embeddings = inpL;
     }
@@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
            fprintf(stderr, "%s: bad file magic\n", __func__);
            return 1;
        }
@@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

        // maybe this should in llama_model_loader
        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
        }
     }

@@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
        }
        size_t idx = model_loader->tensors_map.name_to_idx[base_name];
        llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
        lt.data = (uint8_t *) lt.ggml_tensor->data;
        model_loader->load_data_for(lt);
        lt.ggml_tensor->data = lt.data;
@@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;

     // set rng
     {
@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION 2
-#define LLAMA_FILE_MAGIC 'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION 3
+#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

 #ifdef __cplusplus
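Side note (annotation, not part of the diff): each new constant spells out the same four ASCII characters that the old multi-character literals packed on the compilers llama.cpp targets, with the first character in the most significant byte. A small standalone check of that assumption:

    #include <cstdint>

    // 'g' = 0x67, 'j' = 0x6a, 't' = 0x74, packed high byte first gives 0x67676a74
    constexpr std::uint32_t MAGIC_GGJT =
        ((std::uint32_t)'g' << 24) | ((std::uint32_t)'g' << 16) | ((std::uint32_t)'j' << 8) | (std::uint32_t)'t';
    static_assert(MAGIC_GGJT == 0x67676a74u, "bytes pack to the documented LLAMA_FILE_MAGIC_GGJT value");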
@@ -40,9 +46,9 @@ extern "C" {
     typedef int llama_token;

     typedef struct llama_token_data {
         llama_token id; // token id
         float logit; // log-odds of the token
         float p; // probability of the token
     } llama_token_data;

     typedef struct llama_token_data_array {
@@ -55,7 +61,6 @@ extern "C" {

     struct llama_context_params {
         int n_ctx; // text context
-        int n_parts; // -1 for default
         int n_gpu_layers; // number of layers to store in VRAM
         int seed; // RNG seed, -1 for random

@@ -74,16 +79,16 @@ extern "C" {

     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
         LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
     };

     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,6 +96,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();

+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -139,7 +151,7 @@ extern "C" {

     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
@@ -33,8 +33,6 @@ struct whisper_params {
     int32_t max_tokens = 32;
     int32_t audio_ctx = 0;

-    int32_t n_parts_llama = -1;
-
     float vad_thold = 0.6f;
     float freq_thold = 100.0f;

@@ -72,7 +70,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
     else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
     else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
-    else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
     else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
     else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
     else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -123,7 +120,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
     fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
     fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
-    fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
     fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
     fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
     fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
@@ -239,13 +235,14 @@ int main(int argc, char ** argv) {

     // llama init

+    llama_init_backend();
+
     auto lparams = llama_context_default_params();

     // tune these to your liking
     lparams.n_ctx = 2048;
     lparams.seed = 1;
     lparams.f16_kv = true;
-    lparams.n_parts = params.n_parts_llama;

     struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);

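Taken together, the caller-visible changes for talk-llama are small. A minimal init sketch inside main() after this sync, with a placeholder model path:

    llama_init_backend();                           // new: call once at program start

    auto lparams = llama_context_default_params();  // n_parts is gone; f16_kv now defaults to true
    lparams.n_ctx = 2048;
    lparams.seed  = 1;

    struct llama_context * ctx_llama =
        llama_init_from_file("models/ggml-llama.bin", lparams);  // placeholder path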