talk-llama : sync latest llama.cpp (close #922, close #954)

Author: Georgi Gerganov
Date:   2023-05-23 14:04:39 +03:00
Parent: 041be06d58
Commit: 77eab3fbfe

4 changed files with 287 additions and 167 deletions


@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)
 #include <unistd.h>
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -100,17 +101,17 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }

-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }

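Two changes recur throughout this sync. Ad-hoc `throw format(...)` statements, which throw a bare std::string, become `throw std::runtime_error(format(...))`, so every error now derives from std::exception and is visible to ordinary handlers (hence the new `<stdexcept>` include above). In parallel, the `size` parameters are renamed to `len` so they no longer shadow the `size` member, and the pure readers gain a `const` qualifier. A minimal sketch of why the exception type matters (illustrative only, not code from this commit):

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    int main() {
        try {
            // old style threw a std::string, which sails past this handler:
            // throw std::string("read error");
            throw std::runtime_error("read error: Bad file descriptor");
        } catch (const std::exception & e) {
            fprintf(stderr, "error: %s\n", e.what()); // reached only for std::exception types
            return 1;
        }
    }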
@@ -126,14 +127,14 @@ struct llama_file {
         return std::string(chars.data(), len);
     }

-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }

@@ -171,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -180,12 +181,12 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
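The mmap prefetch flag becomes a byte budget: the default `(size_t) -1` preserves the old preload-everything behaviour, `0` skips the madvise call entirely, and any other value caps how much of the mapping the kernel is advised to page in. A POSIX sketch of the capped call, using a hypothetical helper name (the constructor above does the same inline):

    #include <algorithm>
    #include <cstdio>
    #include <sys/mman.h>

    // advise_prefetch is illustrative only; llama_mmap does this inline
    static void advise_prefetch(void * addr, size_t file_size, size_t prefetch) {
        if (prefetch > 0) {
            // ask the kernel to preload at most min(file_size, prefetch) bytes
            if (madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED)) {
                perror("madvise(MADV_WILLNEED)");
            }
        }
    }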
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();

         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }

         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);

         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;

-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -265,9 +267,9 @@ struct llama_mlock {
         }
     }

-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }

     void grow_to(size_t target_size) {
@@ -338,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }

-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                        len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }

@@ -361,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
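For context, VirtualLock fails unless the locked pages fit inside the process working-set minimum, so the Windows path grows both working-set bounds by the requested length plus roughly a megabyte of slack and then retries. A condensed sketch of that strategy (Windows only, illustrative, hypothetical helper name):

    #include <windows.h>

    static bool lock_with_ws_growth(void * ptr, size_t len) {
        if (VirtualLock(ptr, len)) {
            return true; // fits in the current working set
        }
        SIZE_T min_ws = 0, max_ws = 0;
        if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws, &max_ws)) {
            return false;
        }
        // the minimum must stay <= the maximum, so grow both bounds
        SIZE_T increment = (SIZE_T) len + 1048576; // ~1 MiB slack for bookkeeping pages
        if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws + increment, max_ws + increment)) {
            return false;
        }
        return VirtualLock(ptr, len) != 0;
    }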
@@ -373,8 +375,8 @@ struct llama_mlock {
         }
     }

-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -382,11 +384,16 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;

-    void raw_lock(const void * addr, size_t size) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
+    size_t lock_granularity() {
+        return (size_t) 65536;
     }

-    void raw_unlock(const void * addr, size_t size) {}
+    bool raw_lock(const void * addr, size_t len) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
@@ -395,36 +402,70 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;

-    void resize(size_t size) {
+    llama_buffer() = default;
+
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }

     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
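With the defaulted constructor and the deleted copy/move operations, llama_buffer is a strict single-owner RAII type: copying one would duplicate the raw `addr` pointer, and the two destructors would then `delete[]` the same allocation. Usage sketch (assumes this header is included):

    llama_buffer buf;
    buf.resize(1024);             // frees any previous block, then allocates 1024 bytes
    // llama_buffer copy = buf;   // no longer compiles: copy constructor is deleted
    // buf = std::move(copy);     // no longer compiles: move assignment is deleted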
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;

+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }

-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
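In the cuBLAS build, resize() now degrades gracefully: it first asks ggml_cuda_host_malloc for pinned host memory (which makes host-to-device copies cheaper) and falls back to a plain pageable `new[]` allocation if that fails. The new `is_cuda` flag records which allocator succeeded so that free() releases the block through the matching deallocator; pairing `delete[]` with CUDA-pinned memory, or ggml_cuda_host_free with a `new[]` block, would be undefined behaviour. Usage sketch (hypothetical sizes, assumes GGML_USE_CUBLAS is defined):

    llama_ctx_buffer scratch;
    scratch.resize(512 * 1024 * 1024); // pinned if ggml_cuda_host_malloc succeeds, pageable otherwise
    scratch.resize(1024);              // free() runs first, using is_cuda to pick the right deallocator
    // the destructor also calls free(), so the buffer cleans up after itself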