llama podcast

Georgi Gerganov 2023-04-01 10:33:21 +03:00
parent 0a2d1210bc
commit c456ca476b
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
13 changed files with 1359 additions and 1074 deletions

llama.cpp

@ -12,6 +12,19 @@
#include <cassert>
#include <cstring>
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#else
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#endif
#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
@ -142,6 +155,10 @@ struct llama_model {
// the model memory buffer
std::vector<uint8_t> buf;
// model memory mapped file
void * mm_addr = NULL;
uint64_t mm_length = 0;
// tensors
int n_loaded;
std::unordered_map<std::string, struct ggml_tensor *> tensors;
@ -165,6 +182,7 @@ struct llama_context {
int64_t t_load_us = 0;
int64_t t_start_us = 0;
bool has_evaluated_once = false;
int64_t t_sample_us = 0;
int64_t t_eval_us = 0;
@ -206,7 +224,7 @@ struct llama_context {
}
if (buf_last >= 0) {
buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
}
buf_last = i;
@ -246,6 +264,7 @@ static bool kv_cache_init(
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
params.no_alloc = false;
cache.ctx = ggml_init(params);
@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
// model loading
//
static void *mmap_file(const char *fname, uint64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
HANDLE hFile = CreateFileA(fname,
GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
NULL);
if (hFile == INVALID_HANDLE_VALUE) return 0;
LARGE_INTEGER fileSize;
fileSize.QuadPart = -1;
GetFileSizeEx(hFile, &fileSize);
int64_t length = fileSize.QuadPart;
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
CloseHandle(hFile);
if (!hMapping) return 0;
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
CloseHandle(hMapping);
if (!addr) return 0;
#else
int fd = open(fname, O_RDONLY);
if (fd == -1) return 0;
int64_t length = lseek(fd, 0, SEEK_END);
void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
close(fd);
if (addr == MAP_FAILED) return 0;
#endif
*mm_length = length;
return addr;
}
static void munmap_file(void * addr, size_t length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
UnmapViewOfFile(addr);
#else
munmap(addr, length);
#endif
}
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
fprintf(stderr,
"%s: invalid model file (bad magic [got %#x want %#x])\n"
"\tyou most likely need to regenerate your ggml files\n"
"\tthe benefit is you'll get 10-100x faster load times\n"
"\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
"\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
"\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
path, got, want);
return false;
}
static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
@ -299,34 +370,35 @@ static bool llama_model_load(
void *progress_callback_user_data) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
const int64_t t_start_us = ggml_time_us();
lctx.t_start_us = t_start_us;
std::vector<char> f_buf(1024*1024);
lctx.t_start_us = ggml_time_us();
auto & model = lctx.model;
auto & vocab = lctx.vocab;
auto fin = std::ifstream(fname, std::ios::binary);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
std::vector<char> f_buf(1024*1024);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
fin.seekg(0);
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
__func__, fname.c_str());
return false;
}
if (magic != LLAMA_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
}
uint32_t format_version;
@ -449,43 +521,24 @@ static bool llama_model_load(
}
}
// map model into memory
char *mm_addr = NULL;
model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
if (model.mm_addr == NULL) {
fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
return false;
}
mm_addr = (char *)model.mm_addr;
fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const auto &hparams = model.hparams;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
ctx_size += (5 + 10*n_layer)*256; // object overhead
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
}
// print memory requirements
@ -495,6 +548,7 @@ static bool llama_model_load(
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
model.mm_length +
MEM_REQ_SCRATCH0.at(model.type) +
MEM_REQ_SCRATCH1.at(model.type) +
MEM_REQ_EVAL.at (model.type);
@ -514,6 +568,7 @@ static bool llama_model_load(
struct ggml_init_params params = {
/*.mem_size =*/ lctx.model.buf.size(),
/*.mem_buffer =*/ lctx.model.buf.data(),
/*.no_alloc =*/ true,
};
model.ctx = ggml_init(params);
@ -576,234 +631,106 @@ static bool llama_model_load(
}
}
const size_t file_offset = fin.tellg();
fin.close();
std::vector<uint8_t> tmp;
if (progress_callback) {
progress_callback(0.0, progress_callback_user_data);
}
for (int i = 0; i < n_parts; ++i) {
const int part_id = i;
//const int part_id = n_parts - i - 1;
fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
std::string fname_part = fname;
if (i > 0) {
fname_part += "." + std::to_string(i);
}
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin = std::ifstream(fname_part, std::ios::binary);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
fin.seekg(file_offset);
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
fprintf(stderr, "%s: ", __func__);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
// split_type = 0: split by columns
// split_type = 1: split by rows
int split_type = 0;
// split_type = 0:
// regex:
// - tok_embeddings.*
// - layers.*.attention.wo.weight
// - layers.*.feed_forward.w2.weight
// split_type = 1:
// regex:
// - output.*
// - layers.*.attention.wq.weight
// - layers.*.attention.wk.weight
// - layers.*.attention.wv.weight
// - layers.*.feed_forward.w1.weight
// - layers.*.feed_forward.w3.weight
if (name.find("tok_embeddings") != std::string::npos) {
split_type = 0;
} else if (name.find("layers") != std::string::npos) {
if (name.find("attention.wo.weight") != std::string::npos) {
split_type = 0;
} else if (name.find("feed_forward.w2.weight") != std::string::npos) {
split_type = 0;
} else {
split_type = 1;
}
} else if (name.find("output") != std::string::npos) {
split_type = 1;
}
auto tensor = model.tensors[name.data()];
if (n_dims == 1) {
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
} else {
if (ggml_nelements(tensor)/n_parts != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
}
if (n_dims == 1) {
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
} else {
if (split_type == 0) {
if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
return false;
}
} else {
if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
return false;
}
}
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
}
size_t bpe = 0;
switch (ftype) {
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
default:
{
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
}
};
if (n_dims == 1 || n_parts == 1) {
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
if (part_id == 0) {
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
} else {
fin.seekg(ggml_nbytes(tensor), std::ios::cur);
}
total_size += ggml_nbytes(tensor);
} else {
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
return false;
}
if (split_type == 0) {
const int np0 = ne[0];
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
assert(row_size == tensor->nb[1]);
for (int i1 = 0; i1 < ne[1]; ++i1) {
const size_t offset_row = i1*row_size;
const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
}
} else {
const int np1 = ne[1];
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
for (int i1 = 0; i1 < ne[1]; ++i1) {
const size_t offset_row = (i1 + part_id*np1)*row_size;
fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
}
}
total_size += ggml_nbytes(tensor)/n_parts;
}
//fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
model.n_loaded++;
// progress
if (progress_callback) {
double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset);
double current_progress = (double(i) + current_file_progress) / double(n_parts);
progress_callback(current_progress, progress_callback_user_data);
}
if (model.n_loaded % 8 == 0) {
fprintf(stderr, ".");
fflush(stderr);
}
if (fin.eof()) {
break;
}
fprintf(stderr, " done\n");
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
switch (ftype) {
case 0: // f32
case 1: // f16
break;
case 2: // q4_0
case 3: // q4_1
assert(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
};
// load the tensor data into memory without copying or reading it
size_t offset = fin.tellg();
size_t tensor_data_size = ggml_nbytes(tensor);
offset = (offset + 31) & -32;
tensor->data = mm_addr + offset;
fin.seekg(offset + tensor_data_size);
total_size += tensor_data_size;
model.n_loaded++;
// progress
if (progress_callback) {
double current_progress = size_t(fin.tellg()) / double(file_size);
progress_callback(current_progress, progress_callback_user_data);
}
}
fin.close();
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
return false;
}
}
lctx.t_load_us = ggml_time_us() - t_start_us;
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
if (progress_callback) {
progress_callback(1.0, progress_callback_user_data);
@ -849,6 +776,7 @@ static bool llama_eval_internal(
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size(),
/*.mem_buffer =*/ buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -856,7 +784,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, tokens, N*ggml_element_size(embd));
@ -922,7 +850,7 @@ static bool llama_eval_internal(
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
@ -1126,7 +1054,7 @@ struct llama_tokenizer {
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;
@ -1240,12 +1168,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
// sampling
//
static void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
// find the top k tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
[](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
return a.first > b.first;
});
@ -1256,9 +1184,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
llama_context & lctx,
const std::vector<llama_vocab::id> & last_n_tokens,
int top_k,
double top_p,
double temp,
double repeat_penalty) {
float top_p,
float temp,
float repeat_penalty) {
auto & rng = lctx.rng;
const int n_logits = lctx.model.hparams.n_vocab;
@ -1266,17 +1194,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
const auto & logits = lctx.logits;
const auto * plogits = logits.data() + logits.size() - n_logits;
std::vector<std::pair<double, llama_vocab::id>> logits_id;
std::vector<std::pair<float, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability
if (plogits[i] < 0.0) {
if (plogits[i] < 0.0f) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
@ -1289,18 +1217,18 @@ static llama_vocab::id llama_sample_top_p_top_k(
sample_top_k(logits_id, top_k);
double maxl = -std::numeric_limits<double>::infinity();
float maxl = -std::numeric_limits<float>::infinity();
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
maxl = Max(maxl, kv.first);
}
// compute probs for the top k tokens
std::vector<double> probs;
std::vector<float> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
const float p = expf(kv.first - maxl);
probs.push_back(p);
sum += p;
}
@ -1310,8 +1238,8 @@ static llama_vocab::id llama_sample_top_p_top_k(
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
if (top_p < 1.0) {
double cumsum = 0.0;
for (int i = 0; i < (int) probs.size(); i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
@ -1345,7 +1273,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
//
// TODO: reuse code from the llama_model_load() somehow
bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype, int qk) {
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
@ -1385,8 +1313,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
return false;
}
if (magic != LLAMA_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
}
fout.write((char *) &magic, sizeof(magic));
@ -1444,7 +1371,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
return false;
}
std::string word;
std::vector<char> word(32);
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
@ -1452,17 +1379,17 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);
finp.read ((char *) &word[0], len);
fout.write((char *) &word[0], len);
float score;
finp.read ((char *) &score, sizeof(score));
fout.write((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
vocab.token_to_id[word.data()] = i;
auto &tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.tok = word.data();
tok_score.score = score;
}
}
@ -1503,6 +1430,13 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
std::string name(length, 0);
finp.read (&name[0], length);
{
// ensure tensor data is aligned
uint64_t offset = finp.tellg();
offset = (offset + 31) & -32;
finp.seekg(offset);
}
{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@ -1558,6 +1492,13 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
}
fout.write(&name[0], length);
{
// ensure tensor data is aligned
uint64_t offset = fout.tellp();
offset = (offset + 31) & -32;
fout.seekp(offset);
}
if (quantize) {
printf("quantizing .. ");
work.resize(nelements); // for quantization
@ -1568,11 +1509,11 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
default:
{
@ -1590,7 +1531,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
}
for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
printf("%5.3f ", hist_cur[i] / float(nelements));
}
printf("\n");
} else {
@ -1613,7 +1554,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
printf("%s: hist: ", __func__);
for (int i = 0; i < (int) hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
printf("%5.3f ", hist_all[i] / float(sum_all));
}
printf("\n");
}
@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(
if (params.use_mlock) {
char *err;
if (!ggml_mlock(ctx->model.ctx, &err)) {
if (!ggml_mlock(ctx->model.ctx,
ctx->model.mm_addr,
ctx->model.mm_length,
&err)) {
fprintf(stderr, "%s\n", err);
free(err);
llama_free(ctx);
@ -1705,15 +1649,18 @@ void llama_free(struct llama_context * ctx) {
ggml_free(ctx->model.ctx);
}
if (ctx->model.mm_addr) {
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
}
delete ctx;
}
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype, qk)) {
int itype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
fprintf(stderr, "%s: failed to quantize\n", __func__);
return 1;
}
@ -1731,7 +1678,11 @@ int llama_eval(
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
// get a more accurate load time, upon first eval
if (!ctx->has_evaluated_once) {
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
ctx->has_evaluated_once = true;
}
return 0;
}
@ -1796,9 +1747,9 @@ llama_token llama_sample_top_p_top_k(
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty) {
float top_p,
float temp,
float repeat_penalty) {
const int64_t t_start_sample_us = ggml_time_us();
llama_token result = 0;
@ -1824,21 +1775,20 @@ llama_token llama_sample_top_p_top_k(
void llama_print_timings(struct llama_context * ctx) {
const int64_t t_end_us = ggml_time_us();
const int32_t n_sample = std::max(1, ctx->n_sample);
const int32_t n_eval = std::max(1, ctx->n_eval);
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
const int32_t n_sample = Max(1, ctx->n_sample);
const int32_t n_eval = Max(1, ctx->n_eval);
const int32_t n_p_eval = Max(1, ctx->n_p_eval);
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}
void llama_reset_timings(struct llama_context * ctx) {
ctx->t_start_us = ggml_time_us();
ctx->t_sample_us = ctx->n_sample = 0;
ctx->t_eval_us = ctx->n_eval = 0;
ctx->t_p_eval_us = ctx->n_p_eval = 0;
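
The llama.cpp changes above replace the per-part fread() copies with a single read-only mmap of the model file: tensor->data is pointed directly into the mapping at 32-byte-aligned offsets, so weights are paged in lazily on first use (which is also why the load time is recalculated after the first eval). A minimal, self-contained sketch of that pattern, POSIX only; the file name and the toy_tensor struct are stand-ins, not part of the diff:

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct toy_tensor { void * data; size_t nbytes; };   // stand-in for ggml_tensor

int main() {
    const char * fname = "model.bin";                 // hypothetical model file
    int fd = open(fname, O_RDONLY);
    if (fd == -1) { perror("open"); return 1; }

    const int64_t length = lseek(fd, 0, SEEK_END);
    void * addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
    if (addr == MAP_FAILED) { perror("mmap"); return 1; }

    // pretend a tensor's payload begins after a 100-byte header:
    // round the offset up to a 32-byte boundary, then point the tensor
    // into the mapping (no read, no copy)
    uint64_t offset = 100;
    offset = (offset + 31) & -32;
    toy_tensor t = { (char *) addr + offset, 4096 };

    printf("tensor data mapped at %p\n", t.data);

    munmap(addr, length);
    return 0;
}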

llama.h

@ -6,7 +6,7 @@
#include <stdbool.h>
#ifdef LLAMA_SHARED
# ifdef _WIN32
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
@ -20,7 +20,7 @@
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
@ -45,7 +45,7 @@ extern "C" {
} llama_token_data;
typedef void (*llama_progress_callback)(double progress, void *ctx);
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
@ -81,8 +81,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk);
int itype);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
@ -135,9 +134,9 @@ extern "C" {
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty);
float top_p,
float temp,
float repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
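
Two API changes land in this header: llama_model_quantize() loses the qk parameter (the quantization block size is now handled inside ggml), and the progress callback plus the sampling parameters switch from double to float. A hedged caller-side sketch of the new quantize entry point; the file paths are placeholders, and the itype convention is an assumption that it follows the same 2 = q4_0, 3 = q4_1 codes used for ftype in the loader above:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s model-f16.bin model-q4_0.bin\n", argv[0]);
        return 1;
    }

    // old call: llama_model_quantize(inp, out, itype, qk);
    // new call: the qk argument is gone, only itype selects the target type
    const int itype = 2;   // assumption: 2 -> q4_0, 3 -> q4_1

    if (llama_model_quantize(argv[1], argv[2], itype) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}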

speak.sh

@ -10,7 +10,15 @@
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# for Mac
say "$2"
if [ "$1" = "0" ]; then
say "$2"
elif [ "$1" = "1" ]; then
say -v "Samantha (Enhanced)" "$2"
elif [ "$1" = "2" ]; then
say -v "Daniel (Enhanced)" "$2"
elif [ "$1" = "3" ]; then
say -v "Veena (Enhanced)" "$2"
fi
# Eleven Labs
#

talk-llama.cpp

@ -6,6 +6,7 @@
#include "whisper.h"
#include "llama.h"
#include <map>
#include <cassert>
#include <cstdio>
#include <fstream>
@ -28,14 +29,15 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_id = 0;
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t max_tokens = 64;
int32_t audio_ctx = 0;
int32_t n_parts_llama = -1;
float vad_thold = 0.6f;
float vad_thold = 0.4f;
float freq_thold = 100.0f;
bool speed_up = false;
@ -45,7 +47,8 @@ struct whisper_params {
bool no_timestamps = true;
bool verbose_prompt = false;
std::string person = "Georgi";
std::string name_ni = "Georgi"; // natural intelligence
std::string name_ai = "LLaMA"; // artificial intelligence
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
@ -64,24 +67,26 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vid" || arg == "--voice-id") { params.voice_id = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "-nni" || arg == "--name-ni") { params.name_ni = argv[++i]; }
else if (arg == "-nai" || arg == "--name-ai") { params.name_ai = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
@ -107,6 +112,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vid N, --voice-id N [%-7d] voice ID\n", params.voice_id);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
@ -117,7 +123,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -nni NAME,--name-ni NAME [%-7s] natural intelligence name\n", params.name_ni.c_str());
fprintf(stderr, " -nai NAME,--name-ai NAME [%-7s] artificial intelligence name\n", params.name_ai.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
@ -157,7 +164,7 @@ std::string transcribe(
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.n_threads = 2;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
@ -165,6 +172,10 @@ std::string transcribe(
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
static int iter = params.voice_id;
std::this_thread::sleep_for(std::chrono::milliseconds(100*iter));
iter = (iter + 1) % 4;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
@ -197,25 +208,87 @@ std::string transcribe(
return result;
}
const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";
const std::vector<std::string> k_participants = {
"LLaMA",
"GGaMA",
"SSaMA",
"RRaMA",
};
const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
// homophones
const std::map<std::string, std::vector<std::string>> k_homophones = {
{ "LLaMA", { "llama", "Llama", "LLAMA", }, },
{ "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA", }, },
{ "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA", }, },
{ "RRaMA", { "rama", "Rama", "RAMA", "ramma", "Ramma", "RAMMA", }, },
};
const std::string k_prompt_whisper = R"(A conversation between {1}, {10}, {11}, {12} and {13}.)";
const std::map<std::string, std::string> k_prompt = {
{
k_participants.at(0),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
{10}, {11}, {12} and {13} respond with short and concise answers.
{10} is smart, objective, honest and kind. Never fails to give a meaningful and insightful answer and opinion.
{1} is leading the conversation and asking the questions.
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";
{1}{4} Hello {10}! What is your opinion on the current state of the world?
{10}{4} Great question {1}! I think we live in a very interesting time.
There are many things to be concerned about, but also many things to be optimistic about.
{1}{4} What advice would you give to a young person who is just starting out in life?
{10}{4} I would tell them to be patient and to not be afraid to fail.
It is important to learn from your mistakes and to keep trying.
{1}{4})"
},
{
k_participants.at(1),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{11} has critical thinking skills, is very knowledgeable and is a good listener. He is very humble and never arrogant.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {11}! What is your opinion on the current state of the world?
{11}{4} The world is about to experience a major change. We are on the verge of a new era.
{1}{4} What advice would you give to a young person who is just starting out in life?
{11}{4} My advice would be to be open minded and to be willing to learn from others.
{1}{4})"
},
{
k_participants.at(2),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{12} has strong leadership skills, strategic thinking, and innovative ideas. Has the ability to mentor and support young people.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {12}! What is your opinion on the current state of the world?
{12}{4} Our future is bright. We are living in a time of great opportunity.
{1}{4} What advice would you give to a young person who is just starting out in life?
{12}{4} I would tell them to be brave and to be willing to take risks.
{1}{4})"
},
{
k_participants.at(3),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{13} is rude, arrogant, and has a bad attitude. He is very opinionated and never listens to others.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {13}! What is your opinion on the current state of the world?
{13}{4} The world is a terrible place. It is full of evil and corruption.
{1}{4} What advice would you give to a young person who is just starting out in life?
{13}{4} I would tell them to be selfish and to never trust anyone.
{1}{4})"
},
};
int main(int argc, char ** argv) {
whisper_params params;
@ -286,21 +359,48 @@ int main(int argc, char ** argv) {
float prob0 = 0.0f;
const std::string chat_symb = ":";
const std::string bot_name = "LLaMA";
const std::string name_ni = params.name_ni;
const std::string name_ai = params.name_ai;
// the participant that was referenced last
std::string name_ref = name_ni;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);
std::string prompt_whisper = k_prompt_whisper;
prompt_whisper = ::replace(prompt_whisper, "{1}", name_ni);
prompt_whisper = ::replace(prompt_whisper, "{10}", k_participants.at(0));
prompt_whisper = ::replace(prompt_whisper, "{11}", k_participants.at(1));
prompt_whisper = ::replace(prompt_whisper, "{12}", k_participants.at(2));
prompt_whisper = ::replace(prompt_whisper, "{13}", k_participants.at(3));
// construct the initial prompt for LLaMA inference
std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
std::string prompt_llama = params.prompt.empty() ? k_prompt.find(name_ai)->second : params.prompt;
// need to have leading ' '
prompt_llama.insert(0, 1, ' ');
prompt_llama = ::replace(prompt_llama, "{0}", params.person);
prompt_llama = ::replace(prompt_llama, "{1}", bot_name);
prompt_llama = ::replace(prompt_llama, "{1}", name_ni);
prompt_llama = ::replace(prompt_llama, "{10}", k_participants.at(0));
prompt_llama = ::replace(prompt_llama, "{11}", k_participants.at(1));
prompt_llama = ::replace(prompt_llama, "{12}", k_participants.at(2));
prompt_llama = ::replace(prompt_llama, "{13}", k_participants.at(3));
{
// get date string
std::string date_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%d/%m/%Y", now);
date_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{1}", date_str);
}
{
// get time string
@ -343,21 +443,27 @@ int main(int argc, char ** argv) {
}
if (params.verbose_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_whisper.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_llama.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fflush(stdout);
}
printf("%s : done! start speaking in the microphone\n", __func__);
printf("\n");
printf("%s%s", params.person.c_str(), chat_symb.c_str());
printf("%s%s", name_ni.c_str(), chat_symb.c_str());
fflush(stdout);
// clear audio buffer
audio.clear();
// text inference variables
const int voice_id = 2;
const int voice_id = params.voice_id;
const int n_keep = embd_inp.size();
const int n_ctx = llama_n_ctx(ctx_llama);
@ -368,9 +474,15 @@ int main(int argc, char ** argv) {
// reverse prompts for detecting when it's time to stop speaking
std::vector<std::string> antiprompts = {
params.person + chat_symb,
name_ni + chat_symb,
};
for (const auto & p : k_participants) {
antiprompts.push_back(p + chat_symb);
}
std::string text_heard_all;
// main loop
while (is_running) {
// handle Ctrl + C
@ -386,7 +498,7 @@ int main(int argc, char ** argv) {
int64_t t_ms = 0;
{
audio.get(2000, pcmf32_cur);
audio.get(15000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
@ -432,104 +544,145 @@ int main(int argc, char ** argv) {
force_speak = false;
text_heard.insert(0, 1, ' ');
text_heard += "\n" + bot_name + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
if (text_heard[0] != ' ') {
text_heard.insert(0, 1, ' ');
}
embd = ::llama_tokenize(ctx_llama, text_heard, false);
// text inference
bool done = false;
std::string text_to_speak;
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
// replace homophones
for (const auto & homophone : k_homophones) {
for (const auto & word : homophone.second) {
text_heard = ::replace(text_heard, word, homophone.first);
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
// check which participant was mentioned
const auto name_ref_old = name_ref;
for (const auto & participant : k_participants) {
if (participant == name_ref) {
continue;
}
if (text_heard.find(participant) != std::string::npos) {
name_ref = participant;
break;
}
}
if (name_ref == name_ref_old && name_ref != name_ai) {
name_ref = name_ni;
}
text_heard += "\n" + name_ref + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
text_heard_all += text_heard;
// keep only last 100 characters
if (text_heard_all.size() > 100) {
text_heard_all = text_heard_all.substr(text_heard_all.size() - 100);
}
if (name_ref != name_ai) {
} else {
// text inference
bool done = false;
std::string text_to_speak;
embd = ::llama_tokenize(ctx_llama, text_heard_all, false);
text_heard_all.clear();
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.20f;
const float repeat_penalty = 1.0764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
}
// new line
if (id == 13) {
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (const std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
}
audio.clear();
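
talk-llama now hosts a round-table: the transcription is normalized through the k_homophones map so that misheard names like "gamma" resolve to the canonical participant "GGaMA", and the last-referenced participant decides whether the local LLaMA instance should answer at all. A small self-contained sketch of that normalization step; the replace() helper here is an assumption standing in for the ::replace used above, which is defined elsewhere in the example:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// assumption: an all-occurrences replace, standing in for the ::replace helper
static std::string replace(std::string s, const std::string & from, const std::string & to) {
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
        s.replace(pos, from.size(), to);
        pos += to.size();
    }
    return s;
}

int main() {
    // subset of the k_homophones table from the diff
    const std::map<std::string, std::vector<std::string>> k_homophones = {
        { "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA" } },
        { "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA" } },
    };

    std::string text_heard = "What do you think about that, gamma?";  // sample Whisper output

    // map every known mis-transcription back to the canonical participant name,
    // so the "which participant was mentioned" check can match it
    for (const auto & h : k_homophones) {
        for (const auto & word : h.second) {
            text_heard = replace(text_heard, word, h.first);
        }
    }

    printf("%s\n", text_heard.c_str());   // "What do you think about that, GGaMA?"
    return 0;
}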

gpt-2.cpp

@ -325,9 +325,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -528,9 +531,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);

gpt-2.cpp

@ -325,9 +325,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = nullptr;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -528,9 +530,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);

ggml.c (1324 changed lines)

File diff suppressed because it is too large

ggml.h (11 changed lines)

@ -316,6 +316,7 @@ struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
void ggml_time_init(void); // call this once at the beginning of the program
@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@ -748,8 +753,8 @@ enum ggml_opt_result ggml_opt(
// quantization
//
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info
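
The new no_alloc flag is what makes the mmap loading possible: a context created with no_alloc = true only lays out tensor metadata in its memory pool and leaves the data pointers for the caller to fill in, for example with 32-byte-aligned addresses inside the mapped model file. A short sketch of that usage, with a static array standing in for the mapping:

#include "ggml.h"
#include <cstdint>
#include <vector>

int main() {
    // metadata-only context: with no_alloc set, ggml does not reserve space
    // for tensor data inside mem_buffer, so a small pool holds the headers
    std::vector<uint8_t> buf(1024*1024);
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);

    // the caller supplies the storage: in llama.cpp this is an aligned offset
    // inside the mmap'd model file; here a static array stands in
    static float storage[8*8];
    w->data = storage;

    ggml_set_zero(w);   // safe now: the tensor points at real memory

    ggml_free(ctx);
    return 0;
}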

talk-ggama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "GGaMA" \
-t 8 -vid 1 --speak ./examples/talk-llama/speak.sh

talk-llama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "LLaMA" \
-t 8 -vid 0 --speak ./examples/talk-llama/speak.sh

talk-rrama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "RRaMA" \
-t 8 -vid 3 --speak ./examples/talk-llama/speak.sh

talk-ssama.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "SSaMA" \
-t 8 -vid 2 --speak ./examples/talk-llama/speak.sh

whisper.cpp

@ -654,9 +654,11 @@ static bool kv_cache_init(
int n_ctx) {
cache.buf.resize(mem_bytes);
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
cache.ctx = ggml_init(params);
@ -688,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
cache.ctx = ggml_init(params);
@ -1028,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = wctx.model.buf->size();
params.mem_buffer = wctx.model.buf->data();
struct ggml_init_params params = {
/*.mem_size =*/ wctx.model.buf->size(),
/*.mem_buffer =*/ wctx.model.buf->data(),
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -1344,9 +1350,11 @@ static bool whisper_encode_internal(
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -1797,9 +1805,11 @@ static bool whisper_decode_internal(
//WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -4726,6 +4736,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(gparams);