mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-20 05:07:52 +00:00
talk-llama : sync llama.cpp
This commit is contained in:
parent
276615d708
commit
59119f4f20
File diff suppressed because it is too large
Load Diff
@ -100,6 +100,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
@ -112,6 +113,12 @@ extern "C" {
|
||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
};
|
||||
|
||||
enum llama_pooling_type {
|
||||
LLAMA_POOLING_NONE = 0,
|
||||
LLAMA_POOLING_MEAN = 1,
|
||||
LLAMA_POOLING_CLS = 2,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
@ -236,6 +243,7 @@ extern "C" {
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embedding; // embedding mode only
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
@ -297,6 +305,12 @@ extern "C" {
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
// used in chat template
|
||||
typedef struct llama_chat_message {
|
||||
const char * role;
|
||||
const char * content;
|
||||
} llama_chat_message;
|
||||
|
||||
// Helpers for getting default parameters
|
||||
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
@ -305,7 +319,10 @@ extern "C" {
|
||||
// Initialize the llama + ggml backend
|
||||
// If numa is true, use NUMA optimizations
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(bool numa);
|
||||
LLAMA_API void llama_backend_init(void);
|
||||
|
||||
//optional:
|
||||
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
@ -628,6 +645,10 @@ extern "C" {
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the ith sequence
|
||||
// llama_get_embeddings(ctx) + i*n_embd
|
||||
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
//
|
||||
// Vocab
|
||||
//
|
||||
@ -684,6 +705,25 @@ extern "C" {
|
||||
char * buf,
|
||||
int32_t length);
|
||||
|
||||
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||
/// Both "model" and "tmpl" are optional, but at least one is required. "tmpl" has higher precedence than "model"
|
||||
/// NOTE: This function only supports some known Jinja templates. It is not a Jinja parser.
|
||||
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
||||
/// @param chat Pointer to a list of multiple llama_chat_message
|
||||
/// @param n_msg Number of llama_chat_message in this chat
|
||||
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
|
||||
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
|
||||
/// @param length The size of the allocated buffer
|
||||
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
|
||||
LLAMA_API int32_t llama_chat_apply_template(
|
||||
const struct llama_model * model,
|
||||
const char * tmpl,
|
||||
const struct llama_chat_message * chat,
|
||||
size_t n_msg,
|
||||
bool add_ass,
|
||||
char * buf,
|
||||
int32_t length);
|
||||
|
||||
//
|
||||
// Grammar
|
||||
//
|
||||
|
@ -288,7 +288,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// llama init
|
||||
|
||||
llama_backend_init(true);
|
||||
llama_backend_init();
|
||||
|
||||
auto lmparams = llama_model_default_params();
|
||||
if (!params.use_gpu) {
|
||||
|
@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
offset += 1;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x40)) {
|
||||
if (!(utf8[offset + 0] & 0x40)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x20)) {
|
||||
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
|
||||
if (!(utf8[offset + 0] & 0x20)) {
|
||||
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
|
||||
offset += 2;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x10)) {
|
||||
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
|
||||
if (!(utf8[offset + 0] & 0x10)) {
|
||||
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
|
||||
offset += 3;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x08)) {
|
||||
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
|
||||
if (!(utf8[offset + 0] & 0x08)) {
|
||||
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
|
||||
offset += 4;
|
||||
return result;
|
||||
@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
|
||||
offset += 1;
|
||||
return result;
|
||||
}
|
||||
else {
|
||||
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
|
||||
|
||||
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
|
||||
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
|
||||
offset += 2;
|
||||
return result;
|
||||
}
|
||||
throw std::invalid_argument("invalid string");
|
||||
}
|
||||
|
||||
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
|
||||
std::vector<uint32_t> result;
|
||||
size_t offset = 0;
|
||||
while (offset < utf16.size())
|
||||
while (offset < utf16.size()) {
|
||||
result.push_back(codepoint_from_utf16(utf16, offset));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
|
||||
static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||
std::unordered_map<uint32_t, int> codepoint_types;
|
||||
for (auto p : digit_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
|
||||
}
|
||||
for(auto p : letter_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
}
|
||||
for (auto p : letter_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
|
||||
}
|
||||
for(auto p : whitespace_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
}
|
||||
for (auto p : whitespace_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
||||
}
|
||||
for(auto p : accent_mark_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
}
|
||||
for (auto p : accent_mark_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
||||
}
|
||||
for(auto p : punctuation_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
}
|
||||
for (auto p : punctuation_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
||||
}
|
||||
}
|
||||
for (auto p : symbol_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++i)
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
|
||||
}
|
||||
for(auto p : control_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
}
|
||||
for (auto p : control_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
|
||||
}
|
||||
}
|
||||
return codepoint_types;
|
||||
}
|
||||
|
||||
static int codepoint_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||
return codepoint_types[cp];
|
||||
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||
}
|
||||
|
||||
static int codepoint_type(const std::string & utf8) {
|
||||
if (utf8.length() == 0)
|
||||
if (utf8.length() == 0) {
|
||||
return CODEPOINT_TYPE_UNIDENTIFIED;
|
||||
}
|
||||
size_t offset = 0;
|
||||
return codepoint_type(codepoint_from_utf8(utf8, offset));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user