mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-11 03:31:36 +00:00
talk-llama : sync llama.cpp
This commit is contained in:
parent
276615d708
commit
59119f4f20
File diff suppressed because it is too large
Load Diff
@ -100,6 +100,7 @@ extern "C" {
|
|||||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||||
|
|
||||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||||
};
|
};
|
||||||
@ -112,6 +113,12 @@ extern "C" {
|
|||||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_pooling_type {
|
||||||
|
LLAMA_POOLING_NONE = 0,
|
||||||
|
LLAMA_POOLING_MEAN = 1,
|
||||||
|
LLAMA_POOLING_CLS = 2,
|
||||||
|
};
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||||
@ -236,6 +243,7 @@ extern "C" {
|
|||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||||
bool embedding; // embedding mode only
|
bool embedding; // embedding mode only
|
||||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
|
bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
|
||||||
};
|
};
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
@ -297,6 +305,12 @@ extern "C" {
|
|||||||
int32_t n_eval;
|
int32_t n_eval;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// used in chat template
|
||||||
|
typedef struct llama_chat_message {
|
||||||
|
const char * role;
|
||||||
|
const char * content;
|
||||||
|
} llama_chat_message;
|
||||||
|
|
||||||
// Helpers for getting default parameters
|
// Helpers for getting default parameters
|
||||||
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||||
@ -305,7 +319,10 @@ extern "C" {
|
|||||||
// Initialize the llama + ggml backend
|
// Initialize the llama + ggml backend
|
||||||
// If numa is true, use NUMA optimizations
|
// If numa is true, use NUMA optimizations
|
||||||
// Call once at the start of the program
|
// Call once at the start of the program
|
||||||
LLAMA_API void llama_backend_init(bool numa);
|
LLAMA_API void llama_backend_init(void);
|
||||||
|
|
||||||
|
//optional:
|
||||||
|
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
||||||
|
|
||||||
// Call once at the end of the program - currently only used for MPI
|
// Call once at the end of the program - currently only used for MPI
|
||||||
LLAMA_API void llama_backend_free(void);
|
LLAMA_API void llama_backend_free(void);
|
||||||
@ -628,6 +645,10 @@ extern "C" {
|
|||||||
// shape: [n_embd] (1-dimensional)
|
// shape: [n_embd] (1-dimensional)
|
||||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Get the embeddings for the ith sequence
|
||||||
|
// llama_get_embeddings(ctx) + i*n_embd
|
||||||
|
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Vocab
|
// Vocab
|
||||||
//
|
//
|
||||||
@ -684,6 +705,25 @@ extern "C" {
|
|||||||
char * buf,
|
char * buf,
|
||||||
int32_t length);
|
int32_t length);
|
||||||
|
|
||||||
|
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||||
|
/// Both "model" and "tmpl" (the custom template) are optional, but at least one is required. "tmpl" has higher precedence than "model".
|
||||||
|
/// NOTE: This function only supports some known jinja templates. It is not a jinja parser.
|
||||||
|
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
||||||
|
/// @param chat Pointer to a list of multiple llama_chat_message
|
||||||
|
/// @param n_msg Number of llama_chat_message in this chat
|
||||||
|
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
|
||||||
|
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
|
||||||
|
/// @param length The size of the allocated buffer
|
||||||
|
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
|
||||||
|
LLAMA_API int32_t llama_chat_apply_template(
|
||||||
|
const struct llama_model * model,
|
||||||
|
const char * tmpl,
|
||||||
|
const struct llama_chat_message * chat,
|
||||||
|
size_t n_msg,
|
||||||
|
bool add_ass,
|
||||||
|
char * buf,
|
||||||
|
int32_t length);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Grammar
|
// Grammar
|
||||||
//
|
//
|
||||||
|
@ -288,7 +288,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// llama init
|
// llama init
|
||||||
|
|
||||||
llama_backend_init(true);
|
llama_backend_init();
|
||||||
|
|
||||||
auto lmparams = llama_model_default_params();
|
auto lmparams = llama_model_default_params();
|
||||||
if (!params.use_gpu) {
|
if (!params.use_gpu) {
|
||||||
|
@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
|
|||||||
offset += 1;
|
offset += 1;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
else if (!(utf8[offset + 0] & 0x40)) {
|
if (!(utf8[offset + 0] & 0x40)) {
|
||||||
throw std::invalid_argument("invalid character");
|
throw std::invalid_argument("invalid character");
|
||||||
}
|
}
|
||||||
else if (!(utf8[offset + 0] & 0x20)) {
|
if (!(utf8[offset + 0] & 0x20)) {
|
||||||
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
|
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
|
||||||
throw std::invalid_argument("invalid character");
|
throw std::invalid_argument("invalid character");
|
||||||
|
}
|
||||||
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
|
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
|
||||||
offset += 2;
|
offset += 2;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
else if (!(utf8[offset + 0] & 0x10)) {
|
if (!(utf8[offset + 0] & 0x10)) {
|
||||||
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
|
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
|
||||||
throw std::invalid_argument("invalid character");
|
throw std::invalid_argument("invalid character");
|
||||||
|
}
|
||||||
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
|
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
|
||||||
offset += 3;
|
offset += 3;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
else if (!(utf8[offset + 0] & 0x08)) {
|
if (!(utf8[offset + 0] & 0x08)) {
|
||||||
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
|
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
|
||||||
throw std::invalid_argument("invalid character");
|
throw std::invalid_argument("invalid character");
|
||||||
|
}
|
||||||
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
|
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
|
||||||
offset += 4;
|
offset += 4;
|
||||||
return result;
|
return result;
|
||||||
@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
|
|||||||
offset += 1;
|
offset += 1;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
|
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
|
||||||
throw std::invalid_argument("invalid character");
|
throw std::invalid_argument("invalid character");
|
||||||
|
}
|
||||||
|
|
||||||
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
|
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
|
||||||
offset += 2;
|
offset += 2;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
throw std::invalid_argument("invalid string");
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
|
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
|
||||||
std::vector<uint32_t> result;
|
std::vector<uint32_t> result;
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
while (offset < utf16.size())
|
while (offset < utf16.size()) {
|
||||||
result.push_back(codepoint_from_utf16(utf16, offset));
|
result.push_back(codepoint_from_utf16(utf16, offset));
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
|
|||||||
static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||||
std::unordered_map<uint32_t, int> codepoint_types;
|
std::unordered_map<uint32_t, int> codepoint_types;
|
||||||
for (auto p : digit_ranges) {
|
for (auto p : digit_ranges) {
|
||||||
for(auto i = p.first; i <= p.second; ++ i)
|
for (auto i = p.first; i <= p.second; ++ i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
|
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
for (auto p : letter_ranges) {
|
for (auto p : letter_ranges) {
|
||||||
for(auto i = p.first; i <= p.second; ++ i)
|
for (auto i = p.first; i <= p.second; ++ i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
|
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
for (auto p : whitespace_ranges) {
|
for (auto p : whitespace_ranges) {
|
||||||
for(auto i = p.first; i <= p.second; ++ i)
|
for (auto i = p.first; i <= p.second; ++ i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
for (auto p : accent_mark_ranges) {
|
for (auto p : accent_mark_ranges) {
|
||||||
for(auto i = p.first; i <= p.second; ++ i)
|
for (auto i = p.first; i <= p.second; ++ i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
for (auto p : punctuation_ranges) {
|
for (auto p : punctuation_ranges) {
|
||||||
for(auto i = p.first; i <= p.second; ++ i)
|
for (auto i = p.first; i <= p.second; ++ i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
for (auto p : symbol_ranges) {
|
for (auto p : symbol_ranges) {
|
||||||
for (auto i = p.first; i <= p.second; ++i)
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
|
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
for (auto p : control_ranges) {
|
for (auto p : control_ranges) {
|
||||||
for(auto i = p.first; i <= p.second; ++ i)
|
for (auto i = p.first; i <= p.second; ++ i) {
|
||||||
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
|
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return codepoint_types;
|
return codepoint_types;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int codepoint_type(uint32_t cp) {
|
static int codepoint_type(uint32_t cp) {
|
||||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||||
return codepoint_types[cp];
|
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int codepoint_type(const std::string & utf8) {
|
static int codepoint_type(const std::string & utf8) {
|
||||||
if (utf8.length() == 0)
|
if (utf8.length() == 0) {
|
||||||
return CODEPOINT_TYPE_UNIDENTIFIED;
|
return CODEPOINT_TYPE_UNIDENTIFIED;
|
||||||
|
}
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
return codepoint_type(codepoint_from_utf8(utf8, offset));
|
return codepoint_type(codepoint_from_utf8(utf8, offset));
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user