Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-02-21 09:21:49 +00:00)
Refactor lora adapter support (llama/8332)
* lora: load to device buft
* add patch tensor function
* correct tensor patch
* llama_lora_adapter_apply
* correct ggml_backend_tensor_copy
* add llm_build_mm
* fix auto merge
* update based on review comments
* add convert script
* no more transpose A
* add f16 convert
* add metadata check
* add sanity check
* fix ftype
* add requirements
* fix requirements
* fix outfile
* conversion: only allow selected models
* fix types
* cuda : do not use dmmv if the tensor does not have enough cols
* llama : lora fixes
* do not disable mmap with lora

  Co-authored-by: slaren <slarengh@gmail.com>

* llm_build_lora_mm_id
* convert_lora : MoE LoRA conversion support
* convert_lora : prefer safetensors, similarly to convert_hf
* convert_hf : simplify modify_tensors for InternLM2
* convert_lora : lazy conversion
* llama : load and use alpha from LoRA adapters
* llama : use llm_build_lora_mm in most model graphs
* auto scale
* Revert "auto scale"

  This reverts commit 42415a4874e0f963e4aca6796ea5dfb97cd17464.

* remove redundant params
* Apply suggestions from code review

  Co-authored-by: slaren <slarengh@gmail.com>

* change kv metadata
* move add_type to __init__
* convert_hf : move add_type to main()
* convert_lora : use the GGUFWriter from Model instead of overwriting it

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
This commit is contained in: parent 3e94c7a81d, commit 8807fe608b
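For context, the llm_build_lora_mm helper mentioned in the commit message folds loaded adapters into each matrix multiplication. Below is a minimal sketch of that pattern, assuming the standard ggml_mul_mat / ggml_scale / ggml_add ops; the function and parameter names are illustrative, not the exact llama.cpp code, which iterates over all loaded adapters and looks up the (A, B) pair for each weight.

// Sketch of the LoRA matmul pattern: result = W @ x + scale * (B @ (A @ x)),
// with scale typically alpha / rank. Illustrative only.
#include "ggml.h"

static struct ggml_tensor * build_lora_mm_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,       // base model weight
        struct ggml_tensor  * cur,     // input activations
        struct ggml_tensor  * lora_a,  // low-rank A
        struct ggml_tensor  * lora_b,  // low-rank B
        float                 alpha) {
    const float rank  = (float) lora_b->ne[0];
    const float scale = alpha ? alpha / rank : 1.0f;

    struct ggml_tensor * res = ggml_mul_mat(ctx, w, cur);           // W @ x
    struct ggml_tensor * ab  = ggml_mul_mat(ctx, lora_b,
                               ggml_mul_mat(ctx, lora_a, cur));     // B @ (A @ x)

    return ggml_add(ctx, res, ggml_scale(ctx, ab, scale));          // combine with adapter scale
}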
@@ -1876,7 +1876,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
+        && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
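The extra GGML_CUDA_DMMV_X*2 requirement above is the "cuda : do not use dmmv if the tensor does not have enough cols" item from the commit message: LoRA A/B matrices can be narrow enough that the dequantize-mul-mat-vec kernel has no valid work to do. A standalone sketch of the tightened guard, assuming GGML_CUDA_DMMV_X is 32 as in ggml-cuda at the time (an assumption, not something this commit changes):

// Sketch of the tightened dmmv column check.
#include <stdbool.h>
#include <stdint.h>

#define GGML_CUDA_DMMV_X 32  // assumed default block width from ggml-cuda

static bool dmmv_cols_ok(int64_t ne00) {
    // columns must be a multiple of the block width AND span at least two blocks
    return ne00 % GGML_CUDA_DMMV_X == 0 && ne00 >= GGML_CUDA_DMMV_X*2;
}

// Example: a rank-32 LoRA "B" weight has ne[0] == 32. It passes the old
// multiple-of-32 check but fails the new >= 64 requirement, so dmmv is skipped
// and the multiplication takes a different matmul path instead.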
@@ -19477,7 +19477,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     fprintf(fp, "digraph G {\n");
     fprintf(fp, "  newrank = true;\n");
-    fprintf(fp, "  rankdir = LR;\n");
+    fprintf(fp, "  rankdir = TB;\n");

     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
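The rankdir change only affects the generated Graphviz layout (top-to-bottom instead of left-to-right). A hedged usage sketch for inspecting a built compute graph with this function; the helper and file names are arbitrary:

// Dump a compute graph to a DOT file; with this change the rendering is laid out top-to-bottom.
#include "ggml.h"

void dump_graph_for_inspection(const struct ggml_cgraph * gf) {
    ggml_graph_dump_dot(gf, NULL, "ggml-graph.dot");
    // render with: dot -Tpng ggml-graph.dot -o ggml-graph.png
}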
@@ -19539,7 +19539,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         }

         fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        if (ggml_nelements(node) < 5) {
+        if (ggml_nelements(node) < 5 && node->data != NULL) {
             fprintf(fp, " | (");
             for (int j = 0; j < ggml_nelements(node); j++) {
                 if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
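The added node->data != NULL guard matters because graph tensors are not guaranteed to have host-readable data (for example, before allocation), and dereferencing a NULL pointer while printing constant values would crash the dump. A small sketch of the same defensive pattern, assuming ggml_nelements / ggml_get_f32_1d; the helper name is illustrative:

// Print a small tensor's values only when its data pointer is actually readable;
// this mirrors the node->data != NULL guard added to ggml_graph_dump_dot.
#include <stdio.h>
#include "ggml.h"

static void print_small_tensor(FILE * fp, const struct ggml_tensor * node) {
    if (ggml_nelements(node) >= 5 || node->data == NULL) {
        return; // too large to print inline, or no host-accessible data
    }
    for (int64_t j = 0; j < ggml_nelements(node); j++) {
        fprintf(fp, "%s%.3f", j > 0 ? ", " : "", ggml_get_f32_1d(node, (int) j));
    }
    fprintf(fp, "\n");
}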