From d4353e48f77214e0a63b2980c5f23583db5a9e3d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Nov 2023 10:00:07 +0200 Subject: [PATCH] sync : ggml (ggml-alloc + linker + gguf fixes) (#1501) --- ggml-alloc.c | 23 ++-- ggml-quants.c | 5 + ggml.c | 341 ++++++++++++-------------------------------------- ggml.h | 5 + 4 files changed, 105 insertions(+), 269 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index f400dc56..cdfe4caf 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)]; } -static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) { +static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) { ggml_tallocr_t alloc = node_tallocr(galloc, view); //printf("init_view: %s from src %s\n", view->name, view->view_src->name); GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); - view->backend = view->view_src->backend; + if (update_backend) { + view->backend = view->view_src->backend; + } view->buffer = view->view_src->buffer; view->data = (char *)view->view_src->data + view->view_offs; @@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { if (node->data == NULL) { if (ggml_is_view(node)) { - init_view(galloc, node); + init_view(galloc, node, true); } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->view_src = view_src; view_src_hn->n_views += 1; - init_view(galloc, node); + init_view(galloc, node, false); return; } - } - else { + } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->view_src = parent; p_hn->n_views += 1; - init_view(galloc, node); + init_view(galloc, node, false); return; } } @@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr hash_get(galloc, view_src)->n_views += 1; if (node->buffer == NULL && node->data != NULL) { // view of a pre-allocated tensor, didn't call init_view() yet - init_view(galloc, node); + init_view(galloc, node, true); } } @@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } hash_get(galloc, parent)->n_children += 1; if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(galloc, parent); + init_view(galloc, parent, true); } } } @@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st return max_size; } -void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) { +void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) { const size_t hash_size = hash_set.size; GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); @@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // reset hash values memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); - galloc->hash_allocs = hash_node_alloct; + galloc->hash_allocs = hash_node_talloc; ggml_tallocr_alloc_graph_impl(galloc, graph); diff --git a/ggml-quants.c b/ggml-quants.c index a48eda73..cf2860b8 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f float max = x[0]; float sum_w = weights[0]; float sum_x = sum_w * x[0]; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 + for (volatile int i = 1; i < n; ++i) { +#else for (int i = 1; i < n; ++i) { +#endif if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; float w = weights[i]; diff --git a/ggml.c b/ggml.c index 584ee468..ada1067d 100644 --- a/ggml.c +++ b/ggml.c @@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down) { GGML_ASSERT(ggml_is_vector(b)); @@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; @@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif - static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst, + const bool forward) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32( float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; theta_base *= theta_scale; block_theta *= theta_scale; @@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32( rope_yarn( theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; @@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32( theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; theta_base *= theta_scale; @@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst, + const bool forward) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16( float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; theta_base *= theta_scale; block_theta *= theta_scale; @@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16( rope_yarn( theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; theta_base *= theta_scale; @@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16( theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta ); + sin_theta *= sin_sign; theta_base *= theta_scale; @@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst); + ggml_compute_forward_rope_f16(params, src0, src1, dst, true); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst); + ggml_compute_forward_rope_f32(params, src0, src1, dst, true); } break; default: { @@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope( // ggml_compute_forward_rope_back -static void ggml_compute_forward_rope_back_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - // y = rope(x, src1) - // dx = rope_back(dy, src1) - // src0 is dy, src1 contains options - - float freq_base; - float freq_scale; - - // these two only relevant for xPos RoPE: - float xpos_base; - bool xpos_down; - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - - GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(dst); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta_base = freq_scale * (float)p; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; - if (xpos_down) zeta = 1.0f / zeta; - - theta_base *= theta_scale; - - const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = dy[0]; - const float dy1 = dy[1]; - - dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; - dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; - } - } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - theta_base *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = dy[0]; - const float dy1 = dy[n_dims/2]; - - dx[0] = dy0*cos_theta + dy1*sin_theta; - dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta; - } - } - } - } - } - } -} - -static void ggml_compute_forward_rope_back_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - // y = rope(x, src1) - // dx = rope_back(dy, src1) - // src0 is dy, src1 contains options - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - - GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(dst); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(10000.0, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta_base = (float)p; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - theta_base *= theta_scale; - - const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = GGML_FP16_TO_FP32(dy[0]); - const float dy1 = GGML_FP16_TO_FP32(dy[1]); - - dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); - } - } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); - - theta_base *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = GGML_FP16_TO_FP32(dy[0]); - const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]); - - dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); - } - } - } - } - } - } -} - static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + ggml_compute_forward_rope_f16(params, src0, src1, dst, false); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + ggml_compute_forward_rope_f32(params, src0, src1, dst, false); } break; default: { @@ -14923,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { //const int n_past = ((int32_t *) tensor->op_params)[0]; - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - float freq_base; - float freq_scale; - float xpos_base; - bool xpos_down; - memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = ggml_add_or_set(ctx, src0->grad, @@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor n_dims, mode, n_ctx, + n_orig_ctx, freq_base, freq_scale, + ext_factor, + attn_factor, + beta_fast, + beta_slow, xpos_base, xpos_down), zero_table); @@ -14954,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { //const int n_past = ((int32_t *) tensor->op_params)[0]; - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - float freq_base; - float freq_scale; - float xpos_base; - bool xpos_down; - memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = ggml_add_or_set(ctx, src0->grad, @@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1, n_dims, mode, - 0, n_ctx, + n_orig_ctx, freq_base, freq_scale, - 0.0f, - 1.0f, - 0.0f, - 0.0f, + ext_factor, + attn_factor, + beta_fast, + beta_slow, xpos_base, xpos_down, false), @@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); - for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; //fprintf(stderr, "%s: reading kv %d\n", __func__, i); @@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_STRING: { kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); - for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); } } break; @@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p { ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; for (int j = 0; j < GGML_MAX_DIMS; ++j) { @@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // compute the total size of the data section, taking into account the alignment { ctx->size = 0; - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; const int64_t ne = @@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ggml_set_no_alloc(ctx_data, true); // create the tensors - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[GGML_MAX_DIMS] = { ctx->infos[i].ne[0], ctx->infos[i].ne[1], diff --git a/ggml.h b/ggml.h index 52ae6755..8e6b6460 100644 --- a/ggml.h +++ b/ggml.h @@ -1371,8 +1371,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down);