mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-17 14:28:07 +00:00
ggml : group all experts in a single ggml_mul_mat_id (llama/6505)
* ggml : group all experts in a single ggml_mul_mat_id cuda : improve mmid row copy * cuda : fix bin bcast with non-cont src0 * test-backend-ops : only run all mul mat tests for base types * llama : disable moe offloading with SYCL --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
125
ggml.c
125
ggml.c
@ -4594,21 +4594,32 @@ void ggml_mul_mat_set_prec(
|
||||
|
||||
// ggml_mul_mat_id
|
||||
|
||||
// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
|
||||
// this will allow computing all the used experts in a single matrix multiplication
|
||||
/*
|
||||
c = ggml_mul_mat_id(ctx, as, b, ids);
|
||||
|
||||
as -> [cols, rows, n_expert]
|
||||
ids -> [n_experts_used, n_tokens] (i32)
|
||||
b -> [cols, n_expert_used, n_tokens]
|
||||
c -> [cols, n_expert_used, n_tokens]
|
||||
|
||||
in b, n_experts_used can be broadcasted to match the n_expert_used of ids
|
||||
|
||||
c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
|
||||
*/
|
||||
struct ggml_tensor * ggml_mul_mat_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * as,
|
||||
struct ggml_tensor * ids,
|
||||
int id,
|
||||
struct ggml_tensor * b) {
|
||||
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids) {
|
||||
GGML_ASSERT(!ggml_is_transposed(as));
|
||||
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
|
||||
GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
|
||||
GGML_ASSERT(b->ne[3] == 1); // b is 3d
|
||||
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
|
||||
GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
|
||||
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
|
||||
GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
|
||||
GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
|
||||
GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
|
||||
GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
@ -4616,11 +4627,9 @@ struct ggml_tensor * ggml_mul_mat_id(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
||||
const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
ggml_set_op_params_i32(result, 0, id);
|
||||
|
||||
result->op = GGML_OP_MUL_MAT_ID;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = as;
|
||||
@ -11071,11 +11080,6 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
||||
@ -11086,22 +11090,21 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
// broadcast is not supported with mmid
|
||||
assert(ne12 == 1);
|
||||
assert(ne13 == 1);
|
||||
|
||||
// row groups
|
||||
const int id = ggml_get_op_params_i32(dst, 0);
|
||||
const int n_as = src0->ne[2];
|
||||
const int n_ids = ids->ne[0]; // n_expert_used
|
||||
const int n_as = ne02; // n_expert
|
||||
|
||||
char * wdata_src1_end = (src1->type == vec_dot_type) ?
|
||||
(char *) params->wdata :
|
||||
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
|
||||
|
||||
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
|
||||
int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
|
||||
struct mmid_row_mapping {
|
||||
int32_t i1;
|
||||
int32_t i2;
|
||||
};
|
||||
|
||||
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
|
||||
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
|
||||
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
if (ith != 0) {
|
||||
@ -11127,13 +11130,18 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
// initialize matrix_row_counts
|
||||
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
|
||||
|
||||
// group rows by src0 matrix
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
|
||||
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
|
||||
|
||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||
MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
|
||||
matrix_row_counts[row_id] += 1;
|
||||
// group rows by src0 matrix
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
|
||||
for (int id = 0; id < n_ids; ++id) {
|
||||
const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
assert(i02 >= 0 && i02 < n_as);
|
||||
|
||||
MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
|
||||
matrix_row_counts[i02] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
@ -11151,15 +11159,13 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t src0_offset = cur_a*src0->nb[2];
|
||||
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
|
||||
|
||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
const int64_t nr0 = ne01; // src0 rows
|
||||
const int64_t nr1 = cne1*ne12*ne13; // src1 rows
|
||||
|
||||
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
|
||||
const int64_t nr0 = ne01; // src0 rows
|
||||
const int64_t nr1 = cne1; // src1 rows
|
||||
|
||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||
|
||||
@ -11178,13 +11184,11 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
const int64_t ir110 = dr1*ith1;
|
||||
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
||||
|
||||
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
|
||||
|
||||
// threads with no work simply yield (not sure if it helps)
|
||||
if (ir010 >= ir011 || ir110 >= ir111) {
|
||||
sched_yield();
|
||||
continue;
|
||||
}
|
||||
//if (ir010 >= ir011 || ir110 >= ir111) {
|
||||
// sched_yield();
|
||||
// continue;
|
||||
//}
|
||||
|
||||
// block-tiling attempt
|
||||
const int64_t blck_0 = 16;
|
||||
@ -11196,20 +11200,16 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
||||
const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
|
||||
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
|
||||
const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
|
||||
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
|
||||
const int64_t _i12 = ir1; // logical row index for this expert
|
||||
|
||||
// broadcast src0 into src1
|
||||
//const int64_t i03 = i13/r3;
|
||||
//const int64_t i02 = i12/r2;
|
||||
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
|
||||
const int id = row_mapping.i1; // selected expert index
|
||||
|
||||
const int64_t i1 = i11;
|
||||
const int64_t i2 = i12;
|
||||
const int64_t i3 = i13;
|
||||
const int64_t i11 = id % ne11;
|
||||
const int64_t i12 = row_mapping.i2; // row index in src1
|
||||
|
||||
const char * src0_row = (const char *) src0->data + src0_offset;
|
||||
const int64_t i1 = id; // selected expert index
|
||||
const int64_t i2 = i12; // row
|
||||
|
||||
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
||||
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
||||
@ -11217,25 +11217,26 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
||||
const char * src1_col = (const char *) wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||
? (i11 + i12*ne11)*row_size
|
||||
: (i11*nb11 + i12*nb12));
|
||||
|
||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
|
||||
|
||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||
//}
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
|
||||
}
|
||||
|
||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef MMID_MATRIX_ROW
|
||||
#undef MMID_MATRIX_ROW
|
||||
}
|
||||
|
||||
// ggml_compute_forward_out_prod
|
||||
@ -18583,7 +18584,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
const int n_as = src0->ne[2];
|
||||
cur += GGML_PAD(cur, sizeof(int64_t)); // align
|
||||
cur += n_as * sizeof(int64_t); // matrix_row_counts
|
||||
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
|
||||
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
|
||||
} break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
{
|
||||
@ -21009,12 +21010,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
ok = ok && cur != NULL;
|
||||
|
||||
ggml_set_name(cur, ctx->infos[i].name.data);
|
||||
|
||||
if (!ok) {
|
||||
break;
|
||||
}
|
||||
|
||||
ggml_set_name(cur, ctx->infos[i].name.data);
|
||||
|
||||
// point the data member to the appropriate location in the binary blob using the tensor infos
|
||||
if (!params.no_alloc) {
|
||||
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
|
||||
|
Reference in New Issue
Block a user