diff --git a/ggml-quants.c b/ggml-quants.c index 5cb7ec62..6cd12f70 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1704,16 +1704,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; - quantize_row_q2_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q2_K)); -} - static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights, uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, float rmin, float rdelta, int nstep, bool use_mad) { @@ -1966,8 +1956,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); if (!quant_weights) { quantize_row_q2_K_reference(src, dst, nrow*n_per_row); @@ -2186,16 +2175,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; - quantize_row_q3_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q3_K)); -} - static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -2285,8 +2264,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri #endif } -size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row); if (!quant_weights) { quantize_row_q3_K_reference(src, dst, nrow*n_per_row); @@ -2456,17 +2434,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; - quantize_row_q4_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q4_K)); -} - static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -2545,8 +2512,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri #endif } -size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); if (!quant_weights) { quantize_row_q4_K_reference(src, dst, nrow*n_per_row); @@ -2757,17 +2723,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; - quantize_row_q5_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q5_K)); -} - static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -2866,8 +2821,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri #endif } -size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); if (!quant_weights) { quantize_row_q5_K_reference(src, dst, nrow*n_per_row); @@ -3020,17 +2974,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q6_K_reference(x, y, k); } -size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; - quantize_row_q6_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q6_K)); -} - static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -3120,8 +3063,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri #endif } -size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); if (!quant_weights) { quantize_row_q6_K_reference(src, dst, nrow*n_per_row); @@ -3165,9 +3107,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri } } -size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q4_0_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); char * qrow = (char *)dst; @@ -3209,9 +3152,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri } } -size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q4_1_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row); char * qrow = (char *)dst; @@ -3262,9 +3206,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri } } -size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q5_0_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row); char * qrow = (char *)dst; @@ -3314,9 +3259,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri } } -size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q5_1_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row); char * qrow = (char *)dst; @@ -3328,6 +3274,13 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int return nrow * row_size; } +size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); + quantize_row_q8_0_reference(src, dst, nrow*n_per_row); + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { @@ -9373,7 +9326,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9621,7 +9574,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #endif -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -10221,7 +10174,7 @@ void iq2xs_init_impl(enum ggml_type type) { int * kmap_q2xs; uint16_t * kneighbors_q2xs; - printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t)); for (int k = 0; k < grid_size; ++k) { int8_t * pos = (int8_t *)(the_grid + k); @@ -10276,7 +10229,7 @@ void iq2xs_init_impl(enum ggml_type type) { } num_neighbors += n; } - printf("%s: %d neighbours in total\n", __func__, num_neighbors); + //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq2_data[gindex].neighbours = kneighbors_q2xs; int counter = 0; @@ -10699,8 +10652,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } } -size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -10712,8 +10664,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, return nrow * nblock * sizeof(block_iq2_xxs); } -size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -10817,7 +10768,7 @@ void iq3xs_init_impl(int grid_size) { int * kmap_q3xs; uint16_t * kneighbors_q3xs; - printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t)); for (int k = 0; k < grid_size; ++k) { int8_t * pos = (int8_t *)(the_grid + k); @@ -10872,7 +10823,7 @@ void iq3xs_init_impl(int grid_size) { } num_neighbors += n; } - printf("%s: %d neighbours in total\n", __func__, num_neighbors); + //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq3_data[gindex].neighbours = kneighbors_q3xs; int counter = 0; @@ -11155,8 +11106,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } } -size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11362,8 +11312,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } #define IQ3S_BLOCK_SIZE 32 -size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; float scales[QK_K/IQ3S_BLOCK_SIZE]; @@ -11393,7 +11342,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) { assert(k % QK_K == 0); - quantize_iq3_s(x, y, 1, k, NULL, NULL); + quantize_iq3_s(x, y, 1, k, NULL); } @@ -11588,8 +11537,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11614,7 +11562,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) { return x - val[mu-1] < val[mu] - x ? mu-1 : mu; } -static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x, +static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, float * scales, float * weight, uint8_t * L, const int8_t * values, @@ -11722,8 +11670,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } -size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK4_NL == 0); int nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; @@ -11753,14 +11700,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { assert(k % QK4_NL == 0); - quantize_iq4_nl(x, y, 1, k, NULL, NULL); + quantize_iq4_nl(x, y, 1, k, NULL); } -size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { #if QK_K == 64 - return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights); + return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights); #else - (void)hist; GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11789,7 +11735,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) { assert(k % QK_K == 0); - quantize_iq4_xs(x, y, 1, k, NULL, NULL); + quantize_iq4_xs(x, y, 1, k, NULL); } // =============================== 2.5625 bpw @@ -11962,8 +11908,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11977,7 +11922,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) { assert(k % QK_K == 0); - quantize_iq2_s(x, y, 1, k, NULL, NULL); + quantize_iq2_s(x, y, 1, k, NULL); } void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) { diff --git a/ggml-quants.h b/ggml-quants.h index d2d25eb4..47dd5285 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -261,6 +261,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k); void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); + void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k); @@ -280,6 +281,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); @@ -300,6 +302,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -321,6 +324,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -330,26 +334,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -// // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") -// -size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); + +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum ggml_type type); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5a1b3f47..d41aa7d2 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -4102,45 +4102,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool } static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { - std::vector hist_cur(1 << 4, 0); - - switch(quant) { - case GGML_TYPE_F32: - memcpy(to, from, sizeof(float) * ne); - break; - case GGML_TYPE_Q4_0: - ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q4_1: - ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q5_0: - ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q5_1: - ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q8_0: - ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q2_K: - ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q3_K: - ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q4_K: - ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q5_K: - ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q6_K: - ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data()); - break; - default: - GGML_ASSERT(false); - } + ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); } static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { diff --git a/ggml.c b/ggml.c index 6eff98ab..80efa6f2 100644 --- a/ggml.c +++ b/ggml.c @@ -20159,133 +20159,6 @@ void ggml_quantize_free(void) { ggml_critical_section_end(); } -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; - - quantize_row_q4_0_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK4_0; j += 2) { - const uint8_t vi0 = y[i].qs[j/2] & 0x0F; - const uint8_t vi1 = y[i].qs[j/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_0*sizeof(block_q4_0)); -} - -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; - - quantize_row_q4_1_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK4_1; j += 2) { - const uint8_t vi0 = y[i].qs[j/2] & 0x0F; - const uint8_t vi1 = y[i].qs[j/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_1*sizeof(block_q4_1)); -} - -size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK5_0 == 0); - const int nb = k / QK5_0; - - for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; - - quantize_row_q5_0_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, &y[i].qh, sizeof(qh)); - - for (int j = 0; j < QK5_0; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); - - // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK5_0*sizeof(block_q5_0)); -} - -size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK5_1 == 0); - const int nb = k / QK5_1; - - for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; - - quantize_row_q5_1_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, &y[i].qh, sizeof(qh)); - - for (int j = 0; j < QK5_1; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); - - // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK5_1*sizeof(block_q5_1)); -} - -size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; - - quantize_row_q8_0_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK8_0; ++j) { - const int8_t vi = y[i].qs[j]; - - hist[vi/16 + 8]++; - } - } - } - - return (n/QK8_0*sizeof(block_q8_0)); -} - bool ggml_quantize_requires_imatrix(enum ggml_type type) { return type == GGML_TYPE_IQ2_XXS || @@ -20293,177 +20166,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) { type == GGML_TYPE_IQ1_S; } -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, - int nrows, int n_per_row, int64_t * hist, const float * imatrix) { +size_t ggml_quantize_chunk( + enum ggml_type type, + const float * src, + void * dst, + int start, + int nrows, + int n_per_row, + const float * imatrix) { + const int n = nrows * n_per_row; + + if (ggml_quantize_requires_imatrix(type)) { + GGML_ASSERT(imatrix != NULL); + } + + GGML_ASSERT(start % type_traits[type].blck_size == 0); + GGML_ASSERT(start % n_per_row == 0); + ggml_quantize_init(type); // this is noop if already initialized + + const size_t start_row = start / n_per_row; + const size_t row_size = ggml_row_size(type, n_per_row); + size_t result = 0; - int n = nrows * n_per_row; + switch (type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(start % QK4_0 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(start % QK4_1 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q5_0: - { - GGML_ASSERT(start % QK5_0 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q5_1: - { - GGML_ASSERT(start % QK5_1 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(start % QK8_0 == 0); - block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; - result = ggml_quantize_q8_0(src + start, block, n, n, hist); - } break; - case GGML_TYPE_Q2_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q3_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q4_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q5_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q6_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ2_XXS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - GGML_ASSERT(imatrix); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ2_XS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - GGML_ASSERT(imatrix); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ3_XXS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ3_S: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ2_S: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ1_S: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ4_NL: + case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; #if QK_K == 64 - case GGML_TYPE_IQ4_XS: -#endif - { - GGML_ASSERT(start % QK4_NL == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; -#if QK_K != 64 - case GGML_TYPE_IQ4_XS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; + case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; +#else + case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; #endif case GGML_TYPE_F16: { @@ -20480,6 +20228,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i default: assert(false); } + + GGML_ASSERT(result == nrows * row_size); + return result; } diff --git a/ggml.h b/ggml.h index a13b0cec..1171088a 100644 --- a/ggml.h +++ b/ggml.h @@ -2194,25 +2194,18 @@ extern "C" { GGML_API void ggml_quantize_init(enum ggml_type type); GGML_API void ggml_quantize_free(void); - // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk - GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); - - GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - // some quantization type cannot be used without an importance matrix GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type); // calls ggml_quantize_init internally (i.e. can allocate memory) - GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, - int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix); + GGML_API size_t ggml_quantize_chunk( + enum ggml_type type, + const float * src, + void * dst, + int start, + int nrows, + int n_per_row, + const float * imatrix); // // gguf