whisper.cpp/examples/common-ggml.cpp

#include "common-ggml.h"

#include <regex>
#include <map>

// Lookup table from the lowercase quantization names accepted on the command line
// (see ggml_parse_ftype) to the corresponding ggml_ftype value.
// ggml_print_ftypes iterates this same table to list the supported types.
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
};

// Writes one line per entry of GGML_FTYPE_MAP to `fp`, showing the textual
// name of the ftype and the integer value that can be passed instead of it.
void ggml_print_ftypes(FILE * fp) {
    for (const auto & entry : GGML_FTYPE_MAP) {
        fprintf(fp, "  type = \"%s\" or %d\n", entry.first.c_str(), entry.second);
    }
}

// Parses a quantization type given either by name ("q4_0", "q5_k", ...) or by
// its numeric ggml_ftype value.
//
//   str - NUL-terminated user input. Strings starting with 'q' are looked up in
//         GGML_FTYPE_MAP; anything else is parsed as a base-10 integer.
//
// Returns the parsed ftype. Returns GGML_FTYPE_UNKNOWN (after logging to stderr)
// when the input is null, empty, an unknown name, or not a clean integer.
// A numeric value is NOT checked against the set of known enumerators here.
enum ggml_ftype ggml_parse_ftype(const char * str) {
    if (str == nullptr || str[0] == '\0') {
        fprintf(stderr, "%s: missing ftype\n", __func__);
        return GGML_FTYPE_UNKNOWN;
    }

    if (str[0] == 'q') {
        const auto it = GGML_FTYPE_MAP.find(str);
        if (it == GGML_FTYPE_MAP.end()) {
            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
            return GGML_FTYPE_UNKNOWN;
        }
        return it->second;
    }

    // strtol (unlike atoi) reports where parsing stopped, so garbage such as
    // "abc" or "7abc" is rejected instead of silently becoming 0 or 7.
    char * end = nullptr;
    const long value = strtol(str, &end, 10);

    const bool no_digits     = (end == str);
    const bool trailing_junk = (*end != '\0');
    const bool out_of_range  = (value != (long) (int) value);

    if (no_digits || trailing_junk || out_of_range) {
        fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
        return GGML_FTYPE_UNKNOWN;
    }

    return (enum ggml_ftype) (int) value;
}

// Compiles every pattern once, so the per-tensor loop does not rebuild std::regex objects.
static std::vector<std::regex> ggml_common_compile_patterns(const std::vector<std::string> & patterns) {
    std::vector<std::regex> compiled;
    compiled.reserve(patterns.size());
    for (const auto & p : patterns) {
        compiled.emplace_back(p);
    }
    return compiled;
}

// Returns true when `name` fully matches (std::regex_match) at least one compiled pattern.
static bool ggml_common_matches_any(const std::string & name, const std::vector<std::regex> & patterns) {
    for (const auto & re : patterns) {
        if (std::regex_match(name, re)) {
            return true;
        }
    }
    return false;
}

// Copies tensor records from `finp` to `fout`, quantizing the selected ones.
//
// Each record is read as: int32 n_dims, int32 name length, int32 tensor type,
// n_dims x int32 dimensions, the name bytes, then the tensor data. Records are
// consumed from the current position of `finp` until end of file.
//
//   finp     - input stream positioned at the first tensor record
//   fout     - output stream receiving the (possibly quantized) records
//   ftype    - target file type; only the Q4_0/Q4_1/Q5_0/Q5_1/Q8_0 and Q2_K..Q6_K
//              variants are accepted
//   to_quant - regex patterns; a tensor whose full name matches any of them is a
//              candidate for quantization
//   to_skip  - regex patterns; a tensor whose full name matches any of them is
//              never quantized (takes precedence over to_quant)
//
// Only non-empty 2D tensors stored as F32 or F16 are quantized; everything else
// is copied through unchanged.
//
// Returns true on success. Returns false (after logging to stderr) when `ftype`
// is not a supported quantization type, when a tensor selected for quantization
// is not F32/F16, when the input is malformed or truncated, or when writing to
// `fout` fails. An invalid pattern makes the std::regex constructor throw.
bool ggml_common_quantize_0(
        std::ifstream & finp,
        std::ofstream & fout,
        const ggml_ftype ftype,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip) {

    ggml_type qtype = GGML_TYPE_F32;

    // No `default:` label on purpose: listing every enumerator lets the compiler
    // warn when a new ftype is added and not handled here.
    switch (ftype) {
        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
        case GGML_FTYPE_MOSTLY_IQ2_XXS:
        case GGML_FTYPE_MOSTLY_IQ2_XS:
        case GGML_FTYPE_MOSTLY_IQ2_S:
        case GGML_FTYPE_MOSTLY_IQ3_XXS:
        case GGML_FTYPE_MOSTLY_IQ3_S:
        case GGML_FTYPE_MOSTLY_IQ1_S:
        case GGML_FTYPE_MOSTLY_IQ4_NL:
        case GGML_FTYPE_MOSTLY_IQ4_XS:
        case GGML_FTYPE_MOSTLY_IQ1_M:
        case GGML_FTYPE_MOSTLY_BF16:
        case GGML_FTYPE_MOSTLY_Q4_0_4_4:
        case GGML_FTYPE_MOSTLY_Q4_0_4_8:
        case GGML_FTYPE_MOSTLY_Q4_0_8_8:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
                }
    };

    // Also catches an ftype value that matched none of the case labels above
    // (qtype is then still GGML_TYPE_F32).
    if (!ggml_is_quantized(qtype)) {
        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
        return false;
    }

    // The patterns do not change between tensors: compile them once up front.
    const std::vector<std::regex> re_quant = ggml_common_compile_patterns(to_quant);
    const std::vector<std::regex> re_skip  = ggml_common_compile_patterns(to_skip);

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    // Output buffer for ggml_quantize_chunk, sized as nelements floats.
    // NOTE(review): presumably the quantized data never exceeds the F32 size - confirm.
    std::vector<float> work;

    // Scratch buffers reused across tensors to avoid reallocating per record.
    std::vector<uint8_t>     data_u8;
    std::vector<ggml_fp16_t> data_f16;
    std::vector<float>       data_f32;

    while (true) {
        int32_t n_dims = 0;
        int32_t length = 0;
        int32_t ttype  = 0;

        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

        // hitting EOF while reading a record header is the normal end of input
        if (finp.eof()) {
            break;
        }

        if (!finp) {
            fprintf(stderr, "%s: failed to read tensor header\n", __func__);
            return false;
        }

        // n_dims indexes the fixed-size ne[4] array below, so it must be validated
        if (n_dims < 0 || n_dims > 4) {
            fprintf(stderr, "%s: invalid number of dimensions %d\n", __func__, n_dims);
            return false;
        }

        if (length < 0) {
            fprintf(stderr, "%s: invalid tensor name length %d\n", __func__, length);
            return false;
        }

        // accumulate in 64 bits so that an oversized product is detected instead of overflowing
        int64_t nelements_wide = 1;
        int32_t ne[4] = { 1, 1, 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
            if (!finp || ne[i] < 0) {
                fprintf(stderr, "%s: invalid or truncated tensor dimensions\n", __func__);
                return false;
            }
            nelements_wide *= ne[i];
            if (nelements_wide > INT32_MAX) {
                fprintf(stderr, "%s: tensor has too many elements\n", __func__);
                return false;
            }
        }
        const int32_t nelements = (int32_t) nelements_wide;

        std::string name(length, 0);
        finp.read (&name[0], length);
        if (!finp) {
            fprintf(stderr, "%s: failed to read tensor name\n", __func__);
            return false;
        }

        printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));

        // quantize when some to_quant pattern matches and no to_skip pattern does
        bool quantize = ggml_common_matches_any(name, re_quant) && !ggml_common_matches_any(name, re_skip);

        // quantize only 2D tensors; an empty one would make nelements/ne[0] divide by zero
        quantize = quantize && (n_dims == 2) && (nelements > 0);

        if (quantize) {
            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                return false;
            }

            // the quantizer consumes F32, so F16 input is widened first
            if (ttype == GGML_TYPE_F16) {
                data_f16.resize(nelements);
                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                data_f32.resize(nelements);
                for (int i = 0; i < nelements; ++i) {
                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                }
            } else {
                data_f32.resize(nelements);
                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
            }

            // the record written below advertises the quantized type
            ttype = qtype;
        } else {
            // NOTE(review): every type other than 0 is treated as 2 bytes per element here;
            // that is only right for 16-bit types - confirm inputs never hold other types.
            const size_t bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);

            data_u8.resize(nelements*bpe);
            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
        }

        // a short read means the input is truncated: do not emit a garbage tensor
        if (!finp) {
            fprintf(stderr, "%s: failed to read data for tensor '%s'\n", __func__, name.c_str());
            return false;
        }

        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
        for (int i = 0; i < n_dims; ++i) {
            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        }
        fout.write(&name[0], length);

        if (quantize) {
            work.resize(nelements); // for quantization

            size_t cur_size = 0;
            // exhaustive on purpose (no `default:`), see the ftype switch above
            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q5_0:
                case GGML_TYPE_Q5_1:
                case GGML_TYPE_Q8_0:
                case GGML_TYPE_Q2_K:
                case GGML_TYPE_Q3_K:
                case GGML_TYPE_Q4_K:
                case GGML_TYPE_Q5_K:
                case GGML_TYPE_Q6_K:
                    {
                        // the tensor is passed as nelements/ne[0] rows of ne[0] values each;
                        // the returned value is used as the number of bytes to write
                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_I8:
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
                case GGML_TYPE_I64:
                case GGML_TYPE_F64:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_Q8_K:
                case GGML_TYPE_IQ2_XXS:
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
                case GGML_TYPE_IQ3_XXS:
                case GGML_TYPE_IQ3_S:
                case GGML_TYPE_IQ1_S:
                case GGML_TYPE_IQ4_NL:
                case GGML_TYPE_IQ4_XS:
                case GGML_TYPE_IQ1_M:
                case GGML_TYPE_BF16:
                case GGML_TYPE_Q4_0_4_4:
                case GGML_TYPE_Q4_0_4_8:
                case GGML_TYPE_Q4_0_8_8:
                case GGML_TYPE_TQ1_0:
                case GGML_TYPE_TQ2_0:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                        return false;
                    }
            }

            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
            total_size_new += cur_size;

            printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
        } else {
            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
            total_size_new += data_u8.size();
        }

        if (!fout) {
            fprintf(stderr, "%s: failed to write tensor '%s'\n", __func__, name.c_str());
            return false;
        }

        // the "original" size is counted as 4 bytes per element regardless of the stored type
        total_size_org += nelements * sizeof(float);
    }

    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

    return true;
}
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`#include "common-ggml.h"`

			`#include <regex>`
			`#include <map>`

			`static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {`
			`{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},`
			`{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},`
			`{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},`
			`{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},`
			`{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},`
quantize : add support for K-quant types 2023-11-16 14:18:24 +00:00			`{"q2_k", GGML_FTYPE_MOSTLY_Q2_K},`
			`{"q3_k", GGML_FTYPE_MOSTLY_Q3_K},`
			`{"q4_k", GGML_FTYPE_MOSTLY_Q4_K},`
			`{"q5_k", GGML_FTYPE_MOSTLY_Q5_K},`
			`{"q6_k", GGML_FTYPE_MOSTLY_Q6_K},`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`};`

			`void ggml_print_ftypes(FILE * fp) {`
			`for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {`
			`fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);`
			`}`
			`}`

			`enum ggml_ftype ggml_parse_ftype(const char * str) {`
			`enum ggml_ftype ftype;`
			`if (str[0] == 'q') {`
			`const auto it = GGML_FTYPE_MAP.find(str);`
			`if (it == GGML_FTYPE_MAP.end()) {`
			`fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);`
			`return GGML_FTYPE_UNKNOWN;`
			`}`
			`ftype = it->second;`
			`} else {`
			`ftype = (enum ggml_ftype) atoi(str);`
			`}`

			`return ftype;`
			`}`

			`bool ggml_common_quantize_0(`
			`std::ifstream & finp,`
			`std::ofstream & fout,`
			`const ggml_ftype ftype,`
			`const std::vector<std::string> & to_quant,`
			`const std::vector<std::string> & to_skip) {`

			`ggml_type qtype = GGML_TYPE_F32;`

			`switch (ftype) {`
			`case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;`
			`case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;`
			`case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;`
			`case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;`
			`case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;`
quantize : add support for K-quant types 2023-11-16 14:18:24 +00:00			`case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;`
			`case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;`
			`case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;`
			`case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;`
			`case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`case GGML_FTYPE_UNKNOWN:`
			`case GGML_FTYPE_ALL_F32:`
			`case GGML_FTYPE_MOSTLY_F16:`
			`case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:`
sync : llama.cpp 2024-01-11 19:49:13 +00:00			`case GGML_FTYPE_MOSTLY_IQ2_XXS:`
			`case GGML_FTYPE_MOSTLY_IQ2_XS:`
sync : llama.cpp (ggml/0) 2024-02-28 10:59:11 +00:00			`case GGML_FTYPE_MOSTLY_IQ2_S:`
sync : ggml (#0) 2024-01-30 19:30:26 +00:00			`case GGML_FTYPE_MOSTLY_IQ3_XXS:`
sync : llama.cpp (ggml/0) 2024-02-25 17:58:06 +00:00			`case GGML_FTYPE_MOSTLY_IQ3_S:`
common : add IQ1_S (ggml/0) ggml-ci 2024-02-19 13:27:37 +00:00			`case GGML_FTYPE_MOSTLY_IQ1_S:`
sync : llama.cpp (ggml/0) ggml-ci 2024-02-21 14:19:39 +00:00			`case GGML_FTYPE_MOSTLY_IQ4_NL:`
sync : llama.cpp (ggml/0) 2024-02-28 10:59:11 +00:00			`case GGML_FTYPE_MOSTLY_IQ4_XS:`
sync : ggml (#2001) * sync : update scripts * sync : ggml * talk-llama : sync llama.cpp * make : WHISPER_CUBLAS -> WHISPER_CUDA * ci : try to fix sycl build * talk-llama : fix make build 2024-03-27 16:55:10 +00:00			`case GGML_FTYPE_MOSTLY_IQ1_M:`
ggml : resolve merge (ggml/0) ggml-ci 2024-05-11 13:25:50 +00:00			`case GGML_FTYPE_MOSTLY_BF16:`
common : handle new quant types (ggml/0) 2024-07-27 14:17:04 +00:00			`case GGML_FTYPE_MOSTLY_Q4_0_4_4:`
			`case GGML_FTYPE_MOSTLY_Q4_0_4_8:`
			`case GGML_FTYPE_MOSTLY_Q4_0_8_8:`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`{`
			`fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);`
			`return false;`
			`}`
			`};`

			`if (!ggml_is_quantized(qtype)) {`
			`fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));`
			`return false;`
			`}`

			`size_t total_size_org = 0;`
			`size_t total_size_new = 0;`

			`std::vector<float> work;`

			`std::vector<uint8_t> data_u8;`
			`std::vector<ggml_fp16_t> data_f16;`
			`std::vector<float> data_f32;`

			`while (true) {`
			`int32_t n_dims;`
			`int32_t length;`
			`int32_t ttype;`

			`finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));`
			`finp.read(reinterpret_cast<char *>(&length), sizeof(length));`
			`finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));`

			`if (finp.eof()) {`
			`break;`
			`}`

			`int32_t nelements = 1;`
whisper : fix quantize bug (#842) * whisper : debug * whisper : fix bug during quantization 2023-04-30 19:50:04 +00:00			`int32_t ne[4] = { 1, 1, 1, 1 };`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`for (int i = 0; i < n_dims; ++i) {`
			`finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));`
			`nelements *= ne[i];`
			`}`

			`std::string name(length, 0);`
			`finp.read (&name[0], length);`

whisper : fix quantize bug (#842) * whisper : debug * whisper : fix bug during quantization 2023-04-30 19:50:04 +00:00			`printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00
			`bool quantize = false;`

			`// check if we should quantize this tensor`
			`for (const auto & s : to_quant) {`
			`if (std::regex_match(name, std::regex(s))) {`
			`quantize = true;`
			`break;`
			`}`
			`}`

			`// check if we should skip this tensor`
			`for (const auto & s : to_skip) {`
			`if (std::regex_match(name, std::regex(s))) {`
			`quantize = false;`
			`break;`
			`}`
			`}`

			`// quantize only 2D tensors`
			`quantize &= (n_dims == 2);`

			`if (quantize) {`
			`if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {`
			`fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));`
			`return false;`
			`}`

			`if (ttype == GGML_TYPE_F16) {`
			`data_f16.resize(nelements);`
			`finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));`
			`data_f32.resize(nelements);`
			`for (int i = 0; i < nelements; ++i) {`
			`data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);`
			`}`
			`} else {`
			`data_f32.resize(nelements);`
			`finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));`
			`}`

			`ttype = qtype;`
			`} else {`
			`const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);`

			`data_u8.resize(nelements*bpe);`
			`finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);`
			`}`

			`fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));`
			`fout.write(reinterpret_cast<char *>(&length), sizeof(length));`
			`fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));`
			`for (int i = 0; i < n_dims; ++i) {`
			`fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));`
			`}`
			`fout.write(&name[0], length);`

			`if (quantize) {`
			`work.resize(nelements); // for quantization`

			`size_t cur_size = 0;`
			`switch ((ggml_type) ttype) {`
			`case GGML_TYPE_Q4_0:`
			`case GGML_TYPE_Q4_1:`
			`case GGML_TYPE_Q5_0:`
			`case GGML_TYPE_Q5_1:`
			`case GGML_TYPE_Q8_0:`
quantize : add support for K-quant types 2023-11-16 14:18:24 +00:00			`case GGML_TYPE_Q2_K:`
			`case GGML_TYPE_Q3_K:`
			`case GGML_TYPE_Q4_K:`
			`case GGML_TYPE_Q5_K:`
			`case GGML_TYPE_Q6_K:`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`{`
update examples and tests 2024-03-14 15:45:27 +00:00			`cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`} break;`
			`case GGML_TYPE_F32:`
			`case GGML_TYPE_F16:`
			`case GGML_TYPE_I8:`
			`case GGML_TYPE_I16:`
			`case GGML_TYPE_I32:`
sync : ggml (#2001) * sync : update scripts * sync : ggml * talk-llama : sync llama.cpp * make : WHISPER_CUBLAS -> WHISPER_CUDA * ci : try to fix sycl build * talk-llama : fix make build 2024-03-27 16:55:10 +00:00			`case GGML_TYPE_I64:`
			`case GGML_TYPE_F64:`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`case GGML_TYPE_Q8_1:`
ggml : sync latest ggml lib 2023-06-25 11:22:21 +00:00			`case GGML_TYPE_Q8_K:`
sync : ggml 2024-01-11 19:54:17 +00:00			`case GGML_TYPE_IQ2_XXS:`
			`case GGML_TYPE_IQ2_XS:`
sync : llama.cpp (ggml/0) 2024-02-28 10:59:11 +00:00			`case GGML_TYPE_IQ2_S:`
sync : ggml (#0) 2024-01-30 19:30:26 +00:00			`case GGML_TYPE_IQ3_XXS:`
sync : llama.cpp (ggml/0) 2024-02-25 17:58:06 +00:00			`case GGML_TYPE_IQ3_S:`
common : add IQ1_S (ggml/0) ggml-ci 2024-02-19 13:27:37 +00:00			`case GGML_TYPE_IQ1_S:`
sync : llama.cpp (ggml/0) ggml-ci 2024-02-21 14:19:39 +00:00			`case GGML_TYPE_IQ4_NL:`
sync : llama.cpp (ggml/0) 2024-02-28 10:59:11 +00:00			`case GGML_TYPE_IQ4_XS:`
sync : ggml (#2001) * sync : update scripts * sync : ggml * talk-llama : sync llama.cpp * make : WHISPER_CUBLAS -> WHISPER_CUDA * ci : try to fix sycl build * talk-llama : fix make build 2024-03-27 16:55:10 +00:00			`case GGML_TYPE_IQ1_M:`
ggml : resolve merge (ggml/0) ggml-ci 2024-05-11 13:25:50 +00:00			`case GGML_TYPE_BF16:`
common : handle new quant types (ggml/0) 2024-07-27 14:17:04 +00:00			`case GGML_TYPE_Q4_0_4_4:`
			`case GGML_TYPE_Q4_0_4_8:`
			`case GGML_TYPE_Q4_0_8_8:`
examples : adapt to ggml.h changes (ggml/0) ggml-ci 2024-09-20 18:50:16 +00:00			`case GGML_TYPE_TQ1_0:`
			`case GGML_TYPE_TQ2_0:`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`case GGML_TYPE_COUNT:`
			`{`
			`fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));`
			`return false;`
			`}`
			`}`

			`fout.write(reinterpret_cast<char *>(work.data()), cur_size);`
			`total_size_new += cur_size;`

update examples and tests 2024-03-14 15:45:27 +00:00			`printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);`
whisper : add integer quantization support (#540) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples 2023-04-30 15:51:57 +00:00			`} else {`
			`printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);`
			`fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());`
			`total_size_new += data_u8.size();`
			`}`

			`total_size_org += nelements * sizeof(float);`
			`}`

			`printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);`
			`printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));`

			`return true;`
			`}`