// whisper.cpp/ggml-cuda/quantize.cuh

#pragma once

#include "common.cuh"
#include "mmq.cuh"

#include <cstdint>

// Block-size constant for the CUDA quantization code.
// NOTE(review): presumably the threads-per-block used when launching the
// quantization kernels; no launch site is visible in this header, so confirm
// against the definitions of the functions declared below.
#define CUDA_QUANTIZE_BLOCK_SIZE 256

// Pointer-to-function type for a quantization launcher.
// quantize_row_q8_1_cuda and quantize_mmq_q8_1_cuda (declared in this header)
// both have exactly this signature, so either can be stored in a
// quantize_cuda_t and invoked through it.
// NOTE(review): parameter meanings are not established by this header; see the
// definitions of the functions above for the authoritative contract.
using quantize_cuda_t = void (*)(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);

// Declaration only; the definition is not part of this header.
// Carries no CUDA execution-space qualifier, so it is an ordinary host function.
// NOTE(review): the per-parameter notes are inferred from names and types
// alone; confirm them against the definition before relying on them.
void quantize_row_q8_1_cuda(
    const float * x,           // read-only float data (presumably the values to quantize)
    void * vy,                 // untyped, writable buffer (presumably receives the q8_1 output)
    const int64_t kx0,         // presumably the extent of x along its first dimension - TODO confirm
    const int64_t kx1,         // presumably the extent of x along its second dimension - TODO confirm
    const int64_t channels,    // presumably the number of channels - TODO confirm
    const int64_t kx0_padded,  // presumably kx0 after padding - TODO confirm
    const ggml_type type_x,    // a ggml_type value; its role is not visible here - TODO confirm
    cudaStream_t stream);      // CUDA stream handle (presumably the stream the work is enqueued on)

// Declaration only; the definition is not part of this header.
// Carries no CUDA execution-space qualifier, so it is an ordinary host function.
// Its parameter list is identical to that of quantize_row_q8_1_cuda.
// NOTE(review): the name suggests a q8_1 variant intended for the mul_mat_q
// (MMQ) path, and the per-parameter notes are inferred from names and types
// alone; confirm both against the definition.
void quantize_mmq_q8_1_cuda(
    const float * x,           // read-only float data (presumably the values to quantize)
    void * vy,                 // untyped, writable buffer (presumably receives the quantized output)
    const int64_t kx0,         // presumably the extent of x along its first dimension - TODO confirm
    const int64_t kx1,         // presumably the extent of x along its second dimension - TODO confirm
    const int64_t channels,    // presumably the number of channels - TODO confirm
    const int64_t kx0_padded,  // presumably kx0 after padding - TODO confirm
    const ggml_type type_x,    // a ggml_type value; its role is not visible here - TODO confirm
    cudaStream_t stream);      // CUDA stream handle (presumably the stream the work is enqueued on)
// Revision history for this header (recovered from a blame listing):
//   2024-03-27 16:55:10 +00:00  sync : ggml (#2001)
//       (sync : update scripts; sync : ggml; talk-llama : sync llama.cpp;
//        make : WHISPER_CUBLAS -> WHISPER_CUDA; ci : try to fix sycl build;
//        talk-llama : fix make build)
//       Lines: `#include "common.cuh"`, `#define CUDA_QUANTIZE_BLOCK_SIZE 256`.
//   2024-06-09 07:42:25 +00:00  CUDA: revise q8_1 data layout for mul_mat_q (llama/7824)
//       Lines: `#pragma once`, `#include "mmq.cuh"`, `#include <cstdint>`,
//       the quantize_cuda_t function-pointer type, and the declarations of
//       quantize_row_q8_1_cuda and quantize_mmq_q8_1_cuda.