diff --git a/.gitignore b/.gitignore index 8301c12b..ebb73586 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ .DS_Store .vimspector.json /CMakeSettings.json +/talk-llama.dSYM/ build/ build-*/ diff --git a/Makefile b/Makefile index 547ef3f4..32b7cbb1 100644 --- a/Makefile +++ b/Makefile @@ -512,9 +512,6 @@ ifdef GGML_CUDA OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(OBJ_CUDA_TMPL) - - #OBJ_WHISPER += src/whisper-mel-cuda.o - ifdef WHISPER_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # WHISPER_FATAL_WARNINGS @@ -623,10 +620,6 @@ ggml/src/ggml-cuda.o: \ ggml/src/ggml-common.h \ $(wildcard ggml/src/ggml-cuda/*.cuh) $(NVCC_COMPILE) - -#src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp -# $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ - endif # GGML_CUDA ifdef GGML_VULKAN @@ -955,7 +948,6 @@ $(LIB_GGML_S): \ src/whisper.o: \ src/whisper.cpp \ - src/whisper-mel.hpp \ include/whisper.h \ ggml/include/ggml.h \ ggml/include/ggml-alloc.h \ diff --git a/bindings/ruby/ext/extconf.rb b/bindings/ruby/ext/extconf.rb index f19ee05f..55189282 100644 --- a/bindings/ruby/ext/extconf.rb +++ b/bindings/ruby/ext/extconf.rb @@ -1,7 +1,6 @@ require 'mkmf' system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .") -system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6ac22558..ff54d645 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -78,43 +78,13 @@ if (WHISPER_OPENVINO) set_target_properties(${TARGET} PROPERTIES FOLDER "libs") endif() -#if (GGML_CUDA) -# cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES -# -# find_package(CUDAToolkit) -# if (CUDAToolkit_FOUND) -# message(STATUS "CUDA found") -# -# if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) -# # 52 == lowest CUDA 12 standard -# # 60 == f16 CUDA intrinsics -# # 61 == integer CUDA intrinsics -# # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster -# set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics -# endif() -# message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") -# -# enable_language(CUDA) -# else() -# message(WARNING "CUDA not found") -# endif() -#endif() - # whisper add_library(whisper ../include/whisper.h whisper.cpp - whisper-mel.hpp ) -# TODO: disabled because it relies on ggml internals that are no longer accessible (ggml-backend-impl.h, ggml-cuda/common.cuh, ..) 
-#if (GGML_CUDA)
-#    target_sources(whisper PRIVATE whisper-mel-cuda.cu)
-#
-#    target_link_libraries(whisper PRIVATE CUDA::cufft)
-#endif()
-
 # Set the version numbers
 set_target_properties(whisper PROPERTIES
     VERSION ${PROJECT_VERSION}
diff --git a/src/whisper-mel-cuda.cu b/src/whisper-mel-cuda.cu
deleted file mode 100644
index 85ea23e6..00000000
--- a/src/whisper-mel-cuda.cu
+++ /dev/null
@@ -1,363 +0,0 @@
-#define CUB_IGNORE_DEPRECATED_CPP_DIALECT
-#include "whisper-mel-cuda.hpp"
-#include "whisper.h"
-
-#include <ggml-cuda/common.cuh>
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cufft.h>
-#include <cublas_v2.h>
-#include <cuComplex.h>
-#include <cub/device/device_reduce.cuh>
-#include <device_launch_parameters.h>
-
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4324) // added padding
-#endif
-
-namespace {
-
-static const char* cufftGetErrorString(cufftResult_t res) {
-    switch (res) {
-        case CUFFT_SUCCESS: return "The cuFFT operation was successful";
-        case CUFFT_INVALID_PLAN: return "cuFFT was passed an invalid plan handle";
-        case CUFFT_ALLOC_FAILED: return "cuFFT failed to allocate GPU or CPU memory";
-        case CUFFT_INVALID_TYPE: return "No longer used";
-        case CUFFT_INVALID_VALUE: return "User specified an invalid pointer or parameter";
-        case CUFFT_INTERNAL_ERROR: return "Driver or internal cuFFT library error";
-        case CUFFT_EXEC_FAILED: return "Failed to execute an FFT on the GPU";
-        case CUFFT_SETUP_FAILED: return "The cuFFT library failed to initialize";
-        case CUFFT_INVALID_SIZE: return "User specified an invalid transform size";
-        case CUFFT_UNALIGNED_DATA: return "No longer used";
-        case CUFFT_INCOMPLETE_PARAMETER_LIST: return "Missing parameters in call";
-        case CUFFT_INVALID_DEVICE: return "Execution of a plan was on different GPU than plan creation";
-        case CUFFT_PARSE_ERROR: return "Internal plan database error";
-        case CUFFT_NO_WORKSPACE: return "No workspace has been provided prior to plan execution";
-        case CUFFT_NOT_IMPLEMENTED: return "Function does not implement functionality for parameters given.";
-        case CUFFT_LICENSE_ERROR: return "Used in previous versions.";
-        case CUFFT_NOT_SUPPORTED: return "Operation is not supported for parameters given.";
-        default: return "Unknown error";
-    }
-}
-
-#define CUFFT_CHECK(err) CUDA_CHECK_GEN(err, CUFFT_SUCCESS, cufftGetErrorString)
-
-__global__ void k_fill_stft_input(
-    const float * padded_samples,
-    const int n_frames,
-    const float * hann_window,
-    float * stft_in
-) {
-    auto y = blockIdx.y * blockDim.y + threadIdx.y;
-    // if (y >= n_frames) return;
-    auto x = blockIdx.x * blockDim.x + threadIdx.x;
-    // if (x >= WHISPER_N_FFT) return;
-
-    auto line = padded_samples + y * WHISPER_HOP_LENGTH;
-    auto outLine = stft_in + y * WHISPER_N_FFT;
-
-    outLine[x] = line[x] * hann_window[x];
-}
-
-__global__ void k_calc_magnitudes(
-    const cuComplex * stft_out,
-    const int n_frames,
-    float * magnitudes
-) {
-    auto y = blockIdx.y * blockDim.y + threadIdx.y;
-    // if (y >= n_frames) return;
-    auto x = blockIdx.x * blockDim.x + threadIdx.x;
-    // if (x >= WHISPER_N_FFT_HALF) return;
-
-    auto idx = y * WHISPER_N_FFT_HALF + x;
-
-    auto r = stft_out[idx].x;
-    auto i = stft_out[idx].y;
-    magnitudes[idx] = r * r + i * i;
-}
-
-__global__ void k_calc_log_mel(
-    const float * mel_data,
-    const int n_mel,
-    const float * max_val,
-    float * log_mel
-) {
-    auto x = blockIdx.x * blockDim.x + threadIdx.x;
-    if (x >= n_mel) return;
-
-    float val = mel_data[x];
-
-    constexpr float e = 1e-10f;
-    if (val < e) val = e;
-
-    val = log10(val);
-
-    const float max = log10(*max_val) - 8.f;
-    if (val < max) val = max;
-
-    log_mel[x] = (val + 4) / 4;
-}
-
-static void fill_stft_input(
-    const float * padded_samples,
-    int n_frames,
-    const float * hann_window,
-    float * stft_in,
-    cudaStream_t stream
-) {
-    dim3 block(WHISPER_N_FFT, 1);
-    dim3 grid(1, n_frames);
-
-    k_fill_stft_input<<<grid, block, 0, stream>>>(padded_samples, n_frames, hann_window, stft_in);
-}
-
-static void calc_magnitudes(
-    const cuComplex * stft_out,
-    int n_frames,
-    float * magnitudes,
-    cudaStream_t stream
-) {
-    dim3 block(WHISPER_N_FFT_HALF, 1);
-    dim3 grid(1, n_frames);
-    k_calc_magnitudes<<<grid, block, 0, stream>>>(stft_out, n_frames, magnitudes);
-}
-
-constexpr auto LOG_MEL_PREFIX_SIZE = 256;
-
-static void calc_log_mel(
-    const float * mel_data,
-    int n_mel,
-    void * tempStorage,
-    int tempStorageSize,
-    float * log_mel,
-    cudaStream_t stream
-) {
-    float * max_val = reinterpret_cast<float *>(tempStorage);
-    void * maxTemp = reinterpret_cast<char *>(tempStorage) + LOG_MEL_PREFIX_SIZE;
-
-    size_t nbytes = size_t(tempStorageSize - LOG_MEL_PREFIX_SIZE);
-    cub::DeviceReduce::Max(maxTemp, nbytes, mel_data, max_val, n_mel, stream);
-
-    int block = 256;
-    int grid = (n_mel + block - 1) / block;
-
-    k_calc_log_mel<<<grid, block, 0, stream>>>(mel_data, n_mel, max_val, log_mel);
-}
-
-class mel_calc_cuda : public whisper_mel_calc {
-    const int m_n_mel;
-
-    ggml_backend_t m_backend = nullptr;
-    int m_device = -1;
-
-    cudaStream_t m_stream = nullptr;
-    cublasHandle_t m_cublas_handle = nullptr;
-
-    float * m_hann_window = nullptr;
-
-    float * m_filters = nullptr;
-
-    // max samples for which we have allocated memory for the temp working areas below (cufft, log_mel)
-    int m_n_max_samples = 0;
-
-    size_t m_cufft_workspace_size = 0;
-    void * m_cufft_workspace = nullptr;
-
-    size_t m_log_mel_temp_storage_size = 0;
-    void * m_log_mel_temp_storage = nullptr;
-public:
-    mel_calc_cuda(ggml_backend_t backend, const whisper_filters & filters)
-        : m_n_mel(filters.n_mel)
-        , m_backend(backend)
-    {
-        ggml_backend_cuda_context* cuda_ctx = (ggml_backend_cuda_context*)m_backend->context;
-        m_device = cuda_ctx->device;
-
-        if (ggml_cuda_info().devices[m_device].cc < 600) {
-            // we've only tested on 6.0 and higher and we've had reports of crashes on 5.0:
-            // https://github.com/ggerganov/whisper.cpp/issues/2230
-            // to be safe forbid anything below 6.0
-            throw std::runtime_error("CUDA compute capability 6.0 or higher is required");
-        }
-
-        ggml_cuda_set_device(m_device);
-
-        if (filters.n_fft != WHISPER_N_FFT_HALF) {
-            throw std::invalid_argument("MelFilters n_frames must be WHISPER_N_FFT_HALF");
-        }
-        assert(filters.data.size() == filters.n_mel * WHISPER_N_FFT_HALF);
-
-        CUDA_CHECK(cudaStreamCreate(&m_stream));
-        CUBLAS_CHECK(cublasCreate(&m_cublas_handle));
-        CUBLAS_CHECK(cublasSetMathMode(m_cublas_handle, CUBLAS_TF32_TENSOR_OP_MATH));
-        CUBLAS_CHECK(cublasSetStream(m_cublas_handle, m_stream));
-
-        // create Hann window
-        {
-            auto hw = whisper_mel_calc::hann_window();
-            CUDA_CHECK(cudaMallocAsync(&m_hann_window, hw.len * sizeof(float), m_stream));
-            CUDA_CHECK(cudaMemcpyAsync(m_hann_window, hw.data, hw.len * sizeof(float), cudaMemcpyHostToDevice, m_stream));
-        }
-
-        // fill filters
-        {
-            auto& f = filters.data;
-            CUDA_CHECK(cudaMallocAsync(&m_filters, f.size() * sizeof(float), m_stream));
-            CUDA_CHECK(cudaMemcpyAsync(m_filters, f.data(), f.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
-        }
-
-        // preallocate working areas enough for the most common cases (<= 30s)
-        ensure_working_areas(WHISPER_N_SAMPLES);
-    }
-
-    ~mel_calc_cuda() {
-        ggml_cuda_set_device(m_device);
-        CUDA_CHECK(cudaStreamSynchronize(m_stream));
-        CUDA_CHECK(cudaStreamDestroy(m_stream));
-        CUDA_CHECK(cudaFree(m_hann_window));
-        CUDA_CHECK(cudaFree(m_cufft_workspace));
-        CUDA_CHECK(cudaFree(m_filters));
-        CUDA_CHECK(cudaFree(m_log_mel_temp_storage));
-    }
-
-    void ensure_working_areas(int n_samples) {
-        if (n_samples <= m_n_max_samples) {
-            return;
-        }
-
-        const auto max_padded_samples = n_samples + WHISPER_N_SAMPLES + WHISPER_N_FFT;
-        const auto max_frames = 1 + (max_padded_samples - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-
-        // cufft workspace
-        {
-            if (m_cufft_workspace) {
-                CUDA_CHECK(cudaFree(m_cufft_workspace));
-                m_cufft_workspace_size = 0;
-                m_cufft_workspace = nullptr;
-            }
-            CUFFT_CHECK(cufftEstimate1d(WHISPER_N_FFT, CUFFT_R2C, max_frames, &m_cufft_workspace_size));
-            CUDA_CHECK(cudaMallocAsync(&m_cufft_workspace, m_cufft_workspace_size, m_stream));
-        }
-
-        // device reduce working area
-        {
-            if (m_log_mel_temp_storage) {
-                CUDA_CHECK(cudaFree(m_log_mel_temp_storage));
-                m_log_mel_temp_storage_size = 0;
-                m_log_mel_temp_storage = nullptr;
-            }
-
-            const auto max_mels = 160;
-
-            size_t nbytes = 0;
-            float* temp = nullptr;
-            cub::DeviceReduce::Max(nullptr, nbytes, temp, temp, max_frames * max_mels);
-            m_log_mel_temp_storage_size = nbytes + LOG_MEL_PREFIX_SIZE;
-
-            CUDA_CHECK(cudaMallocAsync(&m_log_mel_temp_storage, m_log_mel_temp_storage_size, m_stream));
-        }
-
-        m_n_max_samples = n_samples;
-    }
-
-    virtual whisper_mel calculate(whisper_span<const float> samples, int /*n_threads*/) override {
-        ggml_cuda_set_device(m_device);
-        ensure_working_areas(samples.len);
-
-        const size_t mirror_pad = WHISPER_N_FFT / 2;
-        const size_t padded_size = samples.len + WHISPER_N_SAMPLES + WHISPER_N_FFT;
-
-        // pad
-        std::vector<float> padded_samples(padded_size);
-        std::reverse_copy(samples.data + 1, samples.data + 1 + mirror_pad, padded_samples.begin()); // reflect
-        std::copy(samples.data, samples.data + samples.len, padded_samples.begin() + mirror_pad);   // copy
-
-        // fill the rest of the data
-        // it should canonically be mirrored at the end as well,
-        // but we just assume the last MEL_FRAME_SIZE/2 samples are zeros
-        std::fill(padded_samples.begin() + mirror_pad + samples.len, padded_samples.end(), 0.f);
-
-        const auto n_frames = 1 + (padded_samples.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-
-        float * cu_padded_samples = nullptr;
-        CUDA_CHECK(cudaMallocAsync(&cu_padded_samples, padded_samples.size() * sizeof(float), m_stream));
-        CUDA_CHECK(cudaMemcpyAsync(cu_padded_samples, padded_samples.data(), padded_samples.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
-
-        float * stft_in = nullptr; // contiguous buffer for stft input
-        CUDA_CHECK(cudaMallocAsync(&stft_in, n_frames * WHISPER_N_FFT * sizeof(float), m_stream));
-
-        fill_stft_input(cu_padded_samples, int(n_frames), m_hann_window, stft_in, m_stream);
-
-        cufftComplex* stft_out;
-        CUDA_CHECK(cudaMallocAsync(&stft_out, n_frames * WHISPER_N_FFT_HALF * sizeof(cufftComplex), m_stream));
-
-        cufftHandle plan;
-        CUFFT_CHECK(cufftCreate(&plan));
-        CUFFT_CHECK(cufftSetAutoAllocation(plan, 0));
-        {
-            size_t waSize;
-            CUFFT_CHECK(cufftMakePlan1d(plan, WHISPER_N_FFT, CUFFT_R2C, int(n_frames), &waSize));
-            assert(waSize <= m_cufft_workspace_size);
-            CUFFT_CHECK(cufftSetWorkArea(plan, m_cufft_workspace));
-            CUFFT_CHECK(cufftSetStream(plan, m_stream));
-        }
-        CUFFT_CHECK(cufftExecR2C(plan, stft_in, stft_out));
-
-        const auto n_mag_frames = n_frames - 1; // drop last frame
-        float * magnitudes;
-        CUDA_CHECK(cudaMallocAsync(&magnitudes, n_mag_frames * WHISPER_N_FFT_HALF * sizeof(float), m_stream));
-        calc_magnitudes(stft_out, int(n_mag_frames), magnitudes, m_stream);
-
-        float * mel_data = nullptr;
-        CUDA_CHECK(cudaMallocAsync(&mel_data, m_n_mel * n_mag_frames * sizeof(float), m_stream));
-
-        const float fone = 1.0f, fzero = 0.0f;
-        CUBLAS_CHECK(cublasSgemm(m_cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
-            int(n_mag_frames), m_n_mel, WHISPER_N_FFT_HALF,
-            &fone,
-            magnitudes, WHISPER_N_FFT_HALF,
-            m_filters, WHISPER_N_FFT_HALF,
-            &fzero,
-            mel_data, int(n_mag_frames)));
-
-        whisper_mel ret;
-        // Calculate semi-padded sample length to ensure compatibility
-        int n_len_org = 1 + int(samples.len + mirror_pad - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-        whisper_mel_init(ret, m_backend, int(n_mag_frames), n_len_org, m_n_mel);
-        assert(ggml_nbytes(ret.tensor) == m_n_mel * n_mag_frames * sizeof(float));
-
-        float* log_mels = reinterpret_cast<float*>(ret.tensor->data);
-
-        calc_log_mel(
-            mel_data, int(m_n_mel * n_mag_frames),
-            m_log_mel_temp_storage, int(m_log_mel_temp_storage_size),
-            log_mels, m_stream);
-
-        CUDA_CHECK(cudaStreamSynchronize(m_stream));
-
-        // cleanup
-        CUFFT_CHECK(cufftDestroy(plan));
-        CUDA_CHECK(cudaFreeAsync(mel_data, m_stream));
-        CUDA_CHECK(cudaFreeAsync(magnitudes, m_stream));
-        CUDA_CHECK(cudaFreeAsync(stft_out, m_stream));
-        CUDA_CHECK(cudaFreeAsync(stft_in, m_stream));
-        CUDA_CHECK(cudaFreeAsync(cu_padded_samples, m_stream));
-
-        return ret;
-    }
-};
-
-}
-
-whisper_mel_calc * whisper_mel_calc_create_cuda(ggml_backend_t backend, const whisper_filters & filters) {
-    try {
-        return new mel_calc_cuda(backend, filters);
-    }
-    catch (...) {
-        // TODO: log error (but for this we would have to expose the log state to be accessible here)
-        return nullptr;
-    }
-}
diff --git a/src/whisper-mel-cuda.hpp b/src/whisper-mel-cuda.hpp
deleted file mode 100644
index 2acb6505..00000000
--- a/src/whisper-mel-cuda.hpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "whisper-mel.hpp"
-
-whisper_mel_calc * whisper_mel_calc_create_cuda(ggml_backend_t backend, const whisper_filters & filters);
diff --git a/src/whisper-mel.hpp b/src/whisper-mel.hpp
deleted file mode 100644
index f4210b41..00000000
--- a/src/whisper-mel.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#pragma once
-#include "ggml-backend.h"
-#include <vector>
-
-struct whisper_mel {
-    int n_len_org = 0;
-
-    ggml_context * ctx = nullptr;
-    ggml_tensor * tensor = nullptr;
-    ggml_backend_buffer_t buffer = nullptr;
-};
-
-void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel);
-
-void whisper_mel_free(whisper_mel & mel);
-
-struct whisper_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-template <typename T>
-struct whisper_span {
-    T * data;
-    int len;
-};
-
-struct whisper_mel_calc {
-    virtual ~whisper_mel_calc();
-    virtual whisper_mel calculate(whisper_span<const float> samples, int n_threads) = 0;
-    static whisper_span<const float> hann_window();
-};
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 43ddf80f..4931b0dd 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -10,7 +10,6 @@
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
-#include "whisper-mel-cuda.hpp"
 #endif
 
 #ifdef GGML_USE_SYCL
@@ -37,8 +36,6 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#include "whisper-mel.hpp"
-
 #include <atomic>
 #include <algorithm>
 #include <cassert>
@@ -401,6 +398,21 @@ static const std::map<whisper_alignment_heads_preset, whisper_aheads> g_aheads {
 
 static std::vector<uint32_t> get_alignment_heads_by_layer(const whisper_context_params & cparams, int il, int32_t n_text_layer, int32_t n_head);
 
+struct whisper_mel {
+    int n_len;
+    int n_len_org;
+    int n_mel;
+
+    std::vector<float> data;
+};
+
+struct whisper_filters {
+    int32_t n_mel;
+    int32_t n_fft;
+
+    std::vector<float> data;
+};
+
 struct whisper_vocab {
     using id    = int32_t;
     using token = std::string;
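Note: the scaling that the deleted `k_calc_log_mel` kernel performed on the GPU is the same one the restored CPU path applies later in this patch: floor the raw mel value at 1e-10, take log10, clamp to within 8 of the global maximum, then map via (x + 4) / 4. A minimal standalone sketch of that transform (the helper name `log_mel_scale` is illustrative, not a function in the codebase):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Applies whisper's log-mel scaling in place: floor at 1e-10, log10,
// clamp to (max - 8), then map into roughly [-1, 1] via (x + 4) / 4.
static void log_mel_scale(std::vector<float> & mel) {
    float mmax = -1e20f;
    for (float v : mel) {
        mmax = std::max(mmax, std::log10(std::max(v, 1e-10f)));
    }
    const float lo = mmax - 8.0f;
    for (float & v : mel) {
        v = std::max(std::log10(std::max(v, 1e-10f)), lo);
        v = (v + 4.0f) / 4.0f;
    }
}
```

The CUDA version computed the maximum over the raw values with `cub::DeviceReduce::Max` and took the log inside the kernel; since log10 is monotonic, clamping against `log10(max) - 8` yields the same floor as the ordering above.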
@@ -830,8 +842,6 @@ struct whisper_state {
     whisper_kv_cache kv_pad;
 
     whisper_mel mel;
-    whisper_mel_calc * mel_calc = nullptr;
-    whisper_mel_calc * mel_calc_fallback = nullptr;
 
     whisper_batch batch;
 
@@ -850,6 +860,7 @@ struct whisper_state {
     struct ggml_tensor * embd_enc = nullptr;
 
     // helpers for GPU offloading
+    std::vector<float> inp_mel;
     std::vector<float> inp_mask;
 
     // decode output (2-dimensional array: [n_tokens][n_vocab])
@@ -1912,8 +1923,7 @@ static bool whisper_encode_external(const whisper_state & wstate) {
 
 static struct ggml_cgraph * whisper_build_graph_conv(
         whisper_context & wctx,
-        whisper_state & wstate,
-        const int mel_offset) {
+        whisper_state & wstate) {
     const auto & model   = wctx.model;
     const auto & hparams = model.hparams;
 
@@ -1932,35 +1942,9 @@ static struct ggml_cgraph * whisper_build_graph_conv(
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    GGML_ASSERT(wstate.mel.tensor);
-
-    ggml_tensor * mel_inp = wstate.mel.tensor;
-    ggml_set_input(mel_inp);
-
-    ggml_tensor * mel;
-    if (ggml_nelements(mel_inp) > 0) {
-        const int n_len = int(mel_inp->ne[0]);
-        const int out_s = 2 * n_ctx;
-        const int i0 = std::min(mel_offset, n_len);
-        const int i1 = std::min(mel_offset + out_s, n_len);
-        const int mel_s = i1 - i0;
-
-        assert(mel_inp->type == GGML_TYPE_F32);
-        assert(mel_inp->ne[1] == n_mels);
-
-        ggml_tensor * cur = ggml_view_2d(ctx0, mel_inp, out_s, n_mels, mel_inp->nb[1], ggml_row_size(mel_inp->type, i0));
-
-        if (mel_s < out_s) {
-            mel = ggml_pad(ctx0, cur, out_s - mel_s, 0, 0, 0);
-        } else {
-            mel = ggml_cont(ctx0, cur);
-        }
-    } else {
-        // empty mel - just create a dummy tensor with the correct size
-        mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
-    }
-
+    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
     ggml_set_name(mel, "mel");
+    ggml_set_input(mel);
 
     struct ggml_tensor * cur = nullptr;
 
@@ -2332,21 +2316,45 @@ static bool whisper_encode_internal(
     {
         auto & sched = wstate.sched_conv.sched;
 
-        ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
+        ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate);
 
         if (!ggml_backend_sched_alloc_graph(sched, gf)) {
             // should never happen as we pre-allocate the memory
             return false;
         }
 
-        if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
-            return false;
+        struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
+
+        // set the input
+        {
+            const auto & mel_inp = wstate.mel;
+            const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
+
+            assert(mel->type == GGML_TYPE_F32);
+            assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
+
+            wstate.inp_mel.resize(ggml_nelements(mel));
+
+            float * dst = wstate.inp_mel.data();
+            memset(dst, 0, ggml_nbytes(mel));
+
+            const int i0 = std::min(mel_offset, mel_inp.n_len);
+            const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
+
+            for (int j = 0; j < mel_inp.n_mel; ++j) {
+                for (int i = i0; i < i1; ++i) {
+                    dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+                }
+            }
+
+            ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
         }
 
-        if (whisper_encode_external(wstate)) {
-            ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
-            assert(mel->ne[1] == wctx.model.hparams.n_mels);
-            GGML_UNUSED(mel);
+        if (!whisper_encode_external(wstate)) {
+            if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
+                return false;
+            }
+        } else {
 #if defined(WHISPER_USE_COREML)
             whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
 #elif defined(WHISPER_USE_OPENVINO)
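With the mel tensor removed from `whisper_state`, the conv graph now always allocates a fixed `2*n_ctx` x `n_mels` input named "mel", and `whisper_encode_internal` copies the requested window of the CPU-side spectrogram into it, zero-filling past the end of the audio. A self-contained sketch of that window copy (`copy_mel_window` is a hypothetical name; the real code inlines the loop above and then uploads the staging buffer with `ggml_backend_tensor_set`):

```cpp
#include <algorithm>
#include <cstring>

// dst is a row-major [n_mel][2*n_ctx] staging buffer for the "mel" graph
// input; src is the full spectrogram stored as [n_mel][n_len].
static void copy_mel_window(float * dst, const float * src,
                            int n_mel, int n_len, int n_ctx, int mel_offset) {
    std::memset(dst, 0, sizeof(float) * n_mel * 2 * n_ctx);

    // clamp the window to the available frames - near the end of the audio
    // the copied region is shorter than 2*n_ctx and the rest stays zero
    const int i0 = std::min(mel_offset, n_len);
    const int i1 = std::min(mel_offset + 2*n_ctx, n_len);

    for (int j = 0; j < n_mel; ++j) {
        for (int i = i0; i < i1; ++i) {
            dst[j*2*n_ctx + (i - i0)] = src[j*n_len + i];
        }
    }
}
```

This trades the previous in-graph `ggml_view_2d`/`ggml_pad` slicing for a plain host-side copy, which is what lets the graph shape stay constant across calls.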
@@ -2970,35 +2978,6 @@ struct whisper_global_cache {
 } global_cache;
 }
 
-// Mel spectrogram
-
-void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel) {
-    //WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
-    mel.n_len_org = n_len_org;
-    assert(!mel.ctx);
-    mel.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
-    mel.tensor = ggml_new_tensor_2d(mel.ctx, GGML_TYPE_F32, n_len, n_mel);
-    mel.buffer = ggml_backend_alloc_buffer(backend, ggml_nbytes(mel.tensor) + ggml_backend_get_alignment(backend));
-    auto alloc = ggml_tallocr_new(mel.buffer);
-    ggml_tallocr_alloc(&alloc, mel.tensor);
-}
-
-void whisper_mel_free(whisper_mel & mel) {
-    ggml_free(mel.ctx);
-    ggml_backend_buffer_free(mel.buffer);
-
-    mel.n_len_org = 0;
-    mel.ctx    = nullptr;
-    mel.tensor = nullptr;
-    mel.buffer = nullptr;
-}
-
-whisper_mel_calc::~whisper_mel_calc() = default; // export vtable
-
-whisper_span<const float> whisper_mel_calc::hann_window() {
-    return {global_cache.hann_window, WHISPER_N_FFT};
-}
-
 // naive Discrete Fourier Transform
 // input is real-valued
 // output is complex-valued
@@ -3068,22 +3047,12 @@ static void fft(float* in, int N, float* out) {
     }
 }
 
-namespace {
-
-struct whisper_mel_data {
-    int n_len;
-    int n_len_org;
-    int n_mel;
-    float * data;
-};
-
-void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
-                                       int n_samples, int n_threads,
-                                       const whisper_filters & filters, whisper_mel_data & mel) {
-    const auto frame_size = WHISPER_N_FFT;
-    const auto frame_step = WHISPER_HOP_LENGTH;
+static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
+                                              int n_samples, int frame_size, int frame_step, int n_threads,
+                                              const whisper_filters & filters, whisper_mel & mel) {
     std::vector<float> fft_in(frame_size * 2, 0.0);
     std::vector<float> fft_out(frame_size * 2 * 2 * 2);
+
     int n_fft = filters.n_fft;
     int i = ith;
 
@@ -3098,6 +3067,7 @@
         for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
             fft_in[j] = hann[j] * samples[offset + j];
         }
+
        // fill the rest with zeros
        if (n_samples - offset < frame_size) {
            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
@@ -3115,7 +3085,6 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
 
        // mel spectrogram
        for (int j = 0; j < mel.n_mel; j++) {
            double sum = 0.0;
-
            // unroll loop (suggested by GH user @lunixbochs)
            int k = 0;
            for (k = 0; k < n_fft - 3; k += 4) {
@@ -3125,14 +3094,11 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
                        fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
                        fft_out[k + 3] * filters.data[j * n_fft + k + 3];
            }
-
            // handle n_fft remainder
            for (; k < n_fft; k++) {
                sum += fft_out[k] * filters.data[j * n_fft + k];
            }
-
            sum = log10(std::max(sum, 1e-10));
-
            mel.data[j * mel.n_len + i] = sum;
        }
    }
@@ -3146,116 +3112,97 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
     }
 }
 
-struct mel_calc_cpu : public whisper_mel_calc {
-    ggml_backend_t m_backend;
-    const whisper_filters & m_filters;
-    mel_calc_cpu(ggml_backend_t backend, const whisper_filters & filters) : m_backend(backend), m_filters(filters) {}
+// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
+static bool log_mel_spectrogram(
+        whisper_state & wstate,
+        const float * samples,
+        const int   n_samples,
+        const int   /*sample_rate*/,
+        const int   frame_size,
+        const int   frame_step,
+        const int   n_mel,
+        const int   n_threads,
+        const whisper_filters & filters,
+        const bool  debug,
+        whisper_mel & mel) {
+    const int64_t t_start_us = ggml_time_us();
 
-    // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
-    whisper_mel calculate(whisper_span<const float> ssamples, int n_threads) override {
-        // Hann window
-        const float * hann = global_cache.hann_window;
+    // Hann window
+    WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
+    const float * hann = global_cache.hann_window;
 
-        // Calculate the length of padding
-        int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
-        int64_t stage_2_pad = WHISPER_N_FFT / 2;
+    // Calculate the length of padding
+    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
+    int64_t stage_2_pad = frame_size / 2;
 
-        const int n_samples = int(ssamples.len);
-        const float * samples = ssamples.data;
+    // Initialize a vector and copy data from C array to it.
+    std::vector<float> samples_padded;
+    samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
+    std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
 
-        // Initialize a vector and copy data from C array to it.
-        std::vector<float> samples_padded;
-        samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
-        std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
+    // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
+    std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
 
-        // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
-        std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
+    // reflective pad 200 samples at the beginning of audio
+    std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
 
-        // reflective pad 200 samples at the beginning of audio
-        std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
+    mel.n_mel     = n_mel;
+    // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
+    // Calculate number of frames + remove the last frame
+    mel.n_len     = (samples_padded.size() - frame_size) / frame_step;
+    // Calculate semi-padded sample length to ensure compatibility
+    mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
+    mel.data.resize(mel.n_mel * mel.n_len);
 
-        whisper_mel_data mel;
-        mel.n_mel = m_filters.n_mel;
-        // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
-        // Calculate number of frames + remove the last frame
-        mel.n_len = (samples_padded.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-        // Calculate semi-padded sample length to ensure compatibility
-        mel.n_len_org = 1 + (n_samples + stage_2_pad - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-
-        std::vector<float> host_mel_data;
-
-        whisper_mel ret;
-        whisper_mel_init(ret, m_backend, mel.n_len, mel.n_len_org, mel.n_mel);
-        if (ggml_backend_buffer_is_host(ret.buffer)) {
-            mel.data = reinterpret_cast<float*>(ret.tensor->data);
-        } else {
-            host_mel_data.resize(mel.n_len * mel.n_mel);
-            mel.data = host_mel_data.data();
+    {
+        std::vector<std::thread> workers(n_threads - 1);
+        for (int iw = 0; iw < n_threads - 1; ++iw) {
+            workers[iw] = std::thread(
+                    log_mel_spectrogram_worker_thread, iw + 1, hann, samples_padded,
+                    n_samples + stage_2_pad, frame_size, frame_step, n_threads,
+                    std::cref(filters), std::ref(mel));
         }
 
-        {
-            std::vector<std::thread> workers(n_threads - 1);
-            for (int iw = 0; iw < n_threads - 1; ++iw) {
-                workers[iw] = std::thread(
-                        log_mel_spectrogram_worker_thread, iw + 1, hann, samples_padded,
-                        n_samples + stage_2_pad, n_threads,
-                        std::cref(m_filters), std::ref(mel));
-            }
+        // main thread
+        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
 
-            // main thread
-            log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, n_threads, m_filters, mel);
-
-            for (int iw = 0; iw < n_threads - 1; ++iw) {
-                workers[iw].join();
-            }
-        }
-
-        // clamping and normalization
-        double mmax = -1e20;
-        for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-            if (mel.data[i] > mmax) {
-                mmax = mel.data[i];
-            }
-        }
-
-        mmax -= 8.0;
-
-        for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-            if (mel.data[i] < mmax) {
-                mel.data[i] = mmax;
-            }
-
-            mel.data[i] = (mel.data[i] + 4.0)/4.0;
-        }
-
-        if (!host_mel_data.empty()) {
-            // the ret buffer is not host-accessible so we used this temporary buffer and now we need to upload it
-            ggml_backend_tensor_set(ret.tensor, host_mel_data.data(), 0, ggml_nbytes(ret.tensor));
-        }
-
-        return ret;
-    }
-};
-}
-
-static whisper_mel_calc * whisper_mel_calc_create(ggml_backend_t backend, const whisper_filters & filters) {
-// TODO: disabled because it relies on ggml internals that are no longer accessible (ggml-backend-impl.h, ggml-cuda/common.cuh, ..)
-//#if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
-#if 0
-    if (ggml_backend_is_cuda(backend)) {
-        auto ret = whisper_mel_calc_create_cuda(backend, filters);
-        if (ret) {
-            // run a warmup to avoid the first kernel launch overhead (thus we get the best perf even on the first run)
-            const float warmup[256] = { 0 };
-            ret->calculate({ warmup, 256 }, 1);
-            return ret;
+        for (int iw = 0; iw < n_threads - 1; ++iw) {
+            workers[iw].join();
         }
     }
-#endif
 
-    // a specialized mel_calc could not be created
-    // fall back to CPU
-    return new mel_calc_cpu(backend, filters);
+    // clamping and normalization
+    double mmax = -1e20;
+    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
+        if (mel.data[i] > mmax) {
+            mmax = mel.data[i];
+        }
+    }
+
+    mmax -= 8.0;
+
+    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
+        if (mel.data[i] < mmax) {
+            mel.data[i] = mmax;
+        }
+
+        mel.data[i] = (mel.data[i] + 4.0)/4.0;
+    }
+
+    wstate.t_mel_us += ggml_time_us() - t_start_us;
+
+    // Dump log_mel_spectrogram
+    if (debug) {
+        std::ofstream outFile("log_mel_spectrogram.json");
+        outFile << "[";
+        for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
+            outFile << mel.data[i] << ", ";
+        }
+        outFile << mel.data[mel.data.size() - 1] << "]";
+        outFile.close();
+    }
+
+    return true;
 }
 
 // split text into tokens
@@ -3380,17 +3327,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
         return nullptr;
     }
 
-    state->mel_calc = whisper_mel_calc_create(state->backends[0], ctx->model.filters);
-
-    // init 60s of random mel data
-    {
-        const int n_len = 2*100*WHISPER_CHUNK_SIZE;
-        const int n_mel = ctx->model.filters.n_mel;
-
-        whisper_mel_free(state->mel);
-        whisper_mel_init(state->mel, state->backends[0], n_len, n_len, n_mel);
-    }
-
     // at this point, we don't know yet how many decoders will be used
     // later during decoding, if more decoders are used, we will recreate the KV cache respectively
    state->kv_self_n_dec = 1;
@@ -3483,7 +3419,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     {
         bool ok = whisper_sched_graph_init(state->sched_conv, state->backends,
                 [&]() {
-                    return whisper_build_graph_conv(*ctx, *state, 0);
+                    return whisper_build_graph_conv(*ctx, *state);
                 });
 
         if (!ok) {
@@ -3805,13 +3741,6 @@ void whisper_free_state(struct whisper_state * state) {
     whisper_kv_cache_free(state->kv_cross);
     whisper_kv_cache_free(state->kv_pad);
 
-    whisper_mel_free(state->mel);
-
-    delete state->mel_calc;
-    state->mel_calc = nullptr;
-    delete state->mel_calc_fallback;
-    state->mel_calc_fallback = nullptr;
-
 #ifdef WHISPER_USE_COREML
     if (state->ctx_coreml != nullptr) {
         whisper_coreml_free(state->ctx_coreml);
@@ -3869,37 +3798,11 @@ void whisper_free_params(struct whisper_full_params * params) {
 }
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    const int64_t t_start_us = ggml_time_us();
-
-    whisper_mel_free(state->mel);
-    if (n_samples <= 5 * 60 * WHISPER_SAMPLE_RATE) {
-        // calculate mel spectrogram for lengths up to 5 minutes on the most optimal mel calculator
-        state->mel = state->mel_calc->calculate({samples, n_samples}, n_threads);
-    } else {
-        // calculate mel spectrogram for longer audios on the CPU
-        // 1. gpu calculations may use hundreds of megabytes of memory for longer audios so we're being conservative
-        //    with our gpu demands
-        // 2. the time to transcribe audios this long will be dominated by the decoding time, so the mel calculation
-        //    taking longer is not a major concern
-        if (!state->mel_calc_fallback) {
-            state->mel_calc_fallback = new mel_calc_cpu(state->backends[0], ctx->model.filters);
-        }
-        state->mel = state->mel_calc_fallback->calculate({samples, n_samples}, n_threads);
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
+        WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
+        return -1;
     }
 
-    state->t_mel_us += ggml_time_us() - t_start_us;
-
-    // Dump log_mel_spectrogram
-    //{
-    //    auto & mel = state->mel;
-    //    std::ofstream outFile("log_mel_spectrogram.json");
-    //    outFile << "[";
-    //    for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
-    //        outFile << mel.data[i] << ", ";
-    //    }
-    //    outFile << mel.data[mel.data.size() - 1] << "]";
-    //    outFile.close();
-    //}
     return 0;
 }
 
@@ -3918,10 +3821,12 @@ int whisper_set_mel_with_state(
         return -1;
     }
 
-    whisper_mel_free(state->mel);
-    whisper_mel_init(state->mel, state->backends[0], n_len, n_len, n_mel);
+    state->mel.n_len     = n_len;
+    state->mel.n_len_org = n_len;
+    state->mel.n_mel     = n_mel;
 
-    ggml_backend_tensor_set(state->mel.tensor, data, 0, ggml_nbytes(state->mel.tensor));
+    state->mel.data.resize(n_len*n_mel);
+    memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
 
     return 0;
 }
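For a concrete feel of the frame-count arithmetic in the restored log_mel_spectrogram, here is a worked example. It assumes the usual whisper.cpp constants (WHISPER_SAMPLE_RATE = 16000, WHISPER_N_FFT = 400, WHISPER_HOP_LENGTH = 160), which are not restated in this patch:

```cpp
#include <cstdio>

int main() {
    const int n_samples   = 16000;            // 1 s of 16 kHz audio
    const int frame_size  = 400;              // WHISPER_N_FFT
    const int frame_step  = 160;              // WHISPER_HOP_LENGTH
    const int stage_1_pad = 16000 * 30;       // 30 s of trailing zeros
    const int stage_2_pad = frame_size / 2;   // 200-sample reflective pad

    const int padded    = n_samples + stage_1_pad + 2*stage_2_pad;                 // 496400 samples
    const int n_len     = (padded - frame_size) / frame_step;                      // 3100 frames total
    const int n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; // 99 frames of real audio

    std::printf("n_len = %d, n_len_org = %d\n", n_len, n_len_org);
    return 0;
}
```

n_len counts frames over the fully padded buffer (hence the 30-second tail dominating the 3100), while n_len_org records how many of those frames correspond to the real, semi-padded input; the rest of the pipeline uses n_len_org to bound decoding.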