Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2024-12-21)
whisper : add GPU support via cuBLAS (#834)

* make : add WHISPER_CUBLAS
* make : fix CUBLAS build
* whisper : disable Flash Attention + adjust memory buffers
* whisper : remove old commented code
* readme : add cuBLAS instructions
* cmake : add WHISPER_CUBLAS option
* gitignore : ignore build-cublas

commit 5fd1bdd7fc, parent 0ccd6746c9
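The new switch is opt-in at build time: `WHISPER_CUBLAS=1 make -j` on the Makefile path, or configuring CMake with `-DWHISPER_CUBLAS=ON`. Both flags appear in the diffs below.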
.gitignore: 1 line changed

@@ -12,6 +12,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
CMakeLists.txt

@@ -51,7 +51,7 @@ option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
 option(WHISPER_BUILD_TESTS    "whisper: build tests"    ${WHISPER_STANDALONE})
 option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
 
-option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
+option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
 
 if (APPLE)
     option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
@@ -62,7 +62,8 @@ if (APPLE)
     option(WHISPER_COREML                "whisper: enable Core ML framework"  OFF)
     option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 else()
-    option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_CUBLAS   "whisper: support for cuBLAS"   OFF)
 endif()
 
 option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -127,7 +128,7 @@ if (APPLE)
     endif()
 endif()
 
-if (WHISPER_SUPPORT_OPENBLAS)
+if (WHISPER_OPENBLAS)
     find_library(OPENBLAS_LIB
         NAMES openblas libopenblas
         )
@@ -141,6 +142,31 @@ if (WHISPER_SUPPORT_OPENBLAS)
     endif()
 endif()
 
+if (WHISPER_CUBLAS)
+    cmake_minimum_required(VERSION 3.17)
+
+    find_package(CUDAToolkit)
+
+    if (CUDAToolkit_FOUND)
+        message(STATUS "cuBLAS found")
+
+        enable_language(CUDA)
+
+        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+
+        add_compile_definitions(GGML_USE_CUBLAS)
+
+        if (WHISPER_STATIC)
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        else()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+    else()
+        message(WARNING "cuBLAS not found")
+    endif()
+endif()
+
 # compiler flags
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -247,6 +273,7 @@ set(TARGET whisper)
 add_library(${TARGET}
     ggml.h
     ggml.c
+    ${GGML_CUDA_SOURCES}
     whisper.h
     whisper.cpp
     )
@@ -279,6 +306,12 @@ if (BUILD_SHARED_LIBS)
         )
 endif()
 
+if (GGML_CUDA_SOURCES)
+    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+endif()
+
 if (EMSCRIPTEN)
     set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
Makefile: 28 lines changed

@@ -1,3 +1,5 @@
+default: main bench
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -157,6 +159,18 @@ ifdef WHISPER_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 
+ifdef WHISPER_CUBLAS
+	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	WHISPER_OBJ += ggml-cuda.o
+	NVCC        = nvcc
+	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=native
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
+
 ifdef WHISPER_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@@ -200,20 +214,18 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main bench
-
 #
 # Build library
 #
 
-ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+ggml.o: ggml.c ggml.h ggml-cuda.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
 
-whisper.o: whisper.cpp whisper.h ggml.h
-	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
+whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
+WHISPER_OBJ += whisper.o
 else
 whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
 	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
@@ -221,7 +233,7 @@ whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
 whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
 	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
 
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
 endif
 
 libwhisper.a: ggml.o $(WHISPER_OBJ)
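Two details in the Makefile changes are easy to miss. GNU make treats the first rule in a file as the default goal, so `default: main bench` moves to the top; otherwise the new `ggml-cuda.o` rule inside the `WHISPER_CUBLAS` block would become the default target. Relatedly, `WHISPER_OBJ` switches from `=` to `+=` so the `ggml-cuda.o` object appended in that block is not clobbered by a later plain assignment.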
README.md: 16 lines changed

@@ -18,6 +18,7 @@
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
+- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
 
 Supported platforms:
@@ -281,10 +282,23 @@
 
 For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
 
+## NVIDIA GPU support via cuBLAS
+
+With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
+First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
+
+Now build `whisper.cpp` with cuBLAS support:
+
+```
+make clean
+WHISPER_CUBLAS=1 make -j
+```
+
+Run all the examples as usual.
+
 ## Limitations
 
 - Inference only
-- No GPU support (yet)
 
 ## Another example
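The cuBLAS offload requires no changes on the caller's side; the C-style API linked above is unchanged. For orientation, a minimal sketch of driving that API (the model path and the silent input buffer are placeholders, not part of this commit):

```
// Minimal sketch of the whisper.h C-style API; the cuBLAS offload is
// transparent here. Model path and audio are stand-ins for a real setup.
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");
    if (ctx == nullptr) {
        return 1;
    }

    std::vector<float> pcm(16000*5, 0.0f); // 5 s of 16 kHz mono samples (silence)

    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    if (whisper_full(ctx, params, pcm.data(), (int) pcm.size()) != 0) {
        whisper_free(ctx);
        return 1;
    }

    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        printf("%s\n", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}
```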
examples/CMakeLists.txt

@@ -4,7 +4,7 @@ find_package(Threads REQUIRED)
 
 # third-party
 
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # SDL2
     find_package(SDL2 REQUIRED)
 
@@ -27,7 +27,7 @@ include(DefaultTargetOptions)
 
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # common-sdl
 
     set(TARGET common-sdl)

examples/command/CMakeLists.txt

@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # command
     set(TARGET command)
     add_executable(${TARGET} command.cpp)

examples/stream/CMakeLists.txt

@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # stream
     set(TARGET stream)
     add_executable(${TARGET} stream.cpp)

examples/talk-llama/CMakeLists.txt

@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
     #add_executable(${TARGET} talk-llama.cpp llama.cpp)

examples/talk/CMakeLists.txt

@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # talk
     set(TARGET talk)
     #add_executable(${TARGET} talk.cpp gpt-2.cpp)
whisper.cpp: 37 lines changed

@@ -102,7 +102,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_PRINT_DEBUG(...)
 #endif
 
-#define WHISPER_USE_FLASH_ATTN
+//#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 16
 
@@ -224,11 +224,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
 static const size_t MB = 1ull*1024*1024;
 
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,     14ull*MB },
-    { MODEL_BASE,     18ull*MB },
-    { MODEL_SMALL,    28ull*MB },
-    { MODEL_MEDIUM,   36ull*MB },
-    { MODEL_LARGE,    44ull*MB },
+    { MODEL_TINY,     62ull*MB },
+    { MODEL_BASE,     80ull*MB },
+    { MODEL_SMALL,   120ull*MB },
+    { MODEL_MEDIUM,  158ull*MB },
+    { MODEL_LARGE,   198ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@@ -280,11 +280,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
 };
 
 static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,      6ull*MB },
-    { MODEL_BASE,      8ull*MB },
-    { MODEL_SMALL,    13ull*MB },
-    { MODEL_MEDIUM,   22ull*MB },
-    { MODEL_LARGE,    33ull*MB },
+    { MODEL_TINY,     30ull*MB },
+    { MODEL_BASE,     38ull*MB },
+    { MODEL_SMALL,    56ull*MB },
+    { MODEL_MEDIUM,   74ull*MB },
+    { MODEL_LARGE,    94ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_DECODE = {
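Why the scratch and encode buffers grow: with `WHISPER_USE_FLASH_ATTN` commented out, the encoder materializes the full per-head attention matrix instead of streaming it. A rough order-of-magnitude sketch (a hedged estimate, not the exact accounting whisper.cpp performs; the constants are the standard Whisper tiny encoder values):

```
// Rough estimate only, not the exact buffer accounting in whisper.cpp.
// Without Flash Attention, softmax(K*Q^T) is materialized per head.
#include <cstdio>

int main() {
    const int n_ctx  = 1500; // encoder context: 30 s of audio at 50 frames/s
    const int n_head = 6;    // attention heads in the tiny model
    const double kq_mb = (double) n_head * n_ctx * n_ctx * sizeof(float) / (1024.0*1024.0);
    printf("full KQ per layer: ~%.0f MB\n", kq_mb); // ~51 MB, same order as the 14 -> 62 MB bump
    return 0;
}
```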
@ -1554,26 +1554,17 @@ static bool whisper_encode_internal(
|
|||||||
|
|
||||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
|
||||||
|
|
||||||
//struct ggml_tensor * V_trans =
|
|
||||||
// ggml_permute(ctx0,
|
|
||||||
// ggml_cpy(ctx0,
|
|
||||||
// Vcur,
|
|
||||||
// ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
|
||||||
// 1, 2, 0, 3);
|
|
||||||
|
|
||||||
//struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
|
||||||
|
|
||||||
struct ggml_tensor * V =
|
struct ggml_tensor * V =
|
||||||
ggml_cpy(ctx0,
|
ggml_cpy(ctx0,
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
Vcur,
|
Vcur,
|
||||||
n_state/n_head, n_head, n_ctx),
|
n_state/n_head, n_head, n_ctx),
|
||||||
0, 2, 1, 3),
|
1, 2, 0, 3),
|
||||||
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
|
ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
|
||||||
);
|
);
|
||||||
|
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||||
#endif
|
#endif
|
||||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
|
||||||
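The change pays for the transpose of V once, in the `ggml_cpy` (note the swapped permute axes and the `n_ctx`-first tensor shape), so the final `ggml_mul_mat` no longer needs `ggml_transpose` at every step. A toy sketch in plain C++ (not ggml code; names and shapes are illustrative) of why the transposed layout helps: both operands of the final product are then walked along contiguous rows, the access pattern dense GEMM kernels such as cuBLAS are optimized for.

```
// Toy illustration, not ggml: computing O = S * V with V pre-transposed.
// S is the n x n attention matrix, Vt stores V as d x n (transposed).
#include <cstdio>
#include <vector>

int main() {
    const int n = 4, d = 3; // n: tokens, d: per-head dimension
    std::vector<float> S(n*n, 0.25f); // stand-in for softmax(K*Q^T)
    std::vector<float> Vt(d*n);       // V kept transposed, as in the diff
    for (int i = 0; i < d*n; ++i) Vt[i] = (float) i;

    std::vector<float> O(n*d, 0.0f);
    for (int i = 0; i < n; ++i) {
        for (int k = 0; k < d; ++k) {
            float acc = 0.0f;
            for (int j = 0; j < n; ++j) {
                acc += S[i*n + j] * Vt[k*n + j]; // row i of S and row k of Vt: both contiguous
            }
            O[i*d + k] = acc;
        }
    }
    printf("O[0][0] = %.2f\n", O[0]);
    return 0;
}
```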