From a0ddd8392c3427e833d893dc79b9b50f43cf8c9e Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:52:24 +0530 Subject: [PATCH] whisper : add SYCL support (#1863) * add changes from llama upstream * add sycl abstraction * add sycl build * update cmake * add sycl build config * fix bug * fix bug * refactor build * fix bug * update build * call build * use sycl header * add examples * add target * fix typecast in quant.c * readd fp16 and readme * fix quant typecast * add sample * add readme * remove cxx file check --- .github/workflows/build.yml | 100 +++++++++++++ CMakeLists.txt | 47 +++++- README_sycl.md | 249 +++++++++++++++++++++++++++++++ examples/CMakeLists.txt | 3 + examples/sycl/CMakeLists.txt | 9 ++ examples/sycl/README.md | 47 ++++++ examples/sycl/build.sh | 19 +++ examples/sycl/ls-sycl-device.cpp | 11 ++ examples/sycl/run-whisper.sh | 17 +++ whisper.cpp | 14 ++ 10 files changed, 510 insertions(+), 6 deletions(-) create mode 100644 README_sycl.md create mode 100644 examples/sycl/CMakeLists.txt create mode 100644 examples/sycl/README.md create mode 100644 examples/sycl/build.sh create mode 100644 examples/sycl/ls-sycl-device.cpp create mode 100644 examples/sycl/run-whisper.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b36491ce..7355d70f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -150,6 +150,106 @@ jobs: make ctest -L gh --output-on-failure' + ubuntu-22-cmake-sycl: + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + matrix: + dwhisper_sycl: [ON] + dcmake_c_compiler: [icx] + dcmake_cxx_compiler: [icpx] + arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le] + + continue-on-error: true + + steps: + - name: Clone + uses: actions/checkout@v3 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add 
GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake --build . --config Release -j $(nproc) + + ubuntu-22-cmake-sycl-fp16: + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + matrix: + dwhisper_sycl: [ON] + dcmake_c_compiler: [icx] + dcmake_cxx_compiler: [icpx] + arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le] + + continue-on-error: true + + steps: + - name: Clone + uses: actions/checkout@v3 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake --build . 
--config Release -j $(nproc) + windows: runs-on: windows-latest diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c6a844b..7d95d3d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,12 +70,14 @@ if (APPLE) option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF) option(WHISPER_METAL_EMBED_LIBRARY "whisper: embed Metal library" OFF) else() - option(WHISPER_BLAS "whisper: use BLAS libraries" OFF) - option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic) - option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF) - option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF) - option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF) - option(WHISPER_CLBLAST "whisper: use CLBlast" OFF) + option(WHISPER_BLAS "whisper: use BLAS libraries" OFF) + option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic) + option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF) + option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF) + option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF) + option(WHISPER_CLBLAST "whisper: use CLBlast" OFF) + option(WHISPER_SYCL "whisper: use SYCL" OFF) + option(WHISPER_SYCL_F16 "whisper: use 16 bit floats for sycl calculations" OFF) endif() option(WHISPER_PERF "whisper: enable perf timings" OFF) @@ -106,6 +108,13 @@ endif() find_package(Threads REQUIRED) +#compile flag sycl +if (WHISPER_SYCL) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 11) +endif() + # on APPLE if (APPLE) # include Accelerate framework @@ -309,6 +318,30 @@ if( WHISPER_OPENVINO ) find_package(OpenVINO REQUIRED COMPONENTS Runtime) endif() +if (WHISPER_SYCL) + if ( NOT DEFINED ENV{ONEAPI_ROOT}) + message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") + endif() + #todo: AOT + + find_package(IntelSYCL REQUIRED) + if (WHISPER_SYCL_F16) + add_compile_definitions(GGML_SYCL_F16) + endif() + add_compile_definitions(GGML_USE_SYCL) + + 
add_compile_options(-I./) #include DPCT + add_compile_options(-I/${SYCL_INCLUDE_DIR}) + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + + set(GGML_HEADERS_SYCL ggml-sycl.h) + set(GGML_SOURCES_SYCL ggml-sycl.cpp) + + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) +endif() # compiler flags if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) @@ -503,6 +536,8 @@ add_library(${TARGET} ${GGML_SOURCES_METAL} ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} + ${GGML_SOURCES_SYCL} + ${GGML_HEADERS_SYCL} whisper.h whisper.cpp ) diff --git a/README_sycl.md b/README_sycl.md new file mode 100644 index 00000000..9ea2a790 --- /dev/null +++ b/README_sycl.md @@ -0,0 +1,249 @@ +# whisper.cpp for SYCL + +[Background](#background) + +[OS](#os) + +[Intel GPU](#intel-gpu) + +[Linux](#linux) + +[Environment Variable](#environment-variable) + +[Known Issue](#known-issue) + +[Todo](#todo) + +## Background + +SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17. + +oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms. + +Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs. + +To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL. 
+ +The whisper.cpp for SYCL is used to support Intel GPUs. + +For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build). + +## OS + +|OS|Status|Verified| +|-|-|-| +|Linux|Support|Ubuntu 22.04| +|Windows|Ongoing| | + + +## Intel GPU + +|Intel GPU| Status | Verified Model| +|-|-|-| +|Intel Data Center Max Series| Support| Max 1550| +|Intel Data Center Flex Series| Support| Flex 170| +|Intel Arc Series| Support| Arc 770| +|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| +|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7| + + +## Linux + +### Setup Environment + +1. Install Intel GPU driver. + +a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html). + +Note: for iGPU, please install the client GPU driver. + +b. Add user to group: video, render. + +``` +sudo usermod -aG render username +sudo usermod -aG video username +``` + +Note: re-login to enable it. + +c. Check + +``` +sudo apt install clinfo +sudo clinfo -l +``` + +Output (example): + +``` +Platform #0: Intel(R) OpenCL Graphics + `-- Device #0: Intel(R) Arc(TM) A770 Graphics + + +Platform #0: Intel(R) OpenCL HD Graphics + `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49] +``` + +2. Install Intel® oneAPI Base toolkit. + + +a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). + +Recommend to install to default folder: **/opt/intel/oneapi**. + +Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder. + +b. Check + +``` +source /opt/intel/oneapi/setvars.sh + +sycl-ls +``` + +There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**. 
+ +Output (example): +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] + +``` + +2. Build locally: + +``` +mkdir -p build +cd build +source /opt/intel/oneapi/setvars.sh + +#for FP16 +#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON + +#for FP32 +cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +#build example/main only +#cmake --build . --config Release --target main + +#build all binary +cmake --build . --config Release -v + +``` + +or + +``` +./examples/sycl/build.sh +``` + +Note: + +- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. + +### Run + +1. Put model file to folder **models** + +2. Enable oneAPI running environment + +``` +source /opt/intel/oneapi/setvars.sh +``` + +3. 
List device ID + +Run without parameter: + +``` +./build/bin/ls-sycl-device + +or + +./build/bin/main +``` + +Check the ID in startup log, like: + +``` +found 4 SYCL devices: + Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, + max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 + Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, + max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 + Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + +``` + +|Attribute|Note| +|-|-| +|compute capability 1.3|Level-zero running time, recommended | +|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| + +4. Set device ID and execute whisper.cpp + +Set device ID = 0 by **GGML_SYCL_DEVICE=0** + +``` +GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav +``` +or run by script: + +``` +./examples/sycl/run-whisper.sh +``` + + + +5. Check the device ID in output + +Like: +``` +Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device +``` + + +## Environment Variable + +#### Build + +|Name|Value|Function| +|-|-|-| +|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path.
For FP32/FP16, WHISPER_SYCL=ON is mandatory.| +|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.| +|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path| +|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path| + +#### Running + + +|Name|Value|Function| +|-|-|-| +|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output| +|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| + +## Known Issue + +- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`. + + Miss to enable oneAPI running environment. + + Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`. + + +- Hang during startup + + llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block. + + Solution: add **--no-mmap**. + +## Todo + +- Support to build in Windows. + +- Support multiple cards. 
\ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 1d417724..f59f6b76 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -79,6 +79,9 @@ else() add_subdirectory(talk) add_subdirectory(talk-llama) add_subdirectory(lsp) + if (WHISPER_SYCL) + add_subdirectory(sycl) + endif() endif() add_subdirectory(wchess) diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt new file mode 100644 index 00000000..3b5721f9 --- /dev/null +++ b/examples/sycl/CMakeLists.txt @@ -0,0 +1,9 @@ +# MIT license +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +set(TARGET ls-sycl-device) +add_executable(${TARGET} ls-sycl-device.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) \ No newline at end of file diff --git a/examples/sycl/README.md b/examples/sycl/README.md new file mode 100644 index 00000000..c5d982ff --- /dev/null +++ b/examples/sycl/README.md @@ -0,0 +1,47 @@ +# whisper.cpp/example/sycl + +This example program provides the tools for whisper.cpp for SYCL on Intel GPU. + +## Tool + +|Tool Name| Function|Status| +|-|-|-| +|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support| + +### ls-sycl-device + +List all SYCL devices with ID, compute capability, max work group size, etc. + +1. Build whisper.cpp for SYCL for all targets. + +2. Enable oneAPI running environment + +``` +source /opt/intel/oneapi/setvars.sh +``` + +3. 
Execute + +``` +./build/bin/ls-sycl-device +``` + +Check the ID in startup log, like: + +``` +found 4 SYCL devices: + Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, + max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 + Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, + max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 + Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + +``` + +|Attribute|Note| +|-|-| +|compute capability 1.3|Level-zero running time, recommended | +|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| \ No newline at end of file diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh new file mode 100644 index 00000000..87c3778f --- /dev/null +++ b/examples/sycl/build.sh @@ -0,0 +1,19 @@ +# MIT license +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +mkdir -p build +cd build +source /opt/intel/oneapi/setvars.sh + +#for FP16 +#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference + +#for FP32 +cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +#build example/main only +#cmake --build . --config Release --target main + +#build all binary +cmake --build . 
--config Release -v \ No newline at end of file diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp new file mode 100644 index 00000000..096a5d52 --- /dev/null +++ b/examples/sycl/ls-sycl-device.cpp @@ -0,0 +1,11 @@ +/*MIT license + Copyright (C) 2024 Intel Corporation + SPDX-License-Identifier: MIT +*/ + +#include "ggml-sycl.h" + +int main(int argc, char ** argv) { + ggml_backend_sycl_print_sycl_devices(); + return 0; +} \ No newline at end of file diff --git a/examples/sycl/run-whisper.sh b/examples/sycl/run-whisper.sh new file mode 100644 index 00000000..80f7e75b --- /dev/null +++ b/examples/sycl/run-whisper.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# MIT license +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" +source /opt/intel/oneapi/setvars.sh + +if [ $# -gt 0 ]; then + export GGML_SYCL_DEVICE=$1 +else + export GGML_SYCL_DEVICE=0 +fi +echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE +#export GGML_SYCL_DEBUG=1 +./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav \ No newline at end of file diff --git a/whisper.cpp b/whisper.cpp index 38c827d2..2e0a6e2e 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -12,6 +12,10 @@ #include "ggml-cuda.h" #endif +#ifdef GGML_USE_SYCL +#include "ggml-sycl.h" +#endif + #ifdef WHISPER_USE_OPENVINO #include "openvino/whisper-openvino-encoder.h" #endif @@ -1052,6 +1056,16 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params } #endif +#ifdef GGML_USE_SYCL + if (params.use_gpu) { + WHISPER_LOG_INFO("%s: using SYCL backend\n", __func__); + backend_gpu = ggml_backend_sycl_init(params.gpu_device); + if (!backend_gpu) { + WHISPER_LOG_ERROR("%s: ggml_backend_sycl_init() failed\n", __func__); + } + } +#endif + if (backend_gpu) { return backend_gpu; }