HIP: Prepare reduction operators for wave 64

Author: uvos, 2025-01-29 19:12:42 +01:00; committed by Georgi Gerganov
parent f41fdad200
commit fc2e44490d
2 changed files with 28 additions and 35 deletions
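
For orientation, here is a minimal, self-contained sketch of the pattern the diff below introduces: the warp reduction helpers gain a `width` template parameter that defaults to `WARP_SIZE`, so existing call sites are unchanged, while wave64 (HIP) code can later instantiate them with a width of 64. Everything outside the `warp_reduce_sum` body (the `WARP_SIZE` value, the demo kernel, and `main`) is illustrative only and not part of this commit.

#include <cuda_runtime.h>
#include <cstdio>

#define WARP_SIZE 32 // illustrative; a wave64 AMD device would report 64

// Width-templated butterfly reduction, mirroring the helper in the diff below.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}

// Demo kernel (hypothetical): every lane contributes 1.0f, so after the
// reduction each lane holds the reduction width.
__global__ void reduce_demo(float * out) {
    const float sum = warp_reduce_sum(1.0f);        // default width = WARP_SIZE
    // const float sum = warp_reduce_sum<64>(1.0f); // wave64 instantiation on HIP
    if (threadIdx.x == 0) {
        *out = sum;
    }
}

int main() {
    float * d_out = nullptr;
    cudaMalloc(&d_out, sizeof(float));
    reduce_demo<<<1, WARP_SIZE>>>(d_out);
    float h_out = 0.0f;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %g (expected %d)\n", h_out, WARP_SIZE);
    cudaFree(d_out);
    return 0;
}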


@@ -190,53 +190,46 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
     }
     return a;
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
-        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
     }
     return a;
-#else
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
-    }
-    return a;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

 #else
     NO_DEVICE_CODE;
@@ -244,10 +237,11 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 }
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }

 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
     reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
 #else
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }

 #if CUDART_VERSION < CUDART_HMASK
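
Note that these hunks only parameterize the reducers; nothing in this commit selects a 64-lane width yet. As a rough sketch of how a later change could wire it up (assumptions: the GGML_EXAMPLE_WAVE64 flag and the wave_max wrapper are invented for this example and are not ggml code), a wave64 build would pin the width at compile time and pass it through to the templated helpers:

// Hypothetical compile-time width selection for a wave64 HIP build.
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(GGML_EXAMPLE_WAVE64)
static constexpr int reduce_width = 64; // one AMD wavefront (CDNA/GCN)
#else
static constexpr int reduce_width = 32; // one CUDA warp (or RDNA wave32)
#endif

// Reduce a per-thread value across one warp/wavefront using the helpers above.
static __device__ __forceinline__ float wave_max(float x) {
    return warp_reduce_max<reduce_width>(x);
}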


@@ -240,8 +240,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;