SYCL: Add non contiguous support in RMS_NORM and NORM kernels (llama/13611)

* SYCL: Add non contiguous input support to norm kernel * refactor and add RMS_NORM non contiguous input support ggml-ci * restore subgroup reduction for multi-subgroup thread blocks in norm kernels * Swap grid dims of nsamples and nrows ggml-ci * Revert "Swap grid dims of nsamples and nrows" This reverts commit 43be2d657fec7f7fba54e2cd154106bc0fc45adf. * restore not required changes ggml-ci * address review comments: change it to more like SYCL * Use a common function to calculate offset * remove wrap around logic for handling broadcasts * remove static from calculate_offset fn and use ceil_div
2025-06-22 16:38:58 +00:00 · 2025-05-26 21:10:36 +05:30
parent 25e27904ca
commit 195fde8804
3 changed files with 109 additions and 67 deletions
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@ -13,6 +13,7 @@
 #ifndef GGML_SYCL_COMMON_HPP
 #define GGML_SYCL_COMMON_HPP

+#include <cstddef>
 #include <fstream>
 #include <iostream>
 #include <string>
@ -481,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
    return x;
 }

+/* Helper for Computing the linear offset of a ggml_tensor given
+per-dimension sizes, strides, and indices */
+template<int N>
+__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
+    size_t offset = 0;
+#pragma unroll
+    for (int i = 0; i < N; i++) {
+        auto index_i = indices[i];
+        offset += strides[i] * index_i;
+    }
+    return offset;
+}
+
 // Helper for vec loading aligned data
 template <typename Tp, int n>
 inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {