mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-02-21 09:21:49 +00:00
threads: improve ggml_barrier scaling with large number of threads (llama/9598)
Make sure n_barrier and n_barrier_passed do not share a cache line, to avoid cache-line bouncing. This optimization shows performance improvements even for n_threads <= 8 cases. Resurrect the TSAN (Thread Sanitizer) check so that we can avoid doing an expensive read-modify-write in the normal case and just use a thread fence as originally intended.
This commit is contained in:
parent
05c6139625
commit
08e8414f27
@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
#define GGML_CACHE_LINE 64
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
|
||||
#endif
|
||||
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(thread_sanitizer)
|
||||
#define GGML_TSAN_ENABLED 1
|
||||
#endif
|
||||
#else // __has_feature
|
||||
#if defined(__SANITIZE_THREAD__)
|
||||
#define GGML_TSAN_ENABLED 1
|
||||
#endif
|
||||
#endif // __has_feature
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
|
||||
#include <windows.h>
|
||||
|
||||
#if !defined(__clang__)
|
||||
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
|
||||
|
||||
typedef volatile LONG atomic_int;
|
||||
typedef atomic_int atomic_bool;
|
||||
typedef atomic_int atomic_flag;
|
||||
@ -2006,8 +2027,8 @@ struct ggml_threadpool {
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
|
||||
atomic_int n_barrier;
|
||||
atomic_int n_barrier_passed;
|
||||
atomic_int GGML_CACHE_ALIGN n_barrier;
|
||||
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
|
||||
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
||||
|
||||
// these are atomic as an annotation for thread-sanitizer
|
||||
@ -3195,20 +3216,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
|
||||
// enter barrier (full seq-cst fence)
|
||||
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
|
||||
|
||||
int last = 0;
|
||||
if (n_barrier == (n_threads - 1)) {
|
||||
// last thread
|
||||
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
|
||||
last = 1;
|
||||
} else {
|
||||
// wait for other threads
|
||||
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
|
||||
ggml_thread_cpu_relax();
|
||||
}
|
||||
|
||||
// exit barrier (full seq-cst fence)
|
||||
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
|
||||
return;
|
||||
}
|
||||
|
||||
// wait for other threads
|
||||
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
|
||||
ggml_thread_cpu_relax();
|
||||
}
|
||||
|
||||
// exit barrier (full seq-cst fence)
|
||||
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
|
||||
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
|
||||
#ifdef GGML_TSAN_ENABLED
|
||||
atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
|
||||
#else
|
||||
atomic_thread_fence(memory_order_seq_cst);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -20239,10 +20267,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
|
||||
|
||||
// sync thread state after polling
//
// Publishes/acquires all prior writes with a single full seq-cst fence so the
// polling thread observes a consistent threadpool state.
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead.
    // The RMW on n_graph has the same seq-cst ordering effect as the fence, but is
    // visible to the sanitizer's happens-before machinery (and is more expensive,
    // which is why it is only used under TSAN).
    #ifdef GGML_TSAN_ENABLED
    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
    #else
    atomic_thread_fence(memory_order_seq_cst);
    #endif
    UNUSED(state);
}
|
||||
|
||||
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user