mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-19 04:37:51 +00:00
bench : multi-thread memcpy (#1534)
This commit is contained in:
parent
9ac88f2b57
commit
9befab5ab9
100
whisper.cpp
100
whisper.cpp
@ -6064,7 +6064,9 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|||||||
// 1GB array
|
// 1GB array
|
||||||
const size_t size = arr*1e6;
|
const size_t size = arr*1e6;
|
||||||
|
|
||||||
// single-thread
|
double sum = 0.0;
|
||||||
|
|
||||||
|
// heat-up
|
||||||
{
|
{
|
||||||
char * src = (char *) malloc(size);
|
char * src = (char *) malloc(size);
|
||||||
char * dst = (char *) malloc(size);
|
char * dst = (char *) malloc(size);
|
||||||
@ -6074,7 +6076,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|||||||
memcpy(dst, src, size); // heat-up
|
memcpy(dst, src, size); // heat-up
|
||||||
|
|
||||||
double tsum = 0.0;
|
double tsum = 0.0;
|
||||||
double sum = 0.0;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < n; i++) {
|
for (size_t i = 0; i < n; i++) {
|
||||||
const int64_t t0 = ggml_time_us();
|
const int64_t t0 = ggml_time_us();
|
||||||
@ -6088,21 +6089,108 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|||||||
src[rand() % size] = rand() % 256;
|
src[rand() % size] = rand() % 256;
|
||||||
}
|
}
|
||||||
|
|
||||||
snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
|
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
|
||||||
s += strbuf;
|
s += strbuf;
|
||||||
|
|
||||||
// needed to prevent the compiler from optimizing the memcpy away
|
// needed to prevent the compiler from optimizing the memcpy away
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < size; i++) sum += dst[i];
|
for (size_t i = 0; i < size; i++) sum += dst[i];
|
||||||
|
|
||||||
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
|
|
||||||
s += strbuf;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
free(src);
|
free(src);
|
||||||
free(dst);
|
free(dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// single-thread
|
||||||
|
{
|
||||||
|
char * src = (char *) malloc(size);
|
||||||
|
char * dst = (char *) malloc(size);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; i++) src[i] = i;
|
||||||
|
|
||||||
|
memcpy(dst, src, size); // heat-up
|
||||||
|
|
||||||
|
double tsum = 0.0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
const int64_t t0 = ggml_time_us();
|
||||||
|
|
||||||
|
memcpy(dst, src, size);
|
||||||
|
|
||||||
|
const int64_t t1 = ggml_time_us();
|
||||||
|
|
||||||
|
tsum += (t1 - t0)*1e-6;
|
||||||
|
|
||||||
|
src[rand() % size] = rand() % 256;
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
|
||||||
|
s += strbuf;
|
||||||
|
|
||||||
|
// needed to prevent the compiler from optimizing the memcpy away
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < size; i++) sum += dst[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
free(src);
|
||||||
|
free(dst);
|
||||||
|
}
|
||||||
|
|
||||||
|
// multi-thread
|
||||||
|
|
||||||
|
for (uint32_t n_threads = 1; n_threads <= std::thread::hardware_concurrency(); n_threads++) {
|
||||||
|
char * src = (char *) malloc(size);
|
||||||
|
char * dst = (char *) malloc(size);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; i++) src[i] = i;
|
||||||
|
|
||||||
|
memcpy(dst, src, size); // heat-up
|
||||||
|
|
||||||
|
double tsum = 0.0;
|
||||||
|
|
||||||
|
auto helper = [&](int th) {
|
||||||
|
const int64_t i0 = (th + 0)*size/n_threads;
|
||||||
|
const int64_t i1 = (th + 1)*size/n_threads;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
memcpy(dst + i0, src + i0, i1 - i0);
|
||||||
|
|
||||||
|
src[i0 + rand() % (i1 - i0)] = rand() % 256;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const int64_t t0 = ggml_time_us();
|
||||||
|
|
||||||
|
std::vector<std::thread> threads(n_threads - 1);
|
||||||
|
for (uint32_t th = 0; th < n_threads - 1; ++th) {
|
||||||
|
threads[th] = std::thread(helper, th);
|
||||||
|
}
|
||||||
|
|
||||||
|
helper(n_threads - 1);
|
||||||
|
|
||||||
|
for (uint32_t th = 0; th < n_threads - 1; ++th) {
|
||||||
|
threads[th].join();
|
||||||
|
}
|
||||||
|
|
||||||
|
const int64_t t1 = ggml_time_us();
|
||||||
|
|
||||||
|
tsum += (t1 - t0)*1e-6;
|
||||||
|
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), n_threads);
|
||||||
|
s += strbuf;
|
||||||
|
|
||||||
|
// needed to prevent the compiler from optimizing the memcpy away
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < size; i++) sum += dst[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
free(src);
|
||||||
|
free(dst);
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
|
||||||
|
s += strbuf;
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user