From fe21ddf0dcaf4af68694b8cae8608278266be20c Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Fri, 25 Apr 2025 10:08:08 +0300 Subject: [PATCH] rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (llama/12943) RPC_CMD_SET_TENSOR always returns an empty response and we send this 4 times per token. We can improve TG speed if we don't wait for this empty response. The performance impact of this change depends on the network latency. --- ggml/include/ggml-rpc.h | 2 +- ggml/src/ggml-rpc/ggml-rpc.cpp | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index c8b6097f..1e674112 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -7,7 +7,7 @@ extern "C" { #endif -#define RPC_PROTO_MAJOR_VERSION 1 +#define RPC_PROTO_MAJOR_VERSION 2 #define RPC_PROTO_MINOR_VERSION 0 #define RPC_PROTO_PATCH_VERSION 0 #define GGML_RPC_MAX_SERVERS 16 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index a0667b7d..9023eb09 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int } // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | -// RPC response: | response_size (8 bytes) | response_data (response_size bytes) | -static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) { +// No response +static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size) { uint8_t cmd_byte = cmd; if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) { return false; @@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cm if (!send_data(sock->fd, input, input_size)) { return false; } + return true; +} + +// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | +// RPC response: | response_size (8 bytes) | response_data (response_size bytes) | +static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) { + if (!send_rpc_cmd(sock, cmd, input, input_size)) { + return false; + } // TODO: currently the output_size is always known, do we need support for commands with variable output size? // even if we do, we can skip sending output_size from the server for commands with known output size uint64_t out_size; @@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size); - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0); + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size()); GGML_ASSERT(status); } @@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, if (!server.set_tensor(input)) { return; } - if (!send_msg(sockfd, nullptr, 0)) { - return; - } break; } case RPC_CMD_SET_TENSOR_HASH: {