mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-09 20:13:14 +00:00
rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (llama/12943)
RPC_CMD_SET_TENSOR always returns an empty response, and we send this command 4 times per token. We can improve token generation (TG) speed if we don't wait for this empty response. The performance impact of this change depends on the network latency.
This commit is contained in:
parent
33bdbfbb33
commit
fe21ddf0dc
@ -7,7 +7,7 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define RPC_PROTO_MAJOR_VERSION 1
|
#define RPC_PROTO_MAJOR_VERSION 2
|
||||||
#define RPC_PROTO_MINOR_VERSION 0
|
#define RPC_PROTO_MINOR_VERSION 0
|
||||||
#define RPC_PROTO_PATCH_VERSION 0
|
#define RPC_PROTO_PATCH_VERSION 0
|
||||||
#define GGML_RPC_MAX_SERVERS 16
|
#define GGML_RPC_MAX_SERVERS 16
|
||||||
|
@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
||||||
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
// No response
|
||||||
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
|
||||||
uint8_t cmd_byte = cmd;
|
uint8_t cmd_byte = cmd;
|
||||||
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
||||||
return false;
|
return false;
|
||||||
@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|||||||
if (!send_data(sock->fd, input, input_size)) {
|
if (!send_data(sock->fd, input, input_size)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
||||||
|
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
||||||
|
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
||||||
|
if (!send_rpc_cmd(sock, cmd, input, input_size)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
||||||
// even if we do, we can skip sending output_size from the server for commands with known output size
|
// even if we do, we can skip sending output_size from the server for commands with known output size
|
||||||
uint64_t out_size;
|
uint64_t out_size;
|
||||||
@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
|||||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
||||||
GGML_ASSERT(status);
|
GGML_ASSERT(status);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|||||||
if (!server.set_tensor(input)) {
|
if (!server.set_tensor(input)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!send_msg(sockfd, nullptr, 0)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case RPC_CMD_SET_TENSOR_HASH: {
|
case RPC_CMD_SET_TENSOR_HASH: {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user