diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 18dfdc64..fb5dd343 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2644,7 +2644,9 @@ void RunServer(const std::string& server_address) { ServerBuilder builder; builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); builder.RegisterService(&service); - + builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB + builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB + builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB std::unique_ptr server(builder.BuildAndStart()); std::cout << "Server listening on " << server_address << std::endl; server->Wait(); diff --git a/backend/python/autogptq/backend.py b/backend/python/autogptq/backend.py index c7c35028..3b5515cb 100755 --- a/backend/python/autogptq/backend.py +++ b/backend/python/autogptq/backend.py @@ -121,7 +121,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return (prompt, image_paths) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/bark/backend.py b/backend/python/bark/backend.py index 050c44ed..49978100 100644 --- a/backend/python/bark/backend.py +++ b/backend/python/bark/backend.py @@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=True) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/coqui/backend.py b/backend/python/coqui/backend.py index 02ab56f4..b602f4de 100644 --- a/backend/python/coqui/backend.py +++ b/backend/python/coqui/backend.py @@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=True) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/diffusers/backend.py b/backend/python/diffusers/backend.py index 3668b016..7d6a2a17 100755 --- a/backend/python/diffusers/backend.py +++ b/backend/python/diffusers/backend.py @@ -522,7 +522,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/exllama2/backend.py b/backend/python/exllama2/backend.py index cb21ed7e..7aacea36 100755 --- a/backend/python/exllama2/backend.py +++ b/backend/python/exllama2/backend.py @@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/faster-whisper/backend.py b/backend/python/faster-whisper/backend.py index dbb8b3d9..b73664ab 100755 --- a/backend/python/faster-whisper/backend.py +++ b/backend/python/faster-whisper/backend.py @@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.TranscriptResult(segments=resultSegments, text=text) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/kokoro/backend.py b/backend/python/kokoro/backend.py index 1fd1feb9..76688185 100755 --- a/backend/python/kokoro/backend.py +++ b/backend/python/kokoro/backend.py @@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=True) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/rerankers/backend.py b/backend/python/rerankers/backend.py index e1974ad5..c9a80eab 100755 --- a/backend/python/rerankers/backend.py +++ b/backend/python/rerankers/backend.py @@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.RerankResult(usage=usage, results=results) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py index b0d5875b..88b410e5 100644 --- a/backend/python/transformers/backend.py +++ b/backend/python/transformers/backend.py @@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): async def serve(address): # Start asyncio gRPC server - server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) # Add the servicer to the server backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) # Bind the server to the address diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 238ba0e3..1ccf6d2a 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -320,7 +320,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): async def serve(address): # Start asyncio gRPC server - server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) # Add the servicer to the server backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) # Bind the server to the address diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go index ca207c3f..fe4dcde4 100644 --- a/pkg/grpc/client.go +++ b/pkg/grpc/client.go @@ -57,7 +57,11 @@ func (c *Client) HealthCheck(ctx context.Context) (bool, error) { } c.setBusy(true) defer c.setBusy(false) - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return false, err } @@ -89,7 +93,11 @@ func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ... defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -108,7 +116,11 @@ func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grp defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -127,7 +139,11 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -145,7 +161,11 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return err } @@ -182,7 +202,11 @@ func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -200,7 +224,11 @@ func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOp defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -218,7 +246,11 @@ func (c *Client) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequ defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -236,7 +268,11 @@ func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptReques defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -254,7 +290,11 @@ func (c *Client) TokenizeString(ctx context.Context, in *pb.PredictOptions, opts defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -276,7 +316,11 @@ func (c *Client) Status(ctx context.Context) (*pb.StatusResponse, error) { } c.setBusy(true) defer c.setBusy(false) - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -294,7 +338,11 @@ func (c *Client) StoresSet(ctx context.Context, in *pb.StoresSetOptions, opts .. defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -312,7 +360,11 @@ func (c *Client) StoresDelete(ctx context.Context, in *pb.StoresDeleteOptions, o defer c.wdUnMark() c.setBusy(true) defer c.setBusy(false) - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -330,7 +382,11 @@ func (c *Client) StoresGet(ctx context.Context, in *pb.StoresGetOptions, opts .. defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -348,7 +404,11 @@ func (c *Client) StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -366,7 +426,11 @@ func (c *Client) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc. defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -384,7 +448,11 @@ func (c *Client) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opt defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } @@ -402,7 +470,11 @@ func (c *Client) VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOp defer c.setBusy(false) c.wdMark() defer c.wdUnMark() - conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB + )) if err != nil { return nil, err } diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go index 0b2a167f..b81c2c3a 100644 --- a/pkg/grpc/server.go +++ b/pkg/grpc/server.go @@ -244,7 +244,10 @@ func StartServer(address string, model LLM) error { if err != nil { return err } - s := grpc.NewServer() + s := grpc.NewServer( + grpc.MaxRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxSendMsgSize(50*1024*1024), // 50MB + ) pb.RegisterBackendServer(s, &server{llm: model}) log.Printf("gRPC Server listening at %v", lis.Addr()) if err := s.Serve(lis); err != nil { @@ -259,7 +262,10 @@ func RunServer(address string, model LLM) (func() error, error) { if err != nil { return nil, err } - s := grpc.NewServer() + s := grpc.NewServer( + grpc.MaxRecvMsgSize(50*1024*1024), // 50MB + grpc.MaxSendMsgSize(50*1024*1024), // 50MB + ) pb.RegisterBackendServer(s, &server{llm: model}) log.Printf("gRPC Server listening at %v", lis.Addr()) if err = s.Serve(lis); err != nil {