RPC server: buffer response messages until the client queue is fully set up.

The issue arises when the server restarts: the client is sometimes unable to recreate its queue in time, so the server cannot send back the response message and simply drops it, causing the client to hang.
2025-06-21 16:49:45 +00:00 · 2017-06-15 15:19:33 +01:00
parent 4f16512dcf
commit aaf7de0d02
5 changed files with 157 additions and 103 deletions
--- a/client/rpc/src/integration-test/kotlin/net/corda/client/rpc/RPCStabilityTests.kt
+++ b/client/rpc/src/integration-test/kotlin/net/corda/client/rpc/RPCStabilityTests.kt
@ -5,7 +5,6 @@ import com.esotericsoftware.kryo.Serializer
 import com.esotericsoftware.kryo.io.Input
 import com.esotericsoftware.kryo.io.Output
 import com.esotericsoftware.kryo.pool.KryoPool
-import com.google.common.base.Stopwatch
 import com.google.common.net.HostAndPort
 import com.google.common.util.concurrent.Futures
 import net.corda.client.rpc.internal.RPCClient
@ -17,7 +16,6 @@ import net.corda.node.services.messaging.RPCServerConfiguration
 import net.corda.nodeapi.RPCApi
 import net.corda.nodeapi.RPCKryo
 import net.corda.testing.*
-import org.apache.activemq.artemis.ArtemisConstants
 import org.apache.activemq.artemis.api.core.SimpleString
 import org.junit.Assert.assertEquals
 import org.junit.Assert.assertTrue
@ -28,8 +26,6 @@ import rx.subjects.UnicastSubject
 import java.time.Duration
 import java.util.concurrent.*
 import java.util.concurrent.atomic.AtomicInteger
-import kotlin.concurrent.thread
-import kotlin.test.fail

 class RPCStabilityTests {

@ -218,65 +214,27 @@ class RPCStabilityTests {

    @Test
    fun `client reconnects to rebooted server`() {
-        // TODO: Remove multiple trials when we fix the Artemis bug (which should have its own test(s)).
-        if (ArtemisConstants::class.java.`package`.implementationVersion == "1.5.3") {
-            // The test fails maybe 1 in 100 times, so to stay green until we upgrade Artemis, retry if it fails:
-            for (i in (1..3)) {
-                try {
-                    `client reconnects to rebooted server`(1)
-                } catch (e: TimeoutException) {
-                    continue
-                }
-                return
-            }
-            fail("Test failed 3 times, which is vanishingly unlikely unless something has changed.")
-        } else {
-            // We've upgraded Artemis so make the test fail reliably, in the 2.1.0 case that takes 25 trials:
-            `client reconnects to rebooted server`(25)
-        }
-    }
-
-    private fun `client reconnects to rebooted server`(trials: Int) {
        rpcDriver {
-            val coreBurner = thread {
-                while (!Thread.interrupted()) {
-                    // Spin.
-                }
+            val ops = object : ReconnectOps {
+                override val protocolVersion = 0
+                override fun ping() = "pong"
            }
-            try {
-                val ops = object : ReconnectOps {
-                    override val protocolVersion = 0
-                    override fun ping() = "pong"
-                }
-                var serverFollower = shutdownManager.follower()
-                val serverPort = startRpcServer<ReconnectOps>(ops = ops).getOrThrow().broker.hostAndPort!!
-                serverFollower.unfollow()
-                val clientFollower = shutdownManager.follower()
-                val client = startRpcClient<ReconnectOps>(serverPort).getOrThrow()
-                clientFollower.unfollow()
-                assertEquals("pong", client.ping())
-                val background = Executors.newSingleThreadExecutor()
-                (1..trials).forEach {
-                    System.err.println("Start trial $it of $trials.")
-                    serverFollower.shutdown()
-                    serverFollower = shutdownManager.follower()
-                    startRpcServer<ReconnectOps>(ops = ops, customPort = serverPort).getOrThrow()
-                    serverFollower.unfollow()
-                    val stopwatch = Stopwatch.createStarted()
-                    val pingFuture = background.submit(Callable {
-                        client.ping() // Would also hang in foreground, we need it in background so we can timeout.
-                    })
-                    assertEquals("pong", pingFuture.getOrThrow(10.seconds))
-                    System.err.println("Took ${stopwatch.elapsed(TimeUnit.MILLISECONDS)} millis.")
-                }
-                background.shutdown() // No point in the hanging case.
-                clientFollower.shutdown() // Driver would do this after the current server, causing 'legit' failover hang.
-            } finally {
-                with(coreBurner) {
-                    interrupt()
-                    join()
-                }
+            val serverFollower = shutdownManager.follower()
+            val serverPort = startRpcServer<ReconnectOps>(ops = ops).getOrThrow().broker.hostAndPort!!
+            serverFollower.unfollow()
+            // Set retry interval to 1s to reduce test duration
+            val clientConfiguration = RPCClientConfiguration.default.copy(connectionRetryInterval = 1.seconds)
+            val clientFollower = shutdownManager.follower()
+            val client = startRpcClient<ReconnectOps>(serverPort, configuration = clientConfiguration).getOrThrow()
+            clientFollower.unfollow()
+            assertEquals("pong", client.ping())
+            serverFollower.shutdown()
+            startRpcServer<ReconnectOps>(ops = ops, customPort = serverPort).getOrThrow()
+            val pingFuture = future {
+                client.ping()
            }
+            assertEquals("pong", pingFuture.getOrThrow(10.seconds))
+            clientFollower.shutdown() // Driver would do this after the new server, causing hang.
        }
    }