Mirror of https://github.com/corda/corda.git (synced 2024-12-19 04:57:58 +00:00)
CORDA-3901 Eliminate random reconnect test (#6446)
Remove a legacy test for RPC reconnection, which takes about 5 minutes to run a randomised set of failure scenarios. This is expensive and low value.
parent 2fa6b5a208
commit 5d7060ec3a
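For context, the removed test exercised the client-side graceful reconnection support that its DOCSTART rpcReconnectingRPC block (still visible in the diff below) was documenting. A minimal sketch of that pattern, assuming a locally running node, a placeholder port (10006) and placeholder "demo"/"demo" credentials, and using only the calls that appear in the removed test itself:

import net.corda.client.rpc.CordaRPCClient
import net.corda.client.rpc.CordaRPCClientConfiguration
import net.corda.client.rpc.GracefulReconnect
import net.corda.core.utilities.NetworkHostAndPort
import net.corda.core.utilities.seconds

fun main() {
    // Placeholder endpoint and credentials, for illustration only.
    val addresses = listOf(NetworkHostAndPort("localhost", 10006))
    // Retry settings mirror the ones used in the removed test.
    val config = CordaRPCClientConfiguration.DEFAULT.copy(
            connectionRetryInterval = 1.seconds,
            connectionRetryIntervalMultiplier = 1.0
    )
    // GracefulReconnect makes the client re-attach after a node restart or connection reset.
    val reconnect = GracefulReconnect(
            onDisconnect = { println("RPC disconnected") },
            onReconnect = { println("RPC reconnected") }
    )
    val connection = CordaRPCClient(addresses, configuration = config)
            .start("demo", "demo", gracefulReconnect = reconnect)
    try {
        // Calls made through the proxy are retried across reconnects where possible.
        println(connection.proxy.nodeInfo())
    } finally {
        connection.close()
    }
}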
@@ -1,355 +0,0 @@
package net.corda.node.services.rpc

import net.corda.client.rpc.CordaRPCClient
import net.corda.client.rpc.CordaRPCClientConfiguration
import net.corda.client.rpc.GracefulReconnect
import net.corda.client.rpc.internal.ReconnectingCordaRPCOps
import net.corda.client.rpc.notUsed
import net.corda.core.contracts.Amount
import net.corda.core.flows.StateMachineRunId
import net.corda.core.internal.concurrent.transpose
import net.corda.core.messaging.StateMachineUpdate
import net.corda.core.node.services.Vault
import net.corda.core.node.services.vault.PageSpecification
import net.corda.core.node.services.vault.QueryCriteria
import net.corda.core.node.services.vault.builder
import net.corda.core.utilities.NetworkHostAndPort
import net.corda.core.utilities.OpaqueBytes
import net.corda.core.utilities.contextLogger
import net.corda.core.utilities.getOrThrow
import net.corda.core.utilities.seconds
import net.corda.finance.contracts.asset.Cash
import net.corda.finance.flows.CashIssueAndPaymentFlow
import net.corda.finance.schemas.CashSchemaV1
import net.corda.node.services.Permissions
import net.corda.node.services.rpc.RpcReconnectTests.Companion.NUMBER_OF_FLOWS_TO_RUN
import net.corda.testing.core.DUMMY_BANK_A_NAME
import net.corda.testing.core.DUMMY_BANK_B_NAME
import net.corda.testing.driver.DriverParameters
import net.corda.testing.driver.NodeHandle
import net.corda.testing.driver.OutOfProcess
import net.corda.testing.driver.driver
import net.corda.testing.driver.internal.OutOfProcessImpl
import net.corda.testing.driver.internal.incrementalPortAllocation
import net.corda.testing.node.User
import net.corda.testing.node.internal.FINANCE_CORDAPPS
import org.assertj.core.api.Assertions.assertThat
import org.junit.Test
import java.util.*
import java.util.concurrent.CountDownLatch
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger
import kotlin.concurrent.thread
import kotlin.math.absoluteValue
import kotlin.math.max
import kotlin.test.assertEquals
import kotlin.test.assertTrue
import kotlin.test.currentStackTrace

/**
 * This is a stress test for the RPC reconnection logic, which triggers failures in a probabilistic way.
 *
 * You can adjust the variable [NUMBER_OF_FLOWS_TO_RUN] to adjust the number of flows to run and the duration of the test.
 */
class RpcReconnectTests {

    companion object {
        // this many flows take ~5 minutes
        const val NUMBER_OF_FLOWS_TO_RUN = 100

        private val log = contextLogger()
    }

    private val portAllocator = incrementalPortAllocation()

    private lateinit var proxy: RandomFailingProxy
    private lateinit var node: NodeHandle
    private lateinit var currentAddressPair: AddressPair

    /**
     * This test showcases and stress tests the demo [ReconnectingCordaRPCOps].
     *
     * Note that during node failure, events can be lost and starting flows can become unreliable.
     * The only available way to retry failed flows is to attempt a "logical retry", which is also showcased.
     *
     * This test runs flows in a loop and in the background kills the node or restarts it.
     * Also the RPC connection is made through a proxy that introduces random latencies and is also periodically killed.
     */
    @Suppress("ComplexMethod")
    @Test(timeout=420_000)
    fun `test that the RPC client is able to reconnect and proceed after node failure, restart, or connection reset`() {
        val nodeRunningTime = { Random().nextInt(12000) + 8000 }

        val demoUser = User("demo", "demo", setOf(Permissions.all()))

        // When this reaches 0, the test will end.
        val flowsCountdownLatch = CountDownLatch(NUMBER_OF_FLOWS_TO_RUN)
        // These are the expected progress steps for the CashIssueAndPaymentFlow.
        val expectedProgress = listOf(
                "Starting",
                "Issuing cash",
                "Generating transaction",
                "Signing transaction",
                "Finalising transaction",
                "Broadcasting transaction to participants",
                "Paying recipient",
                "Generating anonymous identities",
                "Generating transaction",
                "Signing transaction",
                "Finalising transaction",
                "Requesting signature by notary service",
                "Requesting signature by Notary service",
                "Validating response from Notary service",
                "Broadcasting transaction to participants",
                "Done"
        )

        driver(DriverParameters(cordappsForAllNodes = FINANCE_CORDAPPS, startNodesInProcess = false, inMemoryDB = false)) {
            fun startBankA(address: NetworkHostAndPort) = startNode(providedName = DUMMY_BANK_A_NAME, rpcUsers = listOf(demoUser), customOverrides = mapOf("rpcSettings.address" to address.toString()))
            fun startProxy(addressPair: AddressPair) = RandomFailingProxy(serverPort = addressPair.proxyAddress.port, remotePort = addressPair.nodeAddress.port).start()

            val addresses = (1..2).map { getRandomAddressPair() }
            currentAddressPair = addresses[0]

            proxy = startProxy(currentAddressPair)
            val (bankA, bankB) = listOf(
                    startBankA(currentAddressPair.nodeAddress),
                    startNode(providedName = DUMMY_BANK_B_NAME, rpcUsers = listOf(demoUser))
            ).transpose().getOrThrow()
            node = bankA

            val notary = defaultNotaryIdentity
            val baseAmount = Amount.parseCurrency("0 USD")
            val issuerRef = OpaqueBytes.of(0x01)

            var numDisconnects = 0
            var numReconnects = 0
            val maxStackOccurrences = AtomicInteger()

            val addressesForRpc = addresses.map { it.proxyAddress }
            // DOCSTART rpcReconnectingRPC
            val onReconnect = {
                numReconnects++
                // We only expect to see a single reconnectOnError in the stack trace. Otherwise we're in danger of stack overflow recursion.
                maxStackOccurrences.set(max(maxStackOccurrences.get(), currentStackTrace().count { it.methodName == "reconnectOnError" }))
                Unit
            }
            val reconnect = GracefulReconnect(onDisconnect = { numDisconnects++ }, onReconnect = onReconnect)
            val config = CordaRPCClientConfiguration.DEFAULT.copy(
                    connectionRetryInterval = 1.seconds,
                    connectionRetryIntervalMultiplier = 1.0
            )
            val client = CordaRPCClient(addressesForRpc, configuration = config)
            val bankAReconnectingRPCConnection = client.start(demoUser.username, demoUser.password, gracefulReconnect = reconnect)
            val bankAReconnectingRpc = bankAReconnectingRPCConnection.proxy as ReconnectingCordaRPCOps
            // DOCEND rpcReconnectingRPC

            // Observe the vault and collect the observations.
            val vaultEvents = Collections.synchronizedList(mutableListOf<Vault.Update<Cash.State>>())
            // DOCSTART rpcReconnectingRPCVaultTracking
            val vaultFeed = bankAReconnectingRpc.vaultTrackByWithPagingSpec(
                    Cash.State::class.java,
                    QueryCriteria.VaultQueryCriteria(),
                    PageSpecification(1, 1))
            val vaultSubscription = vaultFeed.updates.subscribe { update: Vault.Update<Cash.State> ->
                log.info("vault update produced ${update.produced.map { it.state.data.amount }} consumed ${update.consumed.map { it.ref }}")
                vaultEvents.add(update)
            }
            // DOCEND rpcReconnectingRPCVaultTracking

            // Observe the stateMachine and collect the observations.
            val stateMachineEvents = Collections.synchronizedList(mutableListOf<StateMachineUpdate>())
            val stateMachineSubscription = bankAReconnectingRpc.stateMachinesFeed().updates.subscribe { update ->
                log.info(update.toString())
                stateMachineEvents.add(update)
            }

            // While the flows are running, randomly apply a different failure scenario.
            val nrRestarts = AtomicInteger()
            thread(name = "Node killer") {
                while (true) {
                    if (flowsCountdownLatch.count == 0L) break

                    // Let the node run for a random time interval.
                    nodeRunningTime().also { ms ->
                        log.info("Running node for ${ms / 1000} s.")
                        Thread.sleep(ms.toLong())
                    }

                    if (flowsCountdownLatch.count == 0L) break
                    when (Random().nextInt().rem(7).absoluteValue) {
                        0 -> {
                            log.info("Forcefully killing node and proxy.")
                            (node as OutOfProcessImpl).onStopCallback()
                            (node as OutOfProcess).process.destroyForcibly()
                            proxy.stop()
                            node = startBankA(currentAddressPair.nodeAddress).get()
                            proxy.start()
                        }
                        1 -> {
                            log.info("Forcefully killing node.")
                            (node as OutOfProcessImpl).onStopCallback()
                            (node as OutOfProcess).process.destroyForcibly()
                            node = startBankA(currentAddressPair.nodeAddress).get()
                        }
                        2 -> {
                            log.info("Shutting down node.")
                            node.stop()
                            proxy.stop()
                            node = startBankA(currentAddressPair.nodeAddress).get()
                            proxy.start()
                        }
                        3, 4 -> {
                            log.info("Killing proxy.")
                            proxy.stop()
                            Thread.sleep(Random().nextInt(5000).toLong())
                            proxy.start()
                        }
                        5 -> {
                            log.info("Dropping connection.")
                            proxy.failConnection()
                        }
                        6 -> {
                            log.info("Performing failover to a different node")
                            node.stop()
                            proxy.stop()
                            currentAddressPair = (addresses - currentAddressPair).first()
                            node = startBankA(currentAddressPair.nodeAddress).get()
                            proxy = startProxy(currentAddressPair)
                        }
                    }
                    nrRestarts.incrementAndGet()
                }
            }

            // Start NUMBER_OF_FLOWS_TO_RUN flows and provide a logical retry function that checks the vault.
            val flowProgressEvents = mutableMapOf<StateMachineRunId, MutableList<String>>()
            for (amount in (1..NUMBER_OF_FLOWS_TO_RUN)) {
                // DOCSTART rpcReconnectingRPCFlowStarting
                bankAReconnectingRpc.runFlowWithLogicalRetry(
                        runFlow = { rpc ->
                            log.info("Starting CashIssueAndPaymentFlow for $amount")
                            val flowHandle = rpc.startTrackedFlowDynamic(
                                    CashIssueAndPaymentFlow::class.java,
                                    baseAmount.plus(Amount.parseCurrency("$amount USD")),
                                    issuerRef,
                                    bankB.nodeInfo.legalIdentities.first(),
                                    false,
                                    notary
                            )
                            val flowId = flowHandle.id
                            log.info("Started flow $amount with flowId: $flowId")
                            flowProgressEvents.addEvent(flowId, null)

                            flowHandle.stepsTreeFeed?.updates?.notUsed()
                            flowHandle.stepsTreeIndexFeed?.updates?.notUsed()
                            // No reconnecting possible.
                            flowHandle.progress.subscribe(
                                    { prog ->
                                        flowProgressEvents.addEvent(flowId, prog)
                                        log.info("Progress $flowId : $prog")
                                    },
                                    { error ->
                                        log.error("Error thrown in the flow progress observer", error)
                                    })
                            flowHandle.id
                        },
                        hasFlowStarted = { rpc ->
                            // Query for a state that is the result of this flow.
                            val criteria = QueryCriteria.VaultCustomQueryCriteria(builder { CashSchemaV1.PersistentCashState::pennies.equal(amount.toLong() * 100) }, status = Vault.StateStatus.ALL)
                            val results = rpc.vaultQueryByCriteria(criteria, Cash.State::class.java)
                            log.info("$amount - Found states ${results.states}")
                            // The flow has completed if a state is found.
                            results.states.isNotEmpty()
                        },
                        onFlowConfirmed = {
                            flowsCountdownLatch.countDown()
                            log.info("Flow started for $amount. Remaining flows: ${flowsCountdownLatch.count}")
                        }
                )
                // DOCEND rpcReconnectingRPCFlowStarting

                Thread.sleep(Random().nextInt(250).toLong())
            }

            log.info("Started all flows")

            // Wait until all flows have been started.
            val flowsConfirmed = flowsCountdownLatch.await(10, TimeUnit.MINUTES)

            if (flowsConfirmed) {
                log.info("Confirmed all flows have started.")
            } else {
                log.info("Timed out waiting for confirmation that all flows have started. Remaining flows: ${flowsCountdownLatch.count}")
            }

            // Wait for all events to come in and flows to finish.
            Thread.sleep(4000)

            val nrFailures = nrRestarts.get()
            log.info("Checking results after $nrFailures restarts.")

            // We should get one disconnect and one reconnect for each failure.
            assertThat(numDisconnects).isEqualTo(numReconnects)
            assertThat(numReconnects).isLessThanOrEqualTo(nrFailures)
            assertThat(maxStackOccurrences.get()).isLessThan(2)

            // Query the vault and check that states were created for all flows.
            fun readCashStates() = bankAReconnectingRpc
                    .vaultQueryByWithPagingSpec(Cash.State::class.java, QueryCriteria.VaultQueryCriteria(status = Vault.StateStatus.CONSUMED), PageSpecification(1, 10000))
                    .states

            var allCashStates = readCashStates()
            var nrRetries = 0

            // It might be necessary to wait more for all events to arrive when the node is slow.
            while (allCashStates.size < NUMBER_OF_FLOWS_TO_RUN && nrRetries++ < 50) {
                Thread.sleep(2000)
                allCashStates = readCashStates()
            }

            val allCash = allCashStates.map { it.state.data.amount.quantity }.toSet()
            val missingCash = (1..NUMBER_OF_FLOWS_TO_RUN).filterNot { allCash.contains(it.toLong() * 100) }
            log.info("Missing cash states: $missingCash")

            assertEquals(NUMBER_OF_FLOWS_TO_RUN, allCashStates.size, "Not all flows were executed successfully")

            // The progress status for each flow can only miss the last events, because the node might have been killed.
            val missingProgressEvents = flowProgressEvents.filterValues { expectedProgress.subList(0, it.size) != it }
            assertTrue(missingProgressEvents.isEmpty(), "The flow progress tracker is missing events: $missingProgressEvents")

            // DOCSTART missingVaultEvents
            // Check that enough vault events were received.
            // This check is fuzzy because events can go missing during node restarts.
            // Ideally there should be NUMBER_OF_FLOWS_TO_RUN events received, but some might get lost for each restart.
            assertThat(vaultEvents!!.size + nrFailures * 3).isGreaterThanOrEqualTo(NUMBER_OF_FLOWS_TO_RUN)
            // DOCEND missingVaultEvents

            // Check that no flow was triggered twice.
            val duplicates = allCashStates.groupBy { it.state.data.amount }.filterValues { it.size > 1 }
            assertTrue(duplicates.isEmpty(), "${duplicates.size} flows were retried illegally.")

            log.info("State machine events seen: ${stateMachineEvents!!.size}")
            // State machine events are very likely to get lost more often because they seem to be sent with a delay.
            assertThat(stateMachineEvents.count { it is StateMachineUpdate.Added }).isGreaterThanOrEqualTo(NUMBER_OF_FLOWS_TO_RUN / 3)
            assertThat(stateMachineEvents.count { it is StateMachineUpdate.Removed }).isGreaterThanOrEqualTo(NUMBER_OF_FLOWS_TO_RUN / 3)

            // Stop the observers.
            vaultSubscription.unsubscribe()
            stateMachineSubscription.unsubscribe()
            bankAReconnectingRPCConnection.close()
        }

        proxy.close()
    }

    @Synchronized
    fun MutableMap<StateMachineRunId, MutableList<String>>.addEvent(id: StateMachineRunId, progress: String?): Boolean {
        return getOrPut(id) { mutableListOf() }.let { if (progress != null) it.add(progress) else false }
    }
    private fun getRandomAddressPair() = AddressPair(getRandomAddress(), getRandomAddress())
    private fun getRandomAddress() = NetworkHostAndPort("localhost", portAllocator.nextPort())

    data class AddressPair(val proxyAddress: NetworkHostAndPort, val nodeAddress: NetworkHostAndPort)
}