Mirror of https://github.com/corda/corda.git (synced 2025-05-31 22:50:53 +00:00)
CORDA-3841 Check isAnyCheckpointPersisted in startFlowInternal (#6351)
Only hit the database if `StateMachineState.isAnyCheckpointPersisted` returns true. Otherwise, there will be no checkpoint to retrieve from the database anyway. This can prevent errors due to a transient loss of connection to the database.
This commit is contained in:
parent 4b63bab4fc
commit 7ab6a8f600
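The behavioural change sits in SingleThreadedStateMachineManager.startFlowInternal (see the final hunks below): instead of asking the database for a checkpoint whenever a flow with the same id already exists, the manager first consults the in-memory StateMachineState.isAnyCheckpointPersisted flag and only then calls CheckpointStorage.getCheckpoint. A flow that never persisted a checkpoint is therefore no longer exposed to a transient SQLTransientConnectionException while being retried. The following is a minimal, self-contained Kotlin sketch of that guard; the types and helper names here (CheckpointStore, ExistingFlow, loadExistingCheckpoint) are illustrative stand-ins for this page, not the node's real classes.

import java.sql.SQLTransientConnectionException

// Illustrative stand-ins for the node's real types.
data class StateMachineState(val isAnyCheckpointPersisted: Boolean)
class ExistingFlow(val state: StateMachineState)

class CheckpointStore(private val rows: Map<String, ByteArray>) {
    // Stands in for CheckpointStorage.getCheckpoint, which may fail transiently.
    fun getCheckpoint(flowId: String): ByteArray? {
        if (flowId == "flaky") throw SQLTransientConnectionException("Connection is not available")
        return rows[flowId]
    }
}

// Only hit the database when a checkpoint was actually persisted for this flow.
fun loadExistingCheckpoint(flows: Map<String, ExistingFlow>, store: CheckpointStore, flowId: String): ByteArray? {
    val existingFlow = flows[flowId]
    return if (existingFlow != null && existingFlow.state.isAnyCheckpointPersisted) {
        store.getCheckpoint(flowId) // safe to query: a checkpoint should exist
    } else {
        null // brand new flow, or one that never persisted a checkpoint: skip the database entirely
    }
}

fun main() {
    val store = CheckpointStore(mapOf("persisted" to byteArrayOf(1)))
    val flows = mapOf(
        "persisted" to ExistingFlow(StateMachineState(isAnyCheckpointPersisted = true)),
        "flaky" to ExistingFlow(StateMachineState(isAnyCheckpointPersisted = false))
    )
    println(loadExistingCheckpoint(flows, store, "persisted")?.size) // 1
    println(loadExistingCheckpoint(flows, store, "flaky"))           // null: the database is never touched
}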
@@ -4,6 +4,7 @@ import net.corda.client.rpc.CordaRPCClient
import net.corda.core.messaging.startFlow
import net.corda.core.utilities.getOrThrow
import net.corda.core.utilities.seconds
import net.corda.node.services.api.CheckpointStorage
import net.corda.node.services.messaging.DeduplicationHandler
import net.corda.node.services.statemachine.transitions.TopLevelTransition
import net.corda.testing.core.ALICE_NAME
@@ -11,6 +12,8 @@ import net.corda.testing.core.CHARLIE_NAME
import net.corda.testing.core.singleIdentity
import org.junit.Ignore
import org.junit.Test
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.concurrent.TimeoutException
import kotlin.test.assertEquals
import kotlin.test.assertFailsWith
@@ -18,6 +21,10 @@ import kotlin.test.assertFailsWith
@Suppress("MaxLineLength") // Byteman rules cannot be easily wrapped
class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {

private companion object {
val executor: ExecutorService = Executors.newSingleThreadExecutor()
}

/**
* Throws an exception when performing an [Action.SendInitial] action.
* The exception is thrown 4 times.
@@ -25,8 +32,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* This causes the transition to be discharged from the hospital 3 times (retries 3 times) and is then kept in
* the hospital for observation.
*/
@Test(timeout=300_000)
fun `error during transition with SendInitial action is retried 3 times and kept for observation if error persists`() {
@Test(timeout = 300_000)
fun `error during transition with SendInitial action is retried 3 times and kept for observation if error persists`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -79,7 +86,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}
@@ -105,8 +115,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* This causes the transition to be discharged from the hospital 3 times (retries 3 times). On the final retry the transition
* succeeds and the flow finishes.
*/
@Test(timeout=300_000)
fun `error during transition with SendInitial action that does not persist will retry and complete successfully`() {
@Test(timeout = 300_000)
fun `error during transition with SendInitial action that does not persist will retry and complete successfully`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -158,7 +168,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -185,8 +198,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* The exceptions should be swallowed. Therefore there should be no trips to the hospital and no retries.
* The flow should complete successfully as the error is swallowed.
*/
@Test(timeout=300_000)
fun `error during transition with AcknowledgeMessages action is swallowed and flow completes successfully`() {
@Test(timeout = 300_000)
fun `error during transition with AcknowledgeMessages action is swallowed and flow completes successfully`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -238,7 +251,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -270,8 +286,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* if an error transition moves into another error transition. The flow still recovers from this state. 5 exceptions were thrown to
* verify that 3 retries are attempted before recovering.
*/
@Test(timeout=300_000)
fun `error during transition with CommitTransaction action that occurs during the beginning of execution will retry and complete successfully`() {
@Test(timeout = 300_000)
fun `error during transition with CommitTransaction action that occurs during the beginning of execution will retry and complete successfully`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -323,7 +339,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -356,8 +375,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
*
* CORDA-3352 - it is currently hanging after putting the flow in for observation
*/
@Test(timeout=300_000)
@Ignore
@Test(timeout = 300_000)
@Ignore
fun `error during transition with CommitTransaction action that occurs during the beginning of execution will retry and be kept for observation if error persists`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
@@ -411,7 +430,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}
@@ -443,8 +465,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* if an error transition moves into another error transition. The flow still recovers from this state. 5 exceptions were thrown to
* verify that 3 retries are attempted before recovering.
*/
@Test(timeout=300_000)
fun `error during transition with CommitTransaction action that occurs after the first suspend will retry and complete successfully`() {
@Test(timeout = 300_000)
fun `error during transition with CommitTransaction action that occurs after the first suspend will retry and complete successfully`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -513,7 +535,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -540,8 +565,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
*
* Each time the flow retries, it begins from the previous checkpoint where it suspended before failing.
*/
@Test(timeout=300_000)
fun `error during transition with CommitTransaction action that occurs when completing a flow and deleting its checkpoint will retry and complete successfully`() {
@Test(timeout = 300_000)
fun `error during transition with CommitTransaction action that occurs when completing a flow and deleting its checkpoint will retry and complete successfully`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -602,7 +627,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -629,8 +657,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* The flow is discharged and replayed from the hospital once. After failing during the replay, the flow is forced into overnight
* observation. It is not run again after this point.
*/
@Test(timeout=300_000)
fun `error during retry of a flow will force the flow into overnight observation`() {
@Test(timeout = 300_000)
fun `error during retry of a flow will force the flow into overnight observation`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -699,7 +727,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}
@@ -729,8 +760,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* flow will still finish successfully. This is due to the event being scheduled as part of the retry and the failure in the database
* commit occurs after this point. As the flow is already scheduled, the failure has no effect on it.
*/
@Test(timeout=300_000)
fun `error during commit transaction action when retrying a flow will retry the flow again and complete successfully`() {
@Test(timeout = 300_000)
fun `error during commit transaction action when retrying a flow will retry the flow again and complete successfully`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -798,7 +829,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -828,8 +862,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* CORDA-3352 - it is currently hanging after putting the flow in for observation
*
*/
@Test(timeout=300_000)
@Ignore
@Test(timeout = 300_000)
@Ignore
fun `error during retrying a flow that failed when committing its original checkpoint will force the flow into overnight observation`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
@@ -883,7 +917,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}
@@ -910,8 +947,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
*
* Each time the flow retries, it begins from the previous checkpoint where it suspended before failing.
*/
@Test(timeout=300_000)
fun `error during transition with CommitTransaction action and ConstraintViolationException that occurs when completing a flow will retry and be kept for observation if error persists`() {
@Test(timeout = 300_000)
fun `error during transition with CommitTransaction action and ConstraintViolationException that occurs when completing a flow will retry and be kept for observation if error persists`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)
@@ -975,7 +1012,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}
@@ -994,6 +1034,196 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
}
}

/**
* Throws an exception when performing an [Action.CommitTransaction] event before the flow has suspended (remains in an unstarted
* state).
*
* The exception is thrown 5 times.
*
* An exception is also thrown from [CheckpointStorage.getCheckpoint].
*
* This test is to prevent a regression, where a transient database connection error can be thrown retrieving a flow's checkpoint when
* retrying the flow after it failed to commit its original checkpoint.
*
* This causes the transition to be discharged from the hospital 3 times (retries 3 times). On the final retry the transition
* succeeds and the flow finishes.
*/
@Test(timeout = 300_000)
fun `flow can be retried when there is a transient connection error to the database`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)

val rules = """
RULE Create Counter
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF createCounter("counter", $counter)
DO traceln("Counter created")
ENDRULE

RULE Throw exception on executeCommitTransaction action
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF readCounter("counter") < 5
DO incrementCounter("counter"); traceln("Throwing exception"); throw new java.lang.RuntimeException("die dammit die")
ENDRULE

RULE Throw exception on getCheckpoint
INTERFACE ${CheckpointStorage::class.java.name}
METHOD getCheckpoint
AT ENTRY
IF true
DO traceln("Throwing exception getting checkpoint"); throw new java.sql.SQLTransientConnectionException("Connection is not available")
ENDRULE

RULE Entering internal error staff member
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT ENTRY
IF true
DO traceln("Reached internal transition error staff member")
ENDRULE

RULE Increment discharge counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ DISCHARGE
IF true
DO traceln("Byteman test - discharging")
ENDRULE

RULE Increment observation counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ OVERNIGHT_OBSERVATION
IF true
DO traceln("Byteman test - overnight observation")
ENDRULE
""".trimIndent()

submitBytemanRules(rules)

val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)

val output = getBytemanOutput(alice)

// Check the stdout for the lines generated by byteman
assertEquals(3, output.filter { it.contains("Byteman test - discharging") }.size)
assertEquals(0, output.filter { it.contains("Byteman test - overnight observation") }.size)
val (discharge, observation) = aliceClient.startFlow(StatemachineErrorHandlingTest::GetHospitalCountersFlow).returnValue.get()
assertEquals(3, discharge)
assertEquals(0, observation)
assertEquals(0, aliceClient.stateMachinesSnapshot().size)
assertEquals(1, aliceClient.startFlow(StatemachineErrorHandlingTest::GetNumberOfCheckpointsFlow).returnValue.get())
}
}
/**
* Throws an exception when performing an [Action.CommitTransaction] event before the flow has suspended (remains in an unstarted
* state).
*
* The exception is thrown 7 times.
*
* An exception is also thrown from [CheckpointStorage.getCheckpoint].
*
* This test is to prevent a regression, where a transient database connection error can be thrown retrieving a flow's checkpoint when
* retrying the flow after it failed to commit its original checkpoint.
*
* This causes the transition to be discharged from the hospital 3 times (retries 3 times). On the final retry the transition
* fails and the flow is kept in for observation.
*/
@Test(timeout = 300_000)
fun `flow can be retried when there is a transient connection error to the database goes to observation if error persists`() {
startDriver {
val charlie = createNode(CHARLIE_NAME)
val alice = createBytemanNode(ALICE_NAME)

val rules = """
RULE Create Counter
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF createCounter("counter", $counter)
DO traceln("Counter created")
ENDRULE

RULE Throw exception on executeCommitTransaction action
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF readCounter("counter") < 7
DO incrementCounter("counter"); traceln("Throwing exception"); throw new java.lang.RuntimeException("die dammit die")
ENDRULE

RULE Throw exception on getCheckpoint
INTERFACE ${CheckpointStorage::class.java.name}
METHOD getCheckpoint
AT ENTRY
IF true
DO traceln("Throwing exception getting checkpoint"); throw new java.sql.SQLTransientConnectionException("Connection is not available")
ENDRULE

RULE Entering internal error staff member
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT ENTRY
IF true
DO traceln("Reached internal transition error staff member")
ENDRULE

RULE Increment discharge counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ DISCHARGE
IF true
DO traceln("Byteman test - discharging")
ENDRULE

RULE Increment observation counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ OVERNIGHT_OBSERVATION
IF true
DO traceln("Byteman test - overnight observation")
ENDRULE
""".trimIndent()

submitBytemanRules(rules)

val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

executor.execute {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity())
}

// flow is not signaled as started, calls to [getOrThrow] will hang, sleeping instead
Thread.sleep(30.seconds.toMillis())

val output = getBytemanOutput(alice)

// Check the stdout for the lines generated by byteman
assertEquals(3, output.filter { it.contains("Byteman test - discharging") }.size)
assertEquals(1, output.filter { it.contains("Byteman test - overnight observation") }.size)
val (discharge, observation) = aliceClient.startFlow(StatemachineErrorHandlingTest::GetHospitalCountersFlow).returnValue.get()
assertEquals(3, discharge)
assertEquals(1, observation)
assertEquals(1, aliceClient.stateMachinesSnapshot().size)
assertEquals(1, aliceClient.startFlow(StatemachineErrorHandlingTest::GetNumberOfCheckpointsFlow).returnValue.get())
}
}

/**
* Throws an exception when performing an [Action.CommitTransaction] event on a responding flow. The failure prevents the node from saving
* its original checkpoint.
@@ -1009,8 +1239,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* if an error transition moves into another error transition. The flow still recovers from this state. 5 exceptions were thrown to verify
* that 3 retries are attempted before recovering.
*/
@Test(timeout=300_000)
fun `responding flow - error during transition with CommitTransaction action that occurs during the beginning of execution will retry and complete successfully`() {
@Test(timeout = 300_000)
fun `responding flow - error during transition with CommitTransaction action that occurs during the beginning of execution will retry and complete successfully`() {
startDriver {
val charlie = createBytemanNode(CHARLIE_NAME)
val alice = createNode(ALICE_NAME)
@@ -1064,7 +1294,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val charlieClient =
CordaRPCClient(charlie.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -1104,8 +1337,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* able to recover when the node is restarted (by using the events). The initiating flow maintains the checkpoint as it is waiting for
* the responding flow to recover and finish its flow.
*/
@Test(timeout=300_000)
fun `responding flow - error during transition with CommitTransaction action that occurs during the beginning of execution will retry and be kept for observation if error persists`() {
@Test(timeout = 300_000)
fun `responding flow - error during transition with CommitTransaction action that occurs during the beginning of execution will retry and be kept for observation if error persists`() {
startDriver {
val charlie = createBytemanNode(CHARLIE_NAME)
val alice = createNode(ALICE_NAME)
@@ -1160,7 +1393,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
CordaRPCClient(charlie.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}
@@ -1192,8 +1428,8 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
* This causes the transition to be discharged from the hospital 3 times (retries 3 times). On the final retry the transition
* succeeds and the flow finishes.
*/
@Test(timeout=300_000)
fun `responding flow - error during transition with CommitTransaction action that occurs when completing a flow and deleting its checkpoint will retry and complete successfully`() {
@Test(timeout = 300_000)
fun `responding flow - error during transition with CommitTransaction action that occurs when completing a flow and deleting its checkpoint will retry and complete successfully`() {
startDriver {
val charlie = createBytemanNode(CHARLIE_NAME)
val alice = createNode(ALICE_NAME)
@@ -1258,7 +1494,10 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
val charlieClient =
CordaRPCClient(charlie.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(StatemachineErrorHandlingTest::SendAMessageFlow, charlie.nodeInfo.singleIdentity()).returnValue.getOrThrow(
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
@@ -1278,4 +1517,202 @@ class StatemachineGeneralErrorHandlingTest : StatemachineErrorHandlingTest() {
assertEquals(1, charlieClient.startFlow(StatemachineErrorHandlingTest::GetNumberOfCheckpointsFlow).returnValue.get())
}
}

/**
* Throws an exception when performing an [Action.CommitTransaction] event before the flow has suspended (remains in an unstarted
* state) on a responding node.
*
* The exception is thrown 5 times.
*
* An exception is also thrown from [CheckpointStorage.getCheckpoint].
*
* This test is to prevent a regression, where a transient database connection error can be thrown retrieving a flow's checkpoint when
* retrying the flow after it failed to commit its original checkpoint.
*
* This causes the transition to be discharged from the hospital 3 times (retries 3 times). On the final retry the transition
* succeeds and the flow finishes.
*/
@Test(timeout = 300_000)
fun `responding flow - session init can be retried when there is a transient connection error to the database`() {
startDriver {
val charlie = createBytemanNode(CHARLIE_NAME)
val alice = createNode(ALICE_NAME)

val rules = """
RULE Create Counter
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF createCounter("counter", $counter)
DO traceln("Counter created")
ENDRULE

RULE Throw exception on executeCommitTransaction action
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF readCounter("counter") < 5
DO incrementCounter("counter"); traceln("Throwing exception"); throw new java.lang.RuntimeException("die dammit die")
ENDRULE

RULE Throw exception on getCheckpoint
INTERFACE ${CheckpointStorage::class.java.name}
METHOD getCheckpoint
AT ENTRY
IF true
DO traceln("Throwing exception getting checkpoint"); throw new java.sql.SQLTransientConnectionException("Connection is not available")
ENDRULE

RULE Entering internal error staff member
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT ENTRY
IF true
DO traceln("Reached internal transition error staff member")
ENDRULE

RULE Increment discharge counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ DISCHARGE
IF true
DO traceln("Byteman test - discharging")
ENDRULE

RULE Increment observation counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ OVERNIGHT_OBSERVATION
IF true
DO traceln("Byteman test - overnight observation")
ENDRULE
""".trimIndent()

submitBytemanRules(rules)

val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy
val charlieClient =
CordaRPCClient(charlie.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)

val output = getBytemanOutput(charlie)

// Check the stdout for the lines generated by byteman
assertEquals(3, output.filter { it.contains("Byteman test - discharging") }.size)
assertEquals(0, output.filter { it.contains("Byteman test - overnight observation") }.size)
val (discharge, observation) = charlieClient.startFlow(StatemachineErrorHandlingTest::GetHospitalCountersFlow).returnValue.get()
assertEquals(3, discharge)
assertEquals(0, observation)
assertEquals(0, aliceClient.stateMachinesSnapshot().size)
assertEquals(0, charlieClient.stateMachinesSnapshot().size)
assertEquals(1, charlieClient.startFlow(StatemachineErrorHandlingTest::GetNumberOfCheckpointsFlow).returnValue.get())
}
}
/**
* Throws an exception when performing an [Action.CommitTransaction] event before the flow has suspended (remains in an unstarted
* state) on a responding node.
*
* The exception is thrown 7 times.
*
* An exception is also thrown from [CheckpointStorage.getCheckpoint].
*
* This test is to prevent a regression, where a transient database connection error can be thrown retrieving a flow's checkpoint when
* retrying the flow after it failed to commit its original checkpoint.
*
* This causes the transition to be discharged from the hospital 3 times (retries 3 times). On the final retry the transition
* fails and the flow is kept in for observation.
*/
@Test(timeout = 300_000)
fun `responding flow - session init can be retried when there is a transient connection error to the database goes to observation if error persists`() {
startDriver {
val charlie = createBytemanNode(CHARLIE_NAME)
val alice = createNode(ALICE_NAME)

val rules = """
RULE Create Counter
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF createCounter("counter", $counter)
DO traceln("Counter created")
ENDRULE

RULE Throw exception on executeCommitTransaction action
CLASS ${ActionExecutorImpl::class.java.name}
METHOD executeCommitTransaction
AT ENTRY
IF readCounter("counter") < 7
DO incrementCounter("counter"); traceln("Throwing exception"); throw new java.lang.RuntimeException("die dammit die")
ENDRULE

RULE Throw exception on getCheckpoint
INTERFACE ${CheckpointStorage::class.java.name}
METHOD getCheckpoint
AT ENTRY
IF true
DO traceln("Throwing exception getting checkpoint"); throw new java.sql.SQLTransientConnectionException("Connection is not available")
ENDRULE

RULE Entering internal error staff member
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT ENTRY
IF true
DO traceln("Reached internal transition error staff member")
ENDRULE

RULE Increment discharge counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ DISCHARGE
IF true
DO traceln("Byteman test - discharging")
ENDRULE

RULE Increment observation counter
CLASS ${StaffedFlowHospital.TransitionErrorGeneralPractitioner::class.java.name}
METHOD consult
AT READ OVERNIGHT_OBSERVATION
IF true
DO traceln("Byteman test - overnight observation")
ENDRULE
""".trimIndent()

submitBytemanRules(rules)

val aliceClient =
CordaRPCClient(alice.rpcAddress).start(rpcUser.username, rpcUser.password).proxy
val charlieClient =
CordaRPCClient(charlie.rpcAddress).start(rpcUser.username, rpcUser.password).proxy

assertFailsWith<TimeoutException> {
aliceClient.startFlow(
StatemachineErrorHandlingTest::SendAMessageFlow,
charlie.nodeInfo.singleIdentity()
).returnValue.getOrThrow(
30.seconds
)
}

val output = getBytemanOutput(charlie)

// Check the stdout for the lines generated by byteman
assertEquals(3, output.filter { it.contains("Byteman test - discharging") }.size)
assertEquals(1, output.filter { it.contains("Byteman test - overnight observation") }.size)
val (discharge, observation) = charlieClient.startFlow(StatemachineErrorHandlingTest::GetHospitalCountersFlow).returnValue.get()
assertEquals(3, discharge)
assertEquals(1, observation)
assertEquals(1, aliceClient.stateMachinesSnapshot().size)
assertEquals(1, charlieClient.stateMachinesSnapshot().size)
assertEquals(1, charlieClient.startFlow(StatemachineErrorHandlingTest::GetNumberOfCheckpointsFlow).returnValue.get())
}
}
}
@@ -66,6 +66,7 @@ class DBCheckpointStorage : CheckpointStorage {
return session.createQuery(delete).executeUpdate() > 0
}

@Throws(SQLException::class)
override fun getCheckpoint(id: StateMachineRunId): SerializedBytes<Checkpoint>? {
val bytes = currentDBSession().get(DBCheckpoint::class.java, id.uuid.toString())?.checkpoint ?: return null
return SerializedBytes(bytes)
@@ -607,9 +607,8 @@ class SingleThreadedStateMachineManager(

val flowCorDappVersion = createSubFlowVersion(serviceHub.cordappProvider.getCordappForFlow(flowLogic), serviceHub.myInfo.platformVersion)

val flowAlreadyExists = mutex.locked { flows[flowId] != null }

val existingCheckpoint = if (flowAlreadyExists) {
val existingFlow = mutex.locked { flows[flowId] }
val existingCheckpoint = if (existingFlow != null && existingFlow.fiber.transientState?.value?.isAnyCheckpointPersisted == true) {
// Load the flow's checkpoint
// The checkpoint will be missing if the flow failed before persisting the original checkpoint
// CORDA-3359 - Do not start/retry a flow that failed after deleting its checkpoint (the whole of the flow might replay)
@@ -617,8 +616,10 @@ class SingleThreadedStateMachineManager(
val checkpoint = tryCheckpointDeserialize(serializedCheckpoint, flowId)
if (checkpoint == null) {
return openFuture<FlowStateMachine<A>>().mapError {
IllegalStateException("Unable to deserialize database checkpoint for flow $flowId. " +
"Something is very wrong. The flow will not retry.")
IllegalStateException(
"Unable to deserialize database checkpoint for flow $flowId. " +
"Something is very wrong. The flow will not retry."
)
}
} else {
checkpoint
@@ -628,6 +629,7 @@ class SingleThreadedStateMachineManager(
// This is a brand new flow
null
}

val checkpoint = existingCheckpoint ?: Checkpoint.create(
invocationContext,
flowStart,
@@ -146,6 +146,8 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging,
val payload = RejectSessionMessage(message, secureRandom.nextLong())
val replyError = ExistingSessionMessage(sessionMessage.initiatorSessionId, payload)

log.info("Sending session initiation error back to $sender", error)

flowMessaging.sendSessionMessage(sender, replyError, SenderDeduplicationId(DeduplicationId.createRandom(secureRandom), ourSenderUUID))
event.deduplicationHandler.afterDatabaseTransaction()
}