CORDA-1546: Updated the flow hospital to suspend FinalityHandler if it errors (#3304)

The handler will automatically re-run from its last checkpoint on node restart, giving the node operator the opportunity to resolve the underlying issue, which is required when dealing with contract constraint failures.

parent 9514ad6be1
commit f6a23a0216
@@ -6,6 +6,7 @@ import net.corda.core.internal.ResolveTransactionsFlow
import net.corda.core.internal.pushToLoggingContext
import net.corda.core.node.StatesToRecord
import net.corda.core.transactions.SignedTransaction
import net.corda.core.utilities.trace
import net.corda.core.utilities.unwrap
import java.security.SignatureException

@@ -33,9 +34,9 @@ class ReceiveTransactionFlow @JvmOverloads constructor(private val otherSideSess
TransactionVerificationException::class)
override fun call(): SignedTransaction {
if (checkSufficientSignatures) {
logger.trace("Receiving a transaction from ${otherSideSession.counterparty}")
logger.trace { "Receiving a transaction from ${otherSideSession.counterparty}" }
} else {
logger.trace("Receiving a transaction (but without checking the signatures) from ${otherSideSession.counterparty}")
logger.trace { "Receiving a transaction (but without checking the signatures) from ${otherSideSession.counterparty}" }
}
val stx = otherSideSession.receive<SignedTransaction>().unwrap {
it.pushToLoggingContext()

@@ -378,7 +378,8 @@ fun TransactionBuilder.toWireTransaction(services: ServicesForResolution, serial
fun TransactionBuilder.toLedgerTransaction(services: ServicesForResolution, serializationContext: SerializationContext) = toLedgerTransactionWithContext(services, serializationContext)

/** Convenience method to get the package name of a class literal. */
val KClass<*>.packageName: String get() = java.`package`.name
val KClass<*>.packageName: String get() = java.packageName
val Class<*>.packageName: String get() = `package`.name

inline val Class<*>.isAbstractClass: Boolean get() = Modifier.isAbstract(modifiers)

@@ -161,12 +161,11 @@ class AttachmentSerializationTest {
}

private fun rebootClientAndGetAttachmentContent(checkAttachmentsOnLoad: Boolean = true): String {
client.dispose()
client = mockNet.createNode(InternalMockNodeParameters(client.internals.id, client.internals.configuration.myLegalName), nodeFactory = { args ->
client = mockNet.restartNode(client) { args ->
object : InternalMockNetwork.MockNode(args) {
override fun start() = super.start().apply { attachments.checkAttachmentsOnLoad = checkAttachmentsOnLoad }
}
})
}
return (client.smm.allStateMachines[0].stateMachine.resultFuture.apply { mockNet.runNetwork() }.getOrThrow() as ClientResult).attachmentContent
}
@@ -132,6 +132,22 @@ you require the hash of the node's installed app which supplies the specified co
hash constraints, you almost always want "whatever the current code is" and not a hard-coded hash. So this automatic
constraint placeholder is useful.

FinalityFlow
------------

It's possible to encounter contract constraint issues when notarising transactions with ``FinalityFlow`` on a network
containing multiple versions of the same CorDapp. This will happen when using hash constraints, or with zone constraints
if the zone whitelist has missing CorDapp versions. If a participating party fails to validate the **notarised** transaction
then we have a scenario where the members of the network do not have a consistent view of the ledger.

Therefore, if the finality handler flow (which is run on the counterparty) errors for any reason it will always be sent to
the flow hospital. From there it's suspended, waiting to be retried on node restart. This gives the node operator the opportunity
to recover from those errors, which in the case of contract constraint violations means either updating the CorDapp or
adding its hash to the zone whitelist.

.. note:: This is a temporary issue in the current version of Corda, until we implement some missing features which will
   enable a seamless handling of differences in CorDapp versions.

CorDapps as attachments
-----------------------
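
To make the scenario above concrete, here is a minimal sketch of the initiating side. The wrapper flow name and structure are illustrative only; the single ``FinalityFlow`` call is the part described in the documentation above, and the counterparty's ``FinalityHandler`` runs automatically when it receives the notarised transaction.

import co.paralleluniverse.fibers.Suspendable
import net.corda.core.flows.FinalityFlow
import net.corda.core.flows.FlowLogic
import net.corda.core.transactions.SignedTransaction

// Hypothetical wrapper flow; only the FinalityFlow call itself comes from the docs above.
class NotariseAndDistributeFlow(private val stx: SignedTransaction) : FlowLogic<SignedTransaction>() {
    @Suspendable
    override fun call(): SignedTransaction {
        // Notarise and send the transaction to all participants. Each recipient's FinalityHandler
        // resolves and verifies it; if that fails (e.g. a contract constraint cannot be satisfied),
        // the handler is hospitalised and retried on that node's next restart.
        return subFlow(FinalityFlow(stx))
    }
}
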
@@ -581,6 +581,18 @@ We can also choose to send the transaction to additional parties who aren't one
Only one party has to call ``FinalityFlow`` for a given transaction to be recorded by all participants. It does
**not** need to be called by each participant individually.

Because the transaction has already been notarised and the input states consumed, if the participants receiving the
transaction fail to verify it, or the receiving flow (the finality handler) fails due to some other error, we then have
a scenario where not all parties have a correct, up-to-date view of the ledger. To recover from this, the finality handler
is automatically sent to the flow hospital, where it's suspended and retried from its last checkpoint on node restart.
This gives the node operator the opportunity to recover from the error. Until the issue is resolved the node will continue
to retry the flow on each startup.

.. note:: It's possible to forcibly terminate the erroring finality handler using the ``killFlow`` RPC, but at the risk
   of an inconsistent view of the ledger.

.. note:: A future release will allow retrying hospitalised flows without restarting the node, i.e. via RPC.

CollectSignaturesFlow/SignTransactionFlow
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The list of parties who need to sign a transaction is dictated by the transaction's commands. Once we've signed a
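
As a rough illustration of the ``killFlow`` escape hatch mentioned in the first note, an RPC client might do something like the following. The address, credentials and the class-name match are placeholders, and killing the handler leaves the node without the notarised transaction, as the note warns.

import net.corda.client.rpc.CordaRPCClient
import net.corda.core.utilities.NetworkHostAndPort

fun main(args: Array<String>) {
    // Hypothetical connection details; the address, username and password are placeholders.
    CordaRPCClient(NetworkHostAndPort("localhost", 10006)).start("user", "password").use { connection ->
        val rpc = connection.proxy
        // Look for a stuck FinalityHandler among the node's running state machines
        // (matching by class name is a heuristic for this sketch)...
        val stuck = rpc.stateMachinesSnapshot().firstOrNull { it.flowLogicClassName.endsWith("FinalityHandler") }
        // ...and forcibly terminate it, accepting the inconsistent-ledger risk described above.
        stuck?.let { rpc.killFlow(it.id) }
    }
}
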
@@ -7,11 +7,19 @@ release, see :doc:`upgrade-notes`.
Unreleased
==========

* Introducing the flow hospital - a component of the node that manages flows that have errored and whether they should
  be retried from their previous checkpoints or have their errors propagate. Currently it will respond to any error that
  occurs during the resolution of a received transaction as part of ``FinalityFlow``. In such a scenario the receiving
  flow will be parked and retried on node restart. This is to allow the node operator to rectify the situation, as otherwise
  the node will have an incomplete view of the ledger.

* Fixed an issue preventing out of process nodes started by the ``Driver`` from logging to file.

* Fixed an issue with ``CashException`` not being able to deserialise after the introduction of AMQP for RPC.

* Removed -xmx VM argument from Explorer's Capsule setup. This helps avoiding out of memory errors.
* Removed -Xmx VM argument from Explorer's Capsule setup. This helps avoiding out of memory errors.

* New ``killFlow`` RPC for killing stuck flows.

* Shell now kills an ongoing flow when CTRL+C is pressed in the terminal.

@@ -25,7 +33,8 @@ Unreleased

* Improved audit trail for ``FinalityFlow`` and related sub-flows.

* ``NodeStartup`` will now only print node's configuration if ``devMode`` is ``true``, avoiding the risk of printing passwords in a production setup.
* The node's configuration is only printed on startup if ``devMode`` is ``true``, avoiding the risk of printing passwords
  in a production setup.

* SLF4J's MDC will now only be printed to the console if not empty. No more log lines ending with "{}".
@@ -1,23 +0,0 @@
package net.corda.node.services.statemachine

/**
* A flow hospital is a class that is notified when a flow transitions into an error state due to an uncaught exception
* or internal error condition, and when it becomes clean again (e.g. due to a resume).
* Also see [net.corda.node.services.statemachine.interceptors.HospitalisingInterceptor].
*/
interface FlowHospital {
/**
* The flow running in [flowFiber] has errored.
*/
fun flowErrored(flowFiber: FlowFiber, currentState: StateMachineState, errors: List<Throwable>)

/**
* The flow running in [flowFiber] has cleaned, possibly as a result of a flow hospital resume.
*/
fun flowCleaned(flowFiber: FlowFiber)

/**
* The flow has been removed from the state machine.
*/
fun flowRemoved(flowFiber: FlowFiber)
}

@@ -1,25 +0,0 @@
package net.corda.node.services.statemachine

import net.corda.core.utilities.debug
import net.corda.core.utilities.loggerFor

/**
* A simple [FlowHospital] implementation that immediately triggers error propagation when a flow dirties.
*/
object PropagatingFlowHospital : FlowHospital {
private val log = loggerFor<PropagatingFlowHospital>()

override fun flowErrored(flowFiber: FlowFiber, currentState: StateMachineState, errors: List<Throwable>) {
log.debug { "Flow ${flowFiber.id} in state $currentState encountered error" }
flowFiber.scheduleEvent(Event.StartErrorPropagation)
for ((index, error) in errors.withIndex()) {
log.warn("Flow ${flowFiber.id} is propagating error [$index] ", error)
}
}

override fun flowCleaned(flowFiber: FlowFiber) {
throw IllegalStateException("Flow ${flowFiber.id} cleaned after error propagation triggered")
}

override fun flowRemoved(flowFiber: FlowFiber) {}
}
@@ -89,6 +89,8 @@ class SingleThreadedStateMachineManager(
val timedFlows = HashMap<StateMachineRunId, ScheduledTimeout>()
}

override val flowHospital: StaffedFlowHospital = StaffedFlowHospital()

private val mutex = ThreadBox(InnerState())
private val scheduler = FiberExecutorScheduler("Same thread scheduler", executor)
private val timeoutScheduler = Executors.newScheduledThreadPool(1)

@@ -103,13 +105,11 @@ class SingleThreadedStateMachineManager(
private val ourSenderUUID = serviceHub.networkService.ourSenderUUID

private var checkpointSerializationContext: SerializationContext? = null
private var tokenizableServices: List<Any>? = null
private var actionExecutor: ActionExecutor? = null

override val allStateMachines: List<FlowLogic<*>>
get() = mutex.locked { flows.values.map { it.fiber.logic } }

private val totalStartedFlows = metrics.counter("Flows.Started")
private val totalFinishedFlows = metrics.counter("Flows.Finished")

@@ -123,7 +123,6 @@ class SingleThreadedStateMachineManager(
override fun start(tokenizableServices: List<Any>) {
checkQuasarJavaAgentPresence()
this.tokenizableServices = tokenizableServices
val checkpointSerializationContext = SerializationDefaults.CHECKPOINT_CONTEXT.withTokenContext(
SerializeAsTokenContextImpl(tokenizableServices, SerializationDefaults.SERIALIZATION_FACTORY, SerializationDefaults.CHECKPOINT_CONTEXT, serviceHub)
)

@@ -753,7 +752,7 @@ class SingleThreadedStateMachineManager(
private fun makeTransitionExecutor(): TransitionExecutor {
val interceptors = ArrayList<TransitionInterceptor>()
interceptors.add { HospitalisingInterceptor(StaffedFlowHospital, it) }
interceptors.add { HospitalisingInterceptor(flowHospital, it) }
if (serviceHub.configuration.devMode) {
interceptors.add { DumpHistoryOnErrorInterceptor(it) }
}

@@ -777,7 +776,7 @@ class SingleThreadedStateMachineManager(
require(lastState.pendingDeduplicationHandlers.isEmpty())
require(lastState.isRemoved)
require(lastState.checkpoint.subFlowStack.size == 1)
sessionToFlow.none { it.value == flow.fiber.id }
require(flow.fiber.id !in sessionToFlow.values)
flow.resultFuture.set(removalReason.flowReturnValue)
lastState.flowLogic.progressTracker?.currentStep = ProgressTracker.DONE
changesPublisher.onNext(StateMachineManager.Change.Removed(lastState.flowLogic, Try.Success(removalReason.flowReturnValue)))
@@ -1,89 +1,160 @@
package net.corda.node.services.statemachine

import net.corda.core.flows.StateMachineRunId
import net.corda.core.internal.ThreadBox
import net.corda.core.internal.TimedFlow
import net.corda.core.utilities.loggerFor
import net.corda.core.internal.bufferUntilSubscribed
import net.corda.core.messaging.DataFeed
import net.corda.core.utilities.contextLogger
import net.corda.node.services.FinalityHandler
import org.hibernate.exception.ConstraintViolationException
import rx.subjects.PublishSubject
import java.sql.SQLException
import java.time.Instant
import java.util.concurrent.ConcurrentHashMap
import java.util.*

/**
* This hospital consults "staff" to see if they can automatically diagnose and treat flows.
*/
object StaffedFlowHospital : FlowHospital {
private val log = loggerFor<StaffedFlowHospital>()
class StaffedFlowHospital {
private companion object {
private val log = contextLogger()
private val staff = listOf(DeadlockNurse, DuplicateInsertSpecialist, DoctorTimeout, FinalityDoctor)
}

private val staff = listOf(DeadlockNurse, DuplicateInsertSpecialist, DoctorTimeout)

private val patients = ConcurrentHashMap<StateMachineRunId, MedicalHistory>()

val numberOfPatients get() = patients.size
private val mutex = ThreadBox(object {
val patients = HashMap<StateMachineRunId, MedicalHistory>()
val recordsPublisher = PublishSubject.create<MedicalRecord>()
})

class MedicalHistory {
val records: MutableList<Record> = mutableListOf()

sealed class Record(val suspendCount: Int) {
class Admitted(val at: Instant, suspendCount: Int) : Record(suspendCount) {
override fun toString() = "Admitted(at=$at, suspendCount=$suspendCount)"
}

class Discharged(val at: Instant, suspendCount: Int, val by: Staff, val error: Throwable) : Record(suspendCount) {
override fun toString() = "Discharged(at=$at, suspendCount=$suspendCount, by=$by)"
}
}
internal val records: MutableList<MedicalRecord> = mutableListOf()

fun notDischargedForTheSameThingMoreThan(max: Int, by: Staff): Boolean {
val lastAdmittanceSuspendCount = (records.last() as MedicalHistory.Record.Admitted).suspendCount
return records.filterIsInstance(MedicalHistory.Record.Discharged::class.java).filter { it.by == by && it.suspendCount == lastAdmittanceSuspendCount }.count() <= max
val lastAdmittanceSuspendCount = (records.last() as MedicalRecord.Admitted).suspendCount
return records
.filterIsInstance<MedicalRecord.Discharged>()
.count { by in it.by && it.suspendCount == lastAdmittanceSuspendCount } <= max
}

override fun toString(): String = "${this.javaClass.simpleName}(records = $records)"
}

override fun flowErrored(flowFiber: FlowFiber, currentState: StateMachineState, errors: List<Throwable>) {
/**
* The flow running in [flowFiber] has errored.
*/
fun flowErrored(flowFiber: FlowFiber, currentState: StateMachineState, errors: List<Throwable>) {
log.info("Flow ${flowFiber.id} admitted to hospital in state $currentState")
val suspendCount = currentState.checkpoint.numberOfSuspends

val event = mutex.locked {
val medicalHistory = patients.computeIfAbsent(flowFiber.id) { MedicalHistory() }
medicalHistory.records += MedicalHistory.Record.Admitted(Instant.now(), currentState.checkpoint.numberOfSuspends)
for ((index, error) in errors.withIndex()) {

val admitted = MedicalRecord.Admitted(flowFiber.id, Instant.now(), suspendCount)
medicalHistory.records += admitted
recordsPublisher.onNext(admitted)

val report = consultStaff(flowFiber, currentState, errors, medicalHistory)

val (newRecord, event) = when (report.diagnosis) {
Diagnosis.DISCHARGE -> {
log.info("Flow ${flowFiber.id} error discharged from hospital by ${report.by}")
Pair(MedicalRecord.Discharged(flowFiber.id, Instant.now(), suspendCount, report.by, errors), Event.RetryFlowFromSafePoint)
}
Diagnosis.OVERNIGHT_OBSERVATION -> {
log.info("Flow ${flowFiber.id} error kept for overnight observation by ${report.by}")
// We don't schedule a next event for the flow - it will automatically retry from its checkpoint on node restart
Pair(MedicalRecord.KeptInForObservation(flowFiber.id, Instant.now(), suspendCount, report.by, errors), null)
}
Diagnosis.NOT_MY_SPECIALTY -> {
// None of the staff care for these errors so we let them propagate
log.info("Flow ${flowFiber.id} error allowed to propagate")
Pair(MedicalRecord.NothingWeCanDo(flowFiber.id, Instant.now(), suspendCount), Event.StartErrorPropagation)
}
}

medicalHistory.records += newRecord
recordsPublisher.onNext(newRecord)
event
}

if (event != null) {
flowFiber.scheduleEvent(event)
}
}

private fun consultStaff(flowFiber: FlowFiber,
currentState: StateMachineState,
errors: List<Throwable>,
medicalHistory: MedicalHistory): ConsultationReport {
return errors
.mapIndexed { index, error ->
log.info("Flow ${flowFiber.id} has error [$index]", error)
if (!errorIsDischarged(flowFiber, currentState, error, medicalHistory)) {
// If any error isn't discharged, then we propagate.
log.warn("Flow ${flowFiber.id} error was not discharged, propagating.")
flowFiber.scheduleEvent(Event.StartErrorPropagation)
return
val diagnoses: Map<Diagnosis, List<Staff>> = staff.groupBy { it.consult(flowFiber, currentState, error, medicalHistory) }
// We're only interested in the highest priority diagnosis for the error
val (diagnosis, by) = diagnoses.entries.minBy { it.key }!!
ConsultationReport(error, diagnosis, by)
}
}
// If all are discharged, retry.
flowFiber.scheduleEvent(Event.RetryFlowFromSafePoint)
// And we're only interested in the error with the highest priority diagnosis
.minBy { it.diagnosis }!!
}

private fun errorIsDischarged(flowFiber: FlowFiber, currentState: StateMachineState, error: Throwable, medicalHistory: MedicalHistory): Boolean {
for (staffMember in staff) {
val diagnosis = staffMember.consult(flowFiber, currentState, error, medicalHistory)
if (diagnosis == Diagnosis.DISCHARGE) {
medicalHistory.records += MedicalHistory.Record.Discharged(Instant.now(), currentState.checkpoint.numberOfSuspends, staffMember, error)
log.info("Flow ${flowFiber.id} error discharged from hospital by $staffMember")
return true
}
}
return false
}
private data class ConsultationReport(val error: Throwable, val diagnosis: Diagnosis, val by: List<Staff>)

/**
* The flow running in [flowFiber] has cleaned, possibly as a result of a flow hospital resume.
*/
// It's okay for flows to be cleaned... we fix them now!
override fun flowCleaned(flowFiber: FlowFiber) {}
fun flowCleaned(flowFiber: FlowFiber) = Unit

override fun flowRemoved(flowFiber: FlowFiber) {
patients.remove(flowFiber.id)
/**
* The flow has been removed from the state machine.
*/
fun flowRemoved(flowFiber: FlowFiber) {
mutex.locked { patients.remove(flowFiber.id) }
}

// TODO MedicalRecord subtypes can expose the Staff class, something which we probably don't want when wiring this method to RPC
/** Returns a stream of medical records as flows pass through the hospital. */
fun track(): DataFeed<List<MedicalRecord>, MedicalRecord> {
return mutex.locked {
DataFeed(patients.values.flatMap { it.records }, recordsPublisher.bufferUntilSubscribed())
}
}

sealed class MedicalRecord {
abstract val flowId: StateMachineRunId
abstract val at: Instant
abstract val suspendCount: Int

data class Admitted(override val flowId: StateMachineRunId,
override val at: Instant,
override val suspendCount: Int) : MedicalRecord()

data class Discharged(override val flowId: StateMachineRunId,
override val at: Instant,
override val suspendCount: Int,
val by: List<Staff>,
val errors: List<Throwable>) : MedicalRecord()

data class KeptInForObservation(override val flowId: StateMachineRunId,
override val at: Instant,
override val suspendCount: Int,
val by: List<Staff>,
val errors: List<Throwable>) : MedicalRecord()

data class NothingWeCanDo(override val flowId: StateMachineRunId,
override val at: Instant,
override val suspendCount: Int) : MedicalRecord()
}

/** The order of the enum values is their priority order. */
enum class Diagnosis {
/**
* Retry from last safe point.
*/
/** Retry from last safe point. */
DISCHARGE,
/**
* Please try another member of staff.
*/
/** Park and await intervention. */
OVERNIGHT_OBSERVATION,
/** Please try another member of staff. */
NOT_MY_SPECIALTY
}

@@ -122,7 +193,7 @@ object StaffedFlowHospital : FlowHospital {
}

private fun mentionsConstraintViolation(exception: Throwable?): Boolean {
return exception != null && (exception is org.hibernate.exception.ConstraintViolationException || mentionsConstraintViolation(exception.cause))
return exception != null && (exception is ConstraintViolationException || mentionsConstraintViolation(exception.cause))
}
}

@@ -152,4 +223,13 @@ object StaffedFlowHospital : FlowHospital {
}
}
}

/**
* Parks [FinalityHandler]s for observation.
*/
object FinalityDoctor : Staff {
override fun consult(flowFiber: FlowFiber, currentState: StateMachineState, newError: Throwable, history: MedicalHistory): Diagnosis {
return if (currentState.flowLogic is FinalityHandler) Diagnosis.OVERNIGHT_OBSERVATION else Diagnosis.NOT_MY_SPECIALTY
}
}
}
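
As a standalone illustration of the selection logic in consultStaff above, where the Diagnosis declaration order doubles as its priority order, a stripped-down sketch (with stand-in Staff and Diagnosis types rather than the real node classes) might look like this:

// Stand-in types for illustration only; not the node's real Staff/Diagnosis definitions.
enum class Diagnosis { DISCHARGE, OVERNIGHT_OBSERVATION, NOT_MY_SPECIALTY }

interface Staff { fun consult(error: Throwable): Diagnosis }

fun diagnose(staff: List<Staff>, error: Throwable): Pair<Diagnosis, List<Staff>> {
    // Group the staff by the diagnosis they give for this error.
    val diagnoses: Map<Diagnosis, List<Staff>> = staff.groupBy { it.consult(error) }
    // Enum constants are declared in priority order, so the lowest ordinal wins,
    // mirroring `diagnoses.entries.minBy { it.key }!!` in consultStaff above.
    val best = diagnoses.entries.minBy { it.key }!!
    return best.key to best.value
}
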
@@ -82,6 +82,8 @@ interface StateMachineManager {
* The event may be replayed if a flow fails and attempts to retry.
*/
fun deliverExternalEvent(event: ExternalEvent)

val flowHospital: StaffedFlowHospital
}

// These must be idempotent! A later failure in the state transition may error the flow state, and a replay may call

@@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap
* transition.
*/
class HospitalisingInterceptor(
private val flowHospital: FlowHospital,
private val flowHospital: StaffedFlowHospital,
private val delegate: TransitionExecutor
) : TransitionExecutor {
private val hospitalisedFlows = ConcurrentHashMap<StateMachineRunId, FlowFiber>()
@@ -0,0 +1,95 @@
package net.corda.node.services

import net.corda.core.crypto.SecureHash
import net.corda.core.flows.FinalityFlow
import net.corda.core.flows.StateMachineRunId
import net.corda.core.toFuture
import net.corda.core.transactions.SignedTransaction
import net.corda.core.transactions.TransactionBuilder
import net.corda.core.utilities.getOrThrow
import net.corda.finance.POUNDS
import net.corda.finance.contracts.asset.Cash
import net.corda.finance.issuedBy
import net.corda.node.internal.StartedNode
import net.corda.node.services.statemachine.StaffedFlowHospital
import net.corda.node.services.statemachine.StaffedFlowHospital.MedicalRecord.KeptInForObservation
import net.corda.testing.core.ALICE_NAME
import net.corda.testing.core.BOB_NAME
import net.corda.testing.core.singleIdentity
import net.corda.testing.node.internal.InternalMockNetwork
import net.corda.testing.node.internal.InternalMockNodeParameters
import net.corda.testing.node.internal.startFlow
import org.assertj.core.api.Assertions.assertThat
import org.junit.After
import org.junit.Test

class FinalityHandlerTest {
private lateinit var mockNet: InternalMockNetwork

@After
fun cleanUp() {
mockNet.stopNodes()
}

@Test
fun `sent to flow hospital on error and attempted retry on node restart`() {
// Set up a network where only Alice has the finance CorDapp and she sends a cash tx to Bob, who doesn't have the
// CorDapp. Bob's FinalityHandler will error when validating the tx.
mockNet = InternalMockNetwork(cordappPackages = emptyList())

val alice = mockNet.createNode(InternalMockNodeParameters(
legalName = ALICE_NAME,
extraCordappPackages = listOf("net.corda.finance.contracts.asset")
))

var bob = mockNet.createNode(InternalMockNodeParameters(legalName = BOB_NAME))

val stx = TransactionBuilder(mockNet.defaultNotaryIdentity).let {
Cash().generateIssue(
it,
1000.POUNDS.issuedBy(alice.info.singleIdentity().ref(0)),
bob.info.singleIdentity(),
mockNet.defaultNotaryIdentity
)
alice.services.signInitialTransaction(it)
}

val finalityHandlerIdFuture = bob.smm.track()
.updates
.filter { it.logic is FinalityHandler }
.map { it.logic.runId }
.toFuture()

val finalisedTx = alice.services.startFlow(FinalityFlow(stx)).run {
mockNet.runNetwork()
resultFuture.getOrThrow()
}
val finalityHandlerId = finalityHandlerIdFuture.getOrThrow()

bob.assertFlowSentForObservation(finalityHandlerId)
assertThat(bob.getTransaction(finalisedTx.id)).isNull()

bob = mockNet.restartNode(bob)
// Since we've not done anything to fix the original error, we expect the finality handler to be sent to the hospital
// again on restart
bob.assertFlowSentForObservation(finalityHandlerId)
assertThat(bob.getTransaction(finalisedTx.id)).isNull()
}

private fun StartedNode<*>.assertFlowSentForObservation(runId: StateMachineRunId) {
val keptInForObservation = smm.flowHospital
.track()
.let { it.updates.startWith(it.snapshot) }
.filter { it.flowId == runId }
.ofType(KeptInForObservation::class.java)
.toBlocking()
.first()
assertThat(keptInForObservation.by).contains(StaffedFlowHospital.FinalityDoctor)
}

private fun StartedNode<*>.getTransaction(id: SecureHash): SignedTransaction? {
return database.transaction {
services.validatedTransactions.getTransaction(id)
}
}
}
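
The test above consumes the hospital's record stream via track(). A minimal sketch of the same idea outside a test, subscribing for KeptInForObservation records so an operator can be alerted, might look as follows. It assumes access to the node-internal StaffedFlowHospital instance (e.g. via smm.flowHospital as in the test); there is no public API for this at this point.

import net.corda.node.services.statemachine.StaffedFlowHospital
import net.corda.node.services.statemachine.StaffedFlowHospital.MedicalRecord.KeptInForObservation
import org.slf4j.LoggerFactory

// Sketch only: assumes internal access to the node's StaffedFlowHospital instance.
fun watchHospital(hospital: StaffedFlowHospital) {
    val log = LoggerFactory.getLogger("FlowHospitalWatcher")
    hospital.track().updates
            .ofType(KeptInForObservation::class.java)
            .subscribe { record ->
                // Flows kept in for observation are only retried on node restart, so surface them to the operator.
                log.warn("Flow ${record.flowId} kept in for observation by ${record.by}")
            }
}
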
@@ -692,9 +692,6 @@ class FlowFrameworkPersistenceTests {
fun `flow loaded from checkpoint will respond to messages from before start`() {
aliceNode.registerFlowFactory(ReceiveFlow::class) { InitiatedSendFlow("Hello", it) }
bobNode.services.startFlow(ReceiveFlow(alice).nonTerminating()) // Prepare checkpointed receive flow
// Make sure the add() has finished initial processing.
bobNode.internals.disableDBCloseOnStop()
bobNode.dispose() // kill receiver
val restoredFlow = bobNode.restartAndGetRestoredFlow<ReceiveFlow>()
assertThat(restoredFlow.receivedPayloads[0]).isEqualTo("Hello")
}

@@ -752,14 +749,11 @@ class FlowFrameworkPersistenceTests {
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//region Helpers

private inline fun <reified P : FlowLogic<*>> StartedNode<MockNode>.restartAndGetRestoredFlow() = internals.run {
disableDBCloseOnStop() // Handover DB to new node copy
stop()
val newNode = mockNet.createNode(InternalMockNodeParameters(id, configuration.myLegalName))
private inline fun <reified P : FlowLogic<*>> StartedNode<MockNode>.restartAndGetRestoredFlow(): P {
val newNode = mockNet.restartNode(this)
newNode.internals.acceptableLiveFiberCountOnStop = 1
manuallyCloseDB()
mockNet.runNetwork()
newNode.getSingleFlow<P>().first
return newNode.getSingleFlow<P>().first
}

private fun assertSessionTransfers(vararg expected: SessionTransfer) {
@@ -7,6 +7,8 @@ import net.corda.core.flows.FlowSession
import net.corda.core.flows.InitiatedBy
import net.corda.core.flows.InitiatingFlow
import net.corda.core.identity.Party
import net.corda.core.internal.concurrent.flatMap
import net.corda.core.internal.packageName
import net.corda.core.messaging.MessageRecipients
import net.corda.core.utilities.getOrThrow
import net.corda.core.utilities.unwrap

@@ -15,10 +17,13 @@ import net.corda.node.services.messaging.Message
import net.corda.node.services.persistence.DBTransactionStorage
import net.corda.nodeapi.internal.persistence.contextTransaction
import net.corda.testing.node.internal.InternalMockNetwork
import net.corda.testing.node.internal.InternalMockNetwork.MockNode
import net.corda.testing.node.internal.MessagingServiceSpy
import net.corda.testing.node.internal.newContext
import net.corda.testing.node.internal.setMessagingServiceSpy
import org.assertj.core.api.Assertions
import org.assertj.core.api.Assertions.assertThat
import org.assertj.core.api.Assertions.assertThatThrownBy
import org.hibernate.exception.ConstraintViolationException
import org.junit.After
import org.junit.Before
import org.junit.Test

@@ -30,21 +35,23 @@ import kotlin.test.assertNull

class RetryFlowMockTest {
private lateinit var mockNet: InternalMockNetwork
private lateinit var internalNodeA: StartedNode<InternalMockNetwork.MockNode>
private lateinit var internalNodeB: StartedNode<InternalMockNetwork.MockNode>
private lateinit var nodeA: StartedNode<MockNode>
private lateinit var nodeB: StartedNode<MockNode>

@Before
fun start() {
mockNet = InternalMockNetwork(threadPerNode = true, cordappPackages = listOf(this.javaClass.`package`.name))
internalNodeA = mockNet.createNode()
internalNodeB = mockNet.createNode()
mockNet = InternalMockNetwork(threadPerNode = true, cordappPackages = listOf(this.javaClass.packageName))
nodeA = mockNet.createNode()
nodeB = mockNet.createNode()
mockNet.startNodes()
RetryFlow.count = 0
SendAndRetryFlow.count = 0
RetryInsertFlow.count = 0
}

private fun <T> StartedNode<InternalMockNetwork.MockNode>.startFlow(logic: FlowLogic<T>): CordaFuture<T> = this.services.startFlow(logic, this.services.newContext()).getOrThrow().resultFuture
private fun <T> StartedNode<MockNode>.startFlow(logic: FlowLogic<T>): CordaFuture<T> {
return this.services.startFlow(logic, this.services.newContext()).flatMap { it.resultFuture }
}

@After
fun cleanUp() {

@@ -53,14 +60,14 @@ class RetryFlowMockTest {

@Test
fun `Single retry`() {
assertEquals(Unit, internalNodeA.startFlow(RetryFlow(1)).get())
assertEquals(Unit, nodeA.startFlow(RetryFlow(1)).get())
assertEquals(2, RetryFlow.count)
}

@Test
fun `Retry forever`() {
Assertions.assertThatThrownBy {
internalNodeA.startFlow(RetryFlow(Int.MAX_VALUE)).getOrThrow()
assertThatThrownBy {
nodeA.startFlow(RetryFlow(Int.MAX_VALUE)).getOrThrow()
}.isInstanceOf(LimitedRetryCausingError::class.java)
assertEquals(5, RetryFlow.count)
}

@@ -68,14 +75,14 @@ class RetryFlowMockTest {
@Test
fun `Retry does not set senderUUID`() {
val messagesSent = mutableListOf<Message>()
val partyB = internalNodeB.info.legalIdentities.first()
internalNodeA.setMessagingServiceSpy(object : MessagingServiceSpy(internalNodeA.network) {
val partyB = nodeB.info.legalIdentities.first()
nodeA.setMessagingServiceSpy(object : MessagingServiceSpy(nodeA.network) {
override fun send(message: Message, target: MessageRecipients, sequenceKey: Any) {
messagesSent.add(message)
messagingService.send(message, target)
}
})
internalNodeA.startFlow(SendAndRetryFlow(1, partyB)).get()
nodeA.startFlow(SendAndRetryFlow(1, partyB)).get()
assertNotNull(messagesSent.first().senderUUID)
assertNull(messagesSent.last().senderUUID)
assertEquals(2, SendAndRetryFlow.count)

@@ -83,26 +90,25 @@ class RetryFlowMockTest {

@Test
fun `Retry duplicate insert`() {
assertEquals(Unit, internalNodeA.startFlow(RetryInsertFlow(1)).get())
assertEquals(Unit, nodeA.startFlow(RetryInsertFlow(1)).get())
assertEquals(2, RetryInsertFlow.count)
}

@Test
fun `Patient records do not leak in hospital`() {
val patientCountBefore = StaffedFlowHospital.numberOfPatients
assertEquals(Unit, internalNodeA.startFlow(RetryFlow(1)).get())
assertEquals(Unit, nodeA.startFlow(RetryFlow(1)).get())
// Need to make sure the state machine has finished. Otherwise this test is flakey.
mockNet.waitQuiescent()
assertEquals(patientCountBefore, StaffedFlowHospital.numberOfPatients)
assertThat(nodeA.smm.flowHospital.track().snapshot).isEmpty()
assertEquals(2, RetryFlow.count)
}
}

class LimitedRetryCausingError : org.hibernate.exception.ConstraintViolationException("Test message", SQLException(), "Test constraint")
class LimitedRetryCausingError : ConstraintViolationException("Test message", SQLException(), "Test constraint")

class RetryCausingError : SQLException("deadlock")

class RetryFlow(val i: Int) : FlowLogic<Unit>() {
class RetryFlow(private val i: Int) : FlowLogic<Unit>() {
companion object {
var count = 0
}

@@ -121,7 +127,7 @@ class RetryFlow(val i: Int) : FlowLogic<Unit>() {
}

@InitiatingFlow
class SendAndRetryFlow(val i: Int, val other: Party) : FlowLogic<Unit>() {
class SendAndRetryFlow(private val i: Int, private val other: Party) : FlowLogic<Unit>() {
companion object {
var count = 0
}

@@ -137,8 +143,9 @@ class SendAndRetryFlow(val i: Int, val other: Party) : FlowLogic<Unit>() {
}
}

@Suppress("unused")
@InitiatedBy(SendAndRetryFlow::class)
class ReceiveFlow2(val other: FlowSession) : FlowLogic<Unit>() {
class ReceiveFlow2(private val other: FlowSession) : FlowLogic<Unit>() {
@Suspendable
override fun call() {
val received = other.receive<String>().unwrap { it }

@@ -146,7 +153,7 @@ class ReceiveFlow2(val other: FlowSession) : FlowLogic<Unit>() {
}
}

class RetryInsertFlow(val i: Int) : FlowLogic<Unit>() {
class RetryInsertFlow(private val i: Int) : FlowLogic<Unit>() {
companion object {
var count = 0
}
@@ -30,7 +30,6 @@ import net.corda.nodeapi.internal.persistence.HibernateConfiguration
import net.corda.testing.core.singleIdentity
import net.corda.testing.internal.rigorousMock
import net.corda.testing.node.internal.InternalMockNetwork
import net.corda.testing.node.internal.InternalMockNodeParameters
import net.corda.testing.node.internal.startFlow
import org.junit.After
import org.junit.Test

@@ -71,8 +70,7 @@ class NodePair(private val mockNet: InternalMockNetwork) {
client.services.startFlow(clientLogic)
while (!serverRunning.get()) mockNet.runNetwork(1)
if (rebootClient) {
client.dispose()
client = mockNet.createNode(InternalMockNodeParameters(client.internals.id))
client = mockNet.restartNode(client)
}
return uncheckedCast(client.smm.allStateMachines.single().stateMachine)
}

@@ -385,6 +385,17 @@ open class InternalMockNetwork(private val cordappPackages: List<String>,
return node
}

fun <N : MockNode> restartNode(node: StartedNode<N>, nodeFactory: (MockNodeArgs) -> N): StartedNode<N> {
node.internals.disableDBCloseOnStop()
node.dispose()
return createNode(
InternalMockNodeParameters(legalName = node.internals.configuration.myLegalName, forcedID = node.internals.id),
nodeFactory
)
}

fun restartNode(node: StartedNode<MockNode>): StartedNode<MockNode> = restartNode(node, defaultFactory)

fun baseDirectory(nodeId: Int): Path = filesystem.getPath("/nodes/$nodeId")

/**

@@ -439,8 +450,7 @@ open class InternalMockNetwork(private val cordappPackages: List<String>,
messagingNetwork.stop()
}

// Test method to block until all scheduled activity, active flows
// and network activity has ceased.
/** Block until all scheduled activity, active flows and network activity has ceased. */
fun waitQuiescent() {
busyLatch.await()
}