mirror of
https://github.com/corda/corda.git
synced 2025-06-06 01:11:45 +00:00
Merge pull request #6578 from corda/WillV/ENT-5395-Pause-and-Resume-Flows
ENT-5395 Pause and Resume Flows
This commit is contained in:
commit
f280ec9fd9
@ -23,6 +23,16 @@ interface CheckpointStorage {
|
|||||||
fun updateCheckpoint(id: StateMachineRunId, checkpoint: Checkpoint, serializedFlowState: SerializedBytes<FlowState>?,
|
fun updateCheckpoint(id: StateMachineRunId, checkpoint: Checkpoint, serializedFlowState: SerializedBytes<FlowState>?,
|
||||||
serializedCheckpointState: SerializedBytes<CheckpointState>)
|
serializedCheckpointState: SerializedBytes<CheckpointState>)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update an existing checkpoints status ([Checkpoint.status]).
|
||||||
|
*/
|
||||||
|
fun updateStatus(runId: StateMachineRunId, flowStatus: Checkpoint.FlowStatus)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update an existing checkpoints compatibility flag ([Checkpoint.compatible]).
|
||||||
|
*/
|
||||||
|
fun updateCompatible(runId: StateMachineRunId, compatible: Boolean)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Update all persisted checkpoints with status [Checkpoint.FlowStatus.RUNNABLE] or [Checkpoint.FlowStatus.HOSPITALIZED],
|
* Update all persisted checkpoints with status [Checkpoint.FlowStatus.RUNNABLE] or [Checkpoint.FlowStatus.HOSPITALIZED],
|
||||||
* changing the status to [Checkpoint.FlowStatus.PAUSED].
|
* changing the status to [Checkpoint.FlowStatus.PAUSED].
|
||||||
@ -65,6 +75,4 @@ interface CheckpointStorage {
|
|||||||
* This method does not fetch [Checkpoint.Serialized.serializedFlowState] to save memory.
|
* This method does not fetch [Checkpoint.Serialized.serializedFlowState] to save memory.
|
||||||
*/
|
*/
|
||||||
fun getPausedCheckpoints(): Stream<Pair<StateMachineRunId, Checkpoint.Serialized>>
|
fun getPausedCheckpoints(): Stream<Pair<StateMachineRunId, Checkpoint.Serialized>>
|
||||||
|
|
||||||
fun updateStatus(runId: StateMachineRunId, flowStatus: Checkpoint.FlowStatus)
|
|
||||||
}
|
}
|
||||||
|
@ -504,6 +504,11 @@ class DBCheckpointStorage(
|
|||||||
currentDBSession().createNativeQuery(update).executeUpdate()
|
currentDBSession().createNativeQuery(update).executeUpdate()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
override fun updateCompatible(runId: StateMachineRunId, compatible: Boolean) {
|
||||||
|
val update = "Update ${NODE_DATABASE_PREFIX}checkpoints set compatible = $compatible where flow_id = '${runId.uuid}'"
|
||||||
|
currentDBSession().createNativeQuery(update).executeUpdate()
|
||||||
|
}
|
||||||
|
|
||||||
private fun createDBFlowMetadata(flowId: String, checkpoint: Checkpoint): DBFlowMetadata {
|
private fun createDBFlowMetadata(flowId: String, checkpoint: Checkpoint): DBFlowMetadata {
|
||||||
val context = checkpoint.checkpointState.invocationContext
|
val context = checkpoint.checkpointState.invocationContext
|
||||||
val flowInfo = checkpoint.checkpointState.subFlowStack.first()
|
val flowInfo = checkpoint.checkpointState.subFlowStack.first()
|
||||||
|
@ -57,6 +57,11 @@ sealed class Action {
|
|||||||
*/
|
*/
|
||||||
data class PersistCheckpoint(val id: StateMachineRunId, val checkpoint: Checkpoint, val isCheckpointUpdate: Boolean) : Action()
|
data class PersistCheckpoint(val id: StateMachineRunId, val checkpoint: Checkpoint, val isCheckpointUpdate: Boolean) : Action()
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update only the [status] of the checkpoint with [id].
|
||||||
|
*/
|
||||||
|
data class UpdateFlowStatus(val id: StateMachineRunId, val status: Checkpoint.FlowStatus): Action()
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove the checkpoint corresponding to [id].
|
* Remove the checkpoint corresponding to [id].
|
||||||
*/
|
*/
|
||||||
@ -106,6 +111,11 @@ sealed class Action {
|
|||||||
val lastState: StateMachineState
|
val lastState: StateMachineState
|
||||||
) : Action()
|
) : Action()
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the flow corresponding to [flowId] to paused.
|
||||||
|
*/
|
||||||
|
data class MoveFlowToPaused(val currentState: StateMachineState) : Action()
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Schedule [event] to self.
|
* Schedule [event] to self.
|
||||||
*/
|
*/
|
||||||
|
@ -67,6 +67,8 @@ internal class ActionExecutorImpl(
|
|||||||
is Action.RetryFlowFromSafePoint -> executeRetryFlowFromSafePoint(action)
|
is Action.RetryFlowFromSafePoint -> executeRetryFlowFromSafePoint(action)
|
||||||
is Action.ScheduleFlowTimeout -> scheduleFlowTimeout(action)
|
is Action.ScheduleFlowTimeout -> scheduleFlowTimeout(action)
|
||||||
is Action.CancelFlowTimeout -> cancelFlowTimeout(action)
|
is Action.CancelFlowTimeout -> cancelFlowTimeout(action)
|
||||||
|
is Action.MoveFlowToPaused -> executeMoveFlowToPaused(action)
|
||||||
|
is Action.UpdateFlowStatus -> executeUpdateFlowStatus(action)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private fun executeReleaseSoftLocks(action: Action.ReleaseSoftLocks) {
|
private fun executeReleaseSoftLocks(action: Action.ReleaseSoftLocks) {
|
||||||
@ -99,6 +101,11 @@ internal class ActionExecutorImpl(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Suspendable
|
||||||
|
private fun executeUpdateFlowStatus(action: Action.UpdateFlowStatus) {
|
||||||
|
checkpointStorage.updateStatus(action.id, action.status)
|
||||||
|
}
|
||||||
|
|
||||||
@Suspendable
|
@Suspendable
|
||||||
private fun executePersistDeduplicationIds(action: Action.PersistDeduplicationFacts) {
|
private fun executePersistDeduplicationIds(action: Action.PersistDeduplicationFacts) {
|
||||||
for (handle in action.deduplicationHandlers) {
|
for (handle in action.deduplicationHandlers) {
|
||||||
@ -191,6 +198,11 @@ internal class ActionExecutorImpl(
|
|||||||
stateMachineManager.removeFlow(action.flowId, action.removalReason, action.lastState)
|
stateMachineManager.removeFlow(action.flowId, action.removalReason, action.lastState)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Suspendable
|
||||||
|
private fun executeMoveFlowToPaused(action: Action.MoveFlowToPaused) {
|
||||||
|
stateMachineManager.moveFlowToPaused(action.currentState)
|
||||||
|
}
|
||||||
|
|
||||||
@Suspendable
|
@Suspendable
|
||||||
@Throws(SQLException::class)
|
@Throws(SQLException::class)
|
||||||
private fun executeCreateTransaction() {
|
private fun executeCreateTransaction() {
|
||||||
|
@ -179,6 +179,13 @@ sealed class Event {
|
|||||||
override fun toString() = "WakeUpSleepyFlow"
|
override fun toString() = "WakeUpSleepyFlow"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pause the flow.
|
||||||
|
*/
|
||||||
|
object Pause: Event() {
|
||||||
|
override fun toString() = "Pause"
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Terminate the specified [sessions], removing them from in-memory datastructures.
|
* Terminate the specified [sessions], removing them from in-memory datastructures.
|
||||||
*
|
*
|
||||||
|
@ -19,7 +19,6 @@ import net.corda.core.utilities.contextLogger
|
|||||||
import net.corda.node.services.api.CheckpointStorage
|
import net.corda.node.services.api.CheckpointStorage
|
||||||
import net.corda.node.services.api.ServiceHubInternal
|
import net.corda.node.services.api.ServiceHubInternal
|
||||||
import net.corda.node.services.messaging.DeduplicationHandler
|
import net.corda.node.services.messaging.DeduplicationHandler
|
||||||
import net.corda.node.services.statemachine.FlowStateMachineImpl.Companion.currentStateMachine
|
|
||||||
import net.corda.node.services.statemachine.transitions.StateMachine
|
import net.corda.node.services.statemachine.transitions.StateMachine
|
||||||
import net.corda.node.utilities.isEnabledTimedFlow
|
import net.corda.node.utilities.isEnabledTimedFlow
|
||||||
import net.corda.nodeapi.internal.persistence.CordaPersistence
|
import net.corda.nodeapi.internal.persistence.CordaPersistence
|
||||||
@ -29,11 +28,16 @@ import java.util.concurrent.Semaphore
|
|||||||
|
|
||||||
class Flow<A>(val fiber: FlowStateMachineImpl<A>, val resultFuture: OpenFuture<Any?>)
|
class Flow<A>(val fiber: FlowStateMachineImpl<A>, val resultFuture: OpenFuture<Any?>)
|
||||||
|
|
||||||
class NonResidentFlow(val runId: StateMachineRunId, val checkpoint: Checkpoint) {
|
data class NonResidentFlow(
|
||||||
val externalEvents = mutableListOf<Event.DeliverSessionMessage>()
|
val runId: StateMachineRunId,
|
||||||
|
var checkpoint: Checkpoint,
|
||||||
|
val resultFuture: OpenFuture<Any?> = openFuture(),
|
||||||
|
val resumable: Boolean = true
|
||||||
|
) {
|
||||||
|
val events = mutableListOf<ExternalEvent>()
|
||||||
|
|
||||||
fun addExternalEvent(message: Event.DeliverSessionMessage) {
|
fun addExternalEvent(message: ExternalEvent) {
|
||||||
externalEvents.add(message)
|
events.add(message)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,18 +70,29 @@ class FlowCreator(
|
|||||||
}
|
}
|
||||||
else -> nonResidentFlow.checkpoint
|
else -> nonResidentFlow.checkpoint
|
||||||
}
|
}
|
||||||
return createFlowFromCheckpoint(nonResidentFlow.runId, checkpoint)
|
return createFlowFromCheckpoint(nonResidentFlow.runId, checkpoint, resultFuture = nonResidentFlow.resultFuture)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Suppress("LongParameterList")
|
||||||
fun createFlowFromCheckpoint(
|
fun createFlowFromCheckpoint(
|
||||||
runId: StateMachineRunId,
|
runId: StateMachineRunId,
|
||||||
oldCheckpoint: Checkpoint,
|
oldCheckpoint: Checkpoint,
|
||||||
reloadCheckpointAfterSuspendCount: Int? = null,
|
reloadCheckpointAfterSuspendCount: Int? = null,
|
||||||
lock: Semaphore = Semaphore(1)
|
lock: Semaphore = Semaphore(1),
|
||||||
|
resultFuture: OpenFuture<Any?> = openFuture(),
|
||||||
|
firstRestore: Boolean = true
|
||||||
): Flow<*>? {
|
): Flow<*>? {
|
||||||
val checkpoint = oldCheckpoint.copy(status = Checkpoint.FlowStatus.RUNNABLE)
|
val fiber = oldCheckpoint.getFiberFromCheckpoint(runId, firstRestore)
|
||||||
val fiber = checkpoint.getFiberFromCheckpoint(runId) ?: return null
|
var checkpoint = oldCheckpoint
|
||||||
val resultFuture = openFuture<Any?>()
|
if (fiber == null) {
|
||||||
|
updateCompatibleInDb(runId, false)
|
||||||
|
return null
|
||||||
|
} else if (!oldCheckpoint.compatible) {
|
||||||
|
updateCompatibleInDb(runId, true)
|
||||||
|
checkpoint = checkpoint.copy(compatible = true)
|
||||||
|
}
|
||||||
|
checkpoint = checkpoint.copy(status = Checkpoint.FlowStatus.RUNNABLE)
|
||||||
|
|
||||||
fiber.logic.stateMachine = fiber
|
fiber.logic.stateMachine = fiber
|
||||||
verifyFlowLogicIsSuspendable(fiber.logic)
|
verifyFlowLogicIsSuspendable(fiber.logic)
|
||||||
fiber.transientValues = createTransientValues(runId, resultFuture)
|
fiber.transientValues = createTransientValues(runId, resultFuture)
|
||||||
@ -92,6 +107,12 @@ class FlowCreator(
|
|||||||
return Flow(fiber, resultFuture)
|
return Flow(fiber, resultFuture)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private fun updateCompatibleInDb(runId: StateMachineRunId, compatible: Boolean) {
|
||||||
|
database.transaction {
|
||||||
|
checkpointStorage.updateCompatible(runId, compatible)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Suppress("LongParameterList")
|
@Suppress("LongParameterList")
|
||||||
fun <A> createFlowFromLogic(
|
fun <A> createFlowFromLogic(
|
||||||
flowId: StateMachineRunId,
|
flowId: StateMachineRunId,
|
||||||
@ -135,36 +156,45 @@ class FlowCreator(
|
|||||||
return Flow(flowStateMachineImpl, resultFuture)
|
return Flow(flowStateMachineImpl, resultFuture)
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun Checkpoint.getFiberFromCheckpoint(runId: StateMachineRunId): FlowStateMachineImpl<*>? {
|
@Suppress("TooGenericExceptionCaught")
|
||||||
return when (this.flowState) {
|
private fun Checkpoint.getFiberFromCheckpoint(runId: StateMachineRunId, firstRestore: Boolean): FlowStateMachineImpl<*>? {
|
||||||
|
try {
|
||||||
|
return when(flowState) {
|
||||||
is FlowState.Unstarted -> {
|
is FlowState.Unstarted -> {
|
||||||
val logic = tryCheckpointDeserialize(this.flowState.frozenFlowLogic, runId) ?: return null
|
val logic = deserializeFlowState(flowState.frozenFlowLogic)
|
||||||
FlowStateMachineImpl(runId, logic, scheduler)
|
FlowStateMachineImpl(runId, logic, scheduler)
|
||||||
}
|
}
|
||||||
is FlowState.Started -> tryCheckpointDeserialize(this.flowState.frozenFiber, runId) ?: return null
|
is FlowState.Started -> deserializeFlowState(flowState.frozenFiber)
|
||||||
// Places calling this function is rely on it to return null if the flow cannot be created from the checkpoint.
|
// Places calling this function is rely on it to return null if the flow cannot be created from the checkpoint.
|
||||||
else -> null
|
else -> return null
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
@Suppress("TooGenericExceptionCaught")
|
|
||||||
private inline fun <reified T : Any> tryCheckpointDeserialize(bytes: SerializedBytes<T>, flowId: StateMachineRunId): T? {
|
|
||||||
return try {
|
|
||||||
bytes.checkpointDeserialize(context = checkpointSerializationContext)
|
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
if (reloadCheckpointAfterSuspend && currentStateMachine() != null) {
|
if (reloadCheckpointAfterSuspend && FlowStateMachineImpl.currentStateMachine() != null) {
|
||||||
logger.error(
|
logger.error(
|
||||||
"Unable to deserialize checkpoint for flow $flowId. [reloadCheckpointAfterSuspend] is turned on, throwing exception",
|
"Unable to deserialize checkpoint for flow $runId. [reloadCheckpointAfterSuspend] is turned on, throwing exception",
|
||||||
e
|
e
|
||||||
)
|
)
|
||||||
throw ReloadFlowFromCheckpointException(e)
|
throw ReloadFlowFromCheckpointException(e)
|
||||||
} else {
|
} else {
|
||||||
logger.error("Unable to deserialize checkpoint for flow $flowId. Something is very wrong and this flow will be ignored.", e)
|
logSerializationError(firstRestore, runId, e)
|
||||||
null
|
return null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private inline fun <reified T : Any> deserializeFlowState(bytes: SerializedBytes<T>): T {
|
||||||
|
return bytes.checkpointDeserialize(context = checkpointSerializationContext)
|
||||||
|
}
|
||||||
|
|
||||||
|
private fun logSerializationError(firstRestore: Boolean, flowId: StateMachineRunId, exception: Exception) {
|
||||||
|
if (firstRestore) {
|
||||||
|
logger.warn("Flow with id $flowId could not be restored from its checkpoint. Normally this means that a CorDapp has been" +
|
||||||
|
" upgraded without draining the node. To run this flow restart the node after downgrading the CorDapp.", exception)
|
||||||
|
} else {
|
||||||
|
logger.error("Unable to deserialize fiber for flow $flowId. Something is very wrong and this flow will be ignored.", exception)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private fun verifyFlowLogicIsSuspendable(logic: FlowLogic<Any?>) {
|
private fun verifyFlowLogicIsSuspendable(logic: FlowLogic<Any?>) {
|
||||||
// Quasar requires (in Java 8) that at least the call method be annotated suspendable. Unfortunately, it's
|
// Quasar requires (in Java 8) that at least the call method be annotated suspendable. Unfortunately, it's
|
||||||
// easy to forget to add this when creating a new flow, so we check here to give the user a better error.
|
// easy to forget to add this when creating a new flow, so we check here to give the user a better error.
|
||||||
|
@ -3,6 +3,7 @@ package net.corda.node.services.statemachine
|
|||||||
import co.paralleluniverse.fibers.Fiber
|
import co.paralleluniverse.fibers.Fiber
|
||||||
import co.paralleluniverse.fibers.FiberExecutorScheduler
|
import co.paralleluniverse.fibers.FiberExecutorScheduler
|
||||||
import co.paralleluniverse.fibers.instrument.JavaAgent
|
import co.paralleluniverse.fibers.instrument.JavaAgent
|
||||||
|
import co.paralleluniverse.strands.channels.Channel
|
||||||
import com.codahale.metrics.Gauge
|
import com.codahale.metrics.Gauge
|
||||||
import com.google.common.util.concurrent.ThreadFactoryBuilder
|
import com.google.common.util.concurrent.ThreadFactoryBuilder
|
||||||
import net.corda.core.concurrent.CordaFuture
|
import net.corda.core.concurrent.CordaFuture
|
||||||
@ -18,7 +19,6 @@ import net.corda.core.internal.castIfPossible
|
|||||||
import net.corda.core.internal.concurrent.map
|
import net.corda.core.internal.concurrent.map
|
||||||
import net.corda.core.internal.concurrent.mapError
|
import net.corda.core.internal.concurrent.mapError
|
||||||
import net.corda.core.internal.concurrent.openFuture
|
import net.corda.core.internal.concurrent.openFuture
|
||||||
import net.corda.core.internal.mapNotNull
|
|
||||||
import net.corda.core.internal.uncheckedCast
|
import net.corda.core.internal.uncheckedCast
|
||||||
import net.corda.core.messaging.DataFeed
|
import net.corda.core.messaging.DataFeed
|
||||||
import net.corda.core.serialization.deserialize
|
import net.corda.core.serialization.deserialize
|
||||||
@ -55,7 +55,6 @@ import javax.annotation.concurrent.ThreadSafe
|
|||||||
import kotlin.collections.component1
|
import kotlin.collections.component1
|
||||||
import kotlin.collections.component2
|
import kotlin.collections.component2
|
||||||
import kotlin.collections.set
|
import kotlin.collections.set
|
||||||
import kotlin.streams.toList
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The StateMachineManagerImpl will always invoke the flow fibers on the given [AffinityExecutor], regardless of which
|
* The StateMachineManagerImpl will always invoke the flow fibers on the given [AffinityExecutor], regardless of which
|
||||||
@ -98,7 +97,7 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
private val flowTimeoutScheduler = FlowTimeoutScheduler(innerState, scheduledFutureExecutor, serviceHub)
|
private val flowTimeoutScheduler = FlowTimeoutScheduler(innerState, scheduledFutureExecutor, serviceHub)
|
||||||
private val ourSenderUUID = serviceHub.networkService.ourSenderUUID
|
private val ourSenderUUID = serviceHub.networkService.ourSenderUUID
|
||||||
|
|
||||||
private var checkpointSerializationContext: CheckpointSerializationContext? = null
|
private lateinit var checkpointSerializationContext: CheckpointSerializationContext
|
||||||
private lateinit var flowCreator: FlowCreator
|
private lateinit var flowCreator: FlowCreator
|
||||||
|
|
||||||
override val flowHospital: StaffedFlowHospital = makeFlowHospital()
|
override val flowHospital: StaffedFlowHospital = makeFlowHospital()
|
||||||
@ -168,12 +167,11 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
flowTimeoutScheduler::resetCustomTimeout
|
flowTimeoutScheduler::resetCustomTimeout
|
||||||
)
|
)
|
||||||
|
|
||||||
val fibers = restoreFlowsFromCheckpoints()
|
val (fibers, pausedFlows) = restoreFlowsFromCheckpoints()
|
||||||
metrics.register("Flows.InFlight", Gauge<Int> { innerState.flows.size })
|
metrics.register("Flows.InFlight", Gauge<Int> { innerState.flows.size })
|
||||||
|
|
||||||
setFlowDefaultUncaughtExceptionHandler()
|
setFlowDefaultUncaughtExceptionHandler()
|
||||||
|
|
||||||
val pausedFlows = restoreNonResidentFlowsFromPausedCheckpoints()
|
|
||||||
innerState.withLock {
|
innerState.withLock {
|
||||||
this.pausedFlows.putAll(pausedFlows)
|
this.pausedFlows.putAll(pausedFlows)
|
||||||
for ((id, flow) in pausedFlows) {
|
for ((id, flow) in pausedFlows) {
|
||||||
@ -366,30 +364,31 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
liveFibers.countUp()
|
liveFibers.countUp()
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun restoreFlowsFromCheckpoints(): List<Flow<*>> {
|
private fun restoreFlowsFromCheckpoints(): Pair<MutableMap<StateMachineRunId, Flow<*>>, MutableMap<StateMachineRunId, NonResidentFlow>> {
|
||||||
return checkpointStorage.getCheckpointsToRun().use {
|
val flows = mutableMapOf<StateMachineRunId, Flow<*>>()
|
||||||
it.mapNotNull { (id, serializedCheckpoint) ->
|
val pausedFlows = mutableMapOf<StateMachineRunId, NonResidentFlow>()
|
||||||
|
checkpointStorage.getCheckpointsToRun().forEach Checkpoints@{(id, serializedCheckpoint) ->
|
||||||
// If a flow is added before start() then don't attempt to restore it
|
// If a flow is added before start() then don't attempt to restore it
|
||||||
innerState.withLock { if (id in flows) return@mapNotNull null }
|
innerState.withLock { if (id in flows) return@Checkpoints }
|
||||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@mapNotNull null
|
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@Checkpoints
|
||||||
flowCreator.createFlowFromCheckpoint(id, checkpoint)
|
val flow = flowCreator.createFlowFromCheckpoint(id, checkpoint)
|
||||||
}.toList()
|
if (flow == null) {
|
||||||
|
// Set the flowState to paused so we don't waste memory storing it anymore.
|
||||||
|
pausedFlows[id] = NonResidentFlow(id, checkpoint.copy(flowState = FlowState.Paused), resumable = false)
|
||||||
|
} else {
|
||||||
|
flows[id] = flow
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
checkpointStorage.getPausedCheckpoints().forEach Checkpoints@{ (id, serializedCheckpoint) ->
|
||||||
|
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@Checkpoints
|
||||||
|
pausedFlows[id] = NonResidentFlow(id, checkpoint)
|
||||||
|
}
|
||||||
|
return Pair(flows, pausedFlows)
|
||||||
|
}
|
||||||
|
|
||||||
private fun restoreNonResidentFlowsFromPausedCheckpoints(): Map<StateMachineRunId, NonResidentFlow> {
|
private fun resumeRestoredFlows(flows: Map<StateMachineRunId, Flow<*>>) {
|
||||||
return checkpointStorage.getPausedCheckpoints().use {
|
for ((id, flow) in flows.entries) {
|
||||||
it.mapNotNull { (id, serializedCheckpoint) ->
|
addAndStartFlow(id, flow)
|
||||||
// If a flow is added before start() then don't attempt to restore it
|
|
||||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@mapNotNull null
|
|
||||||
id to NonResidentFlow(id, checkpoint)
|
|
||||||
}.toList().toMap()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun resumeRestoredFlows(flows: List<Flow<*>>) {
|
|
||||||
for (flow in flows) {
|
|
||||||
addAndStartFlow(flow.fiber.id, flow)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -406,19 +405,27 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
val flow = if (currentState.isAnyCheckpointPersisted) {
|
val flow = if (currentState.isAnyCheckpointPersisted) {
|
||||||
// We intentionally grab the checkpoint from storage rather than relying on the one referenced by currentState. This is so that
|
// We intentionally grab the checkpoint from storage rather than relying on the one referenced by currentState. This is so that
|
||||||
// we mirror exactly what happens when restarting the node.
|
// we mirror exactly what happens when restarting the node.
|
||||||
val serializedCheckpoint = database.transaction { checkpointStorage.getCheckpoint(flowId) }
|
val checkpoint = database.transaction {
|
||||||
|
val serializedCheckpoint = checkpointStorage.getCheckpoint(flowId)
|
||||||
if (serializedCheckpoint == null) {
|
if (serializedCheckpoint == null) {
|
||||||
logger.error("Unable to find database checkpoint for flow $flowId. Something is very wrong. The flow will not retry.")
|
logger.error("Unable to find database checkpoint for flow $flowId. Something is very wrong. The flow will not retry.")
|
||||||
return
|
return@transaction null
|
||||||
}
|
}
|
||||||
|
|
||||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, flowId) ?: return
|
tryDeserializeCheckpoint(serializedCheckpoint, flowId)?.also {
|
||||||
|
if (it.status == Checkpoint.FlowStatus.HOSPITALIZED) {
|
||||||
|
checkpointStorage.updateStatus(flowId, Checkpoint.FlowStatus.RUNNABLE)
|
||||||
|
}
|
||||||
|
} ?: return@transaction null
|
||||||
|
} ?: return
|
||||||
|
|
||||||
// Resurrect flow
|
// Resurrect flow
|
||||||
flowCreator.createFlowFromCheckpoint(
|
flowCreator.createFlowFromCheckpoint(
|
||||||
flowId,
|
flowId,
|
||||||
checkpoint,
|
checkpoint,
|
||||||
currentState.reloadCheckpointAfterSuspendCount,
|
currentState.reloadCheckpointAfterSuspendCount,
|
||||||
currentState.lock
|
currentState.lock,
|
||||||
|
firstRestore = false
|
||||||
) ?: return
|
) ?: return
|
||||||
} else {
|
} else {
|
||||||
// Just flow initiation message
|
// Just flow initiation message
|
||||||
@ -436,17 +443,56 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
injectOldProgressTracker(currentState.flowLogic.progressTracker, flow.fiber.logic)
|
injectOldProgressTracker(currentState.flowLogic.progressTracker, flow.fiber.logic)
|
||||||
addAndStartFlow(flowId, flow)
|
addAndStartFlow(flowId, flow)
|
||||||
}
|
}
|
||||||
// Deliver all the external events from the old flow instance.
|
extractAndScheduleEventsForRetry(oldFlowLeftOver, currentState)
|
||||||
val unprocessedExternalEvents = mutableListOf<ExternalEvent>()
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract all the [ExternalEvent] from this flows event queue and queue them (in the correct order) in the PausedFlow.
|
||||||
|
* This differs from [extractAndScheduleEventsForRetry] which also extracts (and schedules) [Event.Pause]. This means that if there are
|
||||||
|
* more events in the flows eventQueue then the flow won't pause again (after it is retried). These events are then scheduled (along
|
||||||
|
* with any [ExistingSessionMessage] which arrive in the interim) when the flow is retried.
|
||||||
|
*/
|
||||||
|
private fun extractAndQueueExternalEventsForPausedFlow(
|
||||||
|
currentEventQueue: Channel<Event>,
|
||||||
|
currentPendingDeduplicationHandlers: List<DeduplicationHandler>,
|
||||||
|
pausedFlow: NonResidentFlow
|
||||||
|
) {
|
||||||
|
pausedFlow.events += currentPendingDeduplicationHandlers.map{it.externalCause}
|
||||||
do {
|
do {
|
||||||
val event = oldFlowLeftOver.tryReceive()
|
val event = currentEventQueue.tryReceive()
|
||||||
if (event is Event.GeneratedByExternalEvent) {
|
if (event is Event.GeneratedByExternalEvent) {
|
||||||
unprocessedExternalEvents += event.deduplicationHandler.externalCause
|
pausedFlow.events.add(event.deduplicationHandler.externalCause)
|
||||||
}
|
}
|
||||||
} while (event != null)
|
} while (event != null)
|
||||||
val externalEvents = currentState.pendingDeduplicationHandlers.map { it.externalCause } + unprocessedExternalEvents
|
}
|
||||||
for (externalEvent in externalEvents) {
|
|
||||||
deliverExternalEvent(externalEvent)
|
|
||||||
|
/**
|
||||||
|
* Extract all the incomplete deduplication handlers as well as the [ExternalEvent] and [Event.Pause] events from this flows event queue
|
||||||
|
* [oldEventQueue]. Then schedule them (in the same order) for the new flow. This means that if a retried flow has a pause event
|
||||||
|
* scheduled then the retried flow will eventually pause. The new flow will not retry again if future retry events have been scheduled.
|
||||||
|
* When this method is called this flow must have been replaced by the new flow in [StateMachineInnerState.flows]. This method differs
|
||||||
|
* from [extractAndQueueExternalEventsForPausedFlow] where (only) [externalEvents] are extracted and scheduled straight away.
|
||||||
|
*/
|
||||||
|
private fun extractAndScheduleEventsForRetry(oldEventQueue: Channel<Event>, currentState: StateMachineState) {
|
||||||
|
val flow = innerState.withLock {
|
||||||
|
flows[currentState.flowLogic.runId]
|
||||||
|
}
|
||||||
|
val events = mutableListOf<Event>()
|
||||||
|
do {
|
||||||
|
val event = oldEventQueue.tryReceive()
|
||||||
|
if (event is Event.Pause || event is Event.GeneratedByExternalEvent) events.add(event)
|
||||||
|
} while (event != null)
|
||||||
|
|
||||||
|
for (externalEvent in currentState.pendingDeduplicationHandlers) {
|
||||||
|
deliverExternalEvent(externalEvent.externalCause)
|
||||||
|
}
|
||||||
|
for (event in events) {
|
||||||
|
if (event is Event.GeneratedByExternalEvent) {
|
||||||
|
deliverExternalEvent(event.deduplicationHandler.externalCause)
|
||||||
|
} else {
|
||||||
|
flow?.fiber?.scheduleEvent(event)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -485,7 +531,7 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
val sender = serviceHub.networkMapCache.getPeerByLegalName(peer)
|
val sender = serviceHub.networkMapCache.getPeerByLegalName(peer)
|
||||||
if (sender != null) {
|
if (sender != null) {
|
||||||
when (sessionMessage) {
|
when (sessionMessage) {
|
||||||
is ExistingSessionMessage -> onExistingSessionMessage(sessionMessage, event.deduplicationHandler, sender)
|
is ExistingSessionMessage -> onExistingSessionMessage(sessionMessage, sender, event)
|
||||||
is InitialSessionMessage -> onSessionInit(sessionMessage, sender, event)
|
is InitialSessionMessage -> onSessionInit(sessionMessage, sender, event)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -495,8 +541,13 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun onExistingSessionMessage(sessionMessage: ExistingSessionMessage, deduplicationHandler: DeduplicationHandler, sender: Party) {
|
private fun onExistingSessionMessage(
|
||||||
|
sessionMessage: ExistingSessionMessage,
|
||||||
|
sender: Party,
|
||||||
|
externalEvent: ExternalEvent.ExternalMessageEvent
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
|
val deduplicationHandler = externalEvent.deduplicationHandler
|
||||||
val recipientId = sessionMessage.recipientSessionId
|
val recipientId = sessionMessage.recipientSessionId
|
||||||
val flowId = sessionToFlow[recipientId]
|
val flowId = sessionToFlow[recipientId]
|
||||||
if (flowId == null) {
|
if (flowId == null) {
|
||||||
@ -515,7 +566,7 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
innerState.withLock {
|
innerState.withLock {
|
||||||
flows[flowId]?.run { fiber.scheduleEvent(event) }
|
flows[flowId]?.run { fiber.scheduleEvent(event) }
|
||||||
// If flow is not running add it to the list of external events to be processed if/when the flow resumes.
|
// If flow is not running add it to the list of external events to be processed if/when the flow resumes.
|
||||||
?: pausedFlows[flowId]?.run { addExternalEvent(event) }
|
?: pausedFlows[flowId]?.run { addExternalEvent(externalEvent) }
|
||||||
?: logger.info("Cannot find fiber corresponding to flow ID $flowId")
|
?: logger.info("Cannot find fiber corresponding to flow ID $flowId")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -632,7 +683,16 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
val flow = flowCreator.createFlowFromLogic(flowId, invocationContext, flowLogic, flowStart, ourIdentity, existingCheckpoint, deduplicationHandler, ourSenderUUID)
|
val flow = flowCreator.createFlowFromLogic(
|
||||||
|
flowId,
|
||||||
|
invocationContext,
|
||||||
|
flowLogic,
|
||||||
|
flowStart,
|
||||||
|
ourIdentity,
|
||||||
|
existingCheckpoint,
|
||||||
|
deduplicationHandler,
|
||||||
|
ourSenderUUID
|
||||||
|
)
|
||||||
val startedFuture = openFuture<Unit>()
|
val startedFuture = openFuture<Unit>()
|
||||||
innerState.withLock {
|
innerState.withLock {
|
||||||
startedFutures[flowId] = startedFuture
|
startedFutures[flowId] = startedFuture
|
||||||
@ -650,9 +710,29 @@ internal class SingleThreadedStateMachineManager(
|
|||||||
flowTimeoutScheduler.cancel(flowId)
|
flowTimeoutScheduler.cancel(flowId)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
override fun moveFlowToPaused(currentState: StateMachineState) {
|
||||||
|
currentState.cancelFutureIfRunning()
|
||||||
|
flowTimeoutScheduler.cancel(currentState.flowLogic.runId)
|
||||||
|
innerState.withLock {
|
||||||
|
val id = currentState.flowLogic.runId
|
||||||
|
val flow = flows.remove(id)
|
||||||
|
if (flow != null) {
|
||||||
|
decrementLiveFibers()
|
||||||
|
//Setting flowState = FlowState.Paused means we don't hold the frozen fiber in memory.
|
||||||
|
val checkpoint = currentState.checkpoint.copy(status = Checkpoint.FlowStatus.PAUSED, flowState = FlowState.Paused)
|
||||||
|
val pausedFlow = NonResidentFlow(id, checkpoint, flow.resultFuture)
|
||||||
|
val eventQueue = flow.fiber.transientValues.eventQueue
|
||||||
|
extractAndQueueExternalEventsForPausedFlow(eventQueue, currentState.pendingDeduplicationHandlers, pausedFlow)
|
||||||
|
pausedFlows.put(id, pausedFlow)
|
||||||
|
} else {
|
||||||
|
logger.warn("Flow $id already removed before pausing")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private fun tryDeserializeCheckpoint(serializedCheckpoint: Checkpoint.Serialized, flowId: StateMachineRunId): Checkpoint? {
|
private fun tryDeserializeCheckpoint(serializedCheckpoint: Checkpoint.Serialized, flowId: StateMachineRunId): Checkpoint? {
|
||||||
return try {
|
return try {
|
||||||
serializedCheckpoint.deserialize(checkpointSerializationContext!!)
|
serializedCheckpoint.deserialize(checkpointSerializationContext)
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
if (reloadCheckpointAfterSuspend && currentStateMachine() != null) {
|
if (reloadCheckpointAfterSuspend && currentStateMachine() != null) {
|
||||||
logger.error(
|
logger.error(
|
||||||
|
@ -104,6 +104,16 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging,
|
|||||||
*/
|
*/
|
||||||
private val flowsInHospital = ConcurrentHashMap<StateMachineRunId, FlowFiber>()
|
private val flowsInHospital = ConcurrentHashMap<StateMachineRunId, FlowFiber>()
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if the flow is currently being treated in the hospital.
|
||||||
|
* The differs to flows with a medical history (which can accessed via [StaffedFlowHospital.contains]).
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
internal fun flowInHospital(runId: StateMachineRunId): Boolean {
|
||||||
|
// The .keys avoids https://youtrack.jetbrains.com/issue/KT-18053
|
||||||
|
return runId in flowsInHospital.keys
|
||||||
|
}
|
||||||
|
|
||||||
private val mutex = ThreadBox(object {
|
private val mutex = ThreadBox(object {
|
||||||
/**
|
/**
|
||||||
* Contains medical history of every flow (a patient) that has entered the hospital. A flow can leave the hospital,
|
* Contains medical history of every flow (a patient) that has entered the hospital. A flow can leave the hospital,
|
||||||
|
@ -106,6 +106,7 @@ internal interface StateMachineManagerInternal {
|
|||||||
fun addSessionBinding(flowId: StateMachineRunId, sessionId: SessionId)
|
fun addSessionBinding(flowId: StateMachineRunId, sessionId: SessionId)
|
||||||
fun removeSessionBindings(sessionIds: Set<SessionId>)
|
fun removeSessionBindings(sessionIds: Set<SessionId>)
|
||||||
fun removeFlow(flowId: StateMachineRunId, removalReason: FlowRemovalReason, lastState: StateMachineState)
|
fun removeFlow(flowId: StateMachineRunId, removalReason: FlowRemovalReason, lastState: StateMachineState)
|
||||||
|
fun moveFlowToPaused(currentState: StateMachineState)
|
||||||
fun retryFlowFromSafePoint(currentState: StateMachineState)
|
fun retryFlowFromSafePoint(currentState: StateMachineState)
|
||||||
fun scheduleFlowTimeout(flowId: StateMachineRunId)
|
fun scheduleFlowTimeout(flowId: StateMachineRunId)
|
||||||
fun cancelFlowTimeout(flowId: StateMachineRunId)
|
fun cancelFlowTimeout(flowId: StateMachineRunId)
|
||||||
|
@ -62,6 +62,7 @@ class TopLevelTransition(
|
|||||||
is Event.ReloadFlowFromCheckpointAfterSuspend -> reloadFlowFromCheckpointAfterSuspendTransition()
|
is Event.ReloadFlowFromCheckpointAfterSuspend -> reloadFlowFromCheckpointAfterSuspendTransition()
|
||||||
is Event.OvernightObservation -> overnightObservationTransition()
|
is Event.OvernightObservation -> overnightObservationTransition()
|
||||||
is Event.WakeUpFromSleep -> wakeUpFromSleepTransition()
|
is Event.WakeUpFromSleep -> wakeUpFromSleepTransition()
|
||||||
|
is Event.Pause -> pausedFlowTransition()
|
||||||
is Event.TerminateSessions -> terminateSessionsTransition(event)
|
is Event.TerminateSessions -> terminateSessionsTransition(event)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -368,6 +369,22 @@ class TopLevelTransition(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private fun pausedFlowTransition(): TransitionResult {
|
||||||
|
return builder {
|
||||||
|
if (!startingState.isFlowResumed) {
|
||||||
|
actions.add(Action.CreateTransaction)
|
||||||
|
}
|
||||||
|
actions.addAll(
|
||||||
|
arrayOf(
|
||||||
|
Action.UpdateFlowStatus(context.id, Checkpoint.FlowStatus.PAUSED),
|
||||||
|
Action.CommitTransaction,
|
||||||
|
Action.MoveFlowToPaused(currentState)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
FlowContinuation.Abort
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private fun terminateSessionsTransition(event: Event.TerminateSessions): TransitionResult {
|
private fun terminateSessionsTransition(event: Event.TerminateSessions): TransitionResult {
|
||||||
return builder {
|
return builder {
|
||||||
val sessions = event.sessions
|
val sessions = event.sessions
|
||||||
|
@ -799,8 +799,8 @@ class DBCheckpointStorageTests {
|
|||||||
val (extractedId, extractedCheckpoint) = checkpointStorage.getPausedCheckpoints().toList().single()
|
val (extractedId, extractedCheckpoint) = checkpointStorage.getPausedCheckpoints().toList().single()
|
||||||
assertEquals(id, extractedId)
|
assertEquals(id, extractedId)
|
||||||
//We don't extract the result or the flowstate from a paused checkpoint
|
//We don't extract the result or the flowstate from a paused checkpoint
|
||||||
assertEquals(null, extractedCheckpoint.serializedFlowState)
|
assertNull(extractedCheckpoint.serializedFlowState)
|
||||||
assertEquals(null, extractedCheckpoint.result)
|
assertNull(extractedCheckpoint.result)
|
||||||
|
|
||||||
assertEquals(pausedCheckpoint.status, extractedCheckpoint.status)
|
assertEquals(pausedCheckpoint.status, extractedCheckpoint.status)
|
||||||
assertEquals(pausedCheckpoint.progressStep, extractedCheckpoint.progressStep)
|
assertEquals(pausedCheckpoint.progressStep, extractedCheckpoint.progressStep)
|
||||||
@ -890,6 +890,24 @@ class DBCheckpointStorageTests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(timeout = 300_000)
|
||||||
|
fun `update only compatible`() {
|
||||||
|
val (id, checkpoint) = newCheckpoint()
|
||||||
|
val serializedFlowState = checkpoint.serializeFlowState()
|
||||||
|
database.transaction {
|
||||||
|
checkpointStorage.addCheckpoint(id, checkpoint, serializedFlowState, checkpoint.serializeCheckpointState())
|
||||||
|
}
|
||||||
|
database.transaction {
|
||||||
|
checkpointStorage.updateCompatible(id, !checkpoint.compatible)
|
||||||
|
}
|
||||||
|
database.transaction {
|
||||||
|
assertEquals(
|
||||||
|
checkpoint.copy(compatible = !checkpoint.compatible),
|
||||||
|
checkpointStorage.checkpoints().single().deserialize()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
data class IdAndCheckpoint(val id: StateMachineRunId, val checkpoint: Checkpoint)
|
data class IdAndCheckpoint(val id: StateMachineRunId, val checkpoint: Checkpoint)
|
||||||
|
|
||||||
private fun changeStatus(oldCheckpoint: Checkpoint, status: Checkpoint.FlowStatus): IdAndCheckpoint {
|
private fun changeStatus(oldCheckpoint: Checkpoint, status: Checkpoint.FlowStatus): IdAndCheckpoint {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user