mirror of
https://github.com/corda/corda.git
synced 2024-12-18 20:47:57 +00:00
Merge pull request #6578 from corda/WillV/ENT-5395-Pause-and-Resume-Flows
ENT-5395 Pause and Resume Flows
This commit is contained in:
commit
f280ec9fd9
@ -23,6 +23,16 @@ interface CheckpointStorage {
|
||||
fun updateCheckpoint(id: StateMachineRunId, checkpoint: Checkpoint, serializedFlowState: SerializedBytes<FlowState>?,
|
||||
serializedCheckpointState: SerializedBytes<CheckpointState>)
|
||||
|
||||
/**
|
||||
* Update an existing checkpoints status ([Checkpoint.status]).
|
||||
*/
|
||||
fun updateStatus(runId: StateMachineRunId, flowStatus: Checkpoint.FlowStatus)
|
||||
|
||||
/**
|
||||
* Update an existing checkpoints compatibility flag ([Checkpoint.compatible]).
|
||||
*/
|
||||
fun updateCompatible(runId: StateMachineRunId, compatible: Boolean)
|
||||
|
||||
/**
|
||||
* Update all persisted checkpoints with status [Checkpoint.FlowStatus.RUNNABLE] or [Checkpoint.FlowStatus.HOSPITALIZED],
|
||||
* changing the status to [Checkpoint.FlowStatus.PAUSED].
|
||||
@ -65,6 +75,4 @@ interface CheckpointStorage {
|
||||
* This method does not fetch [Checkpoint.Serialized.serializedFlowState] to save memory.
|
||||
*/
|
||||
fun getPausedCheckpoints(): Stream<Pair<StateMachineRunId, Checkpoint.Serialized>>
|
||||
|
||||
fun updateStatus(runId: StateMachineRunId, flowStatus: Checkpoint.FlowStatus)
|
||||
}
|
||||
|
@ -504,6 +504,11 @@ class DBCheckpointStorage(
|
||||
currentDBSession().createNativeQuery(update).executeUpdate()
|
||||
}
|
||||
|
||||
override fun updateCompatible(runId: StateMachineRunId, compatible: Boolean) {
|
||||
val update = "Update ${NODE_DATABASE_PREFIX}checkpoints set compatible = $compatible where flow_id = '${runId.uuid}'"
|
||||
currentDBSession().createNativeQuery(update).executeUpdate()
|
||||
}
|
||||
|
||||
private fun createDBFlowMetadata(flowId: String, checkpoint: Checkpoint): DBFlowMetadata {
|
||||
val context = checkpoint.checkpointState.invocationContext
|
||||
val flowInfo = checkpoint.checkpointState.subFlowStack.first()
|
||||
|
@ -57,6 +57,11 @@ sealed class Action {
|
||||
*/
|
||||
data class PersistCheckpoint(val id: StateMachineRunId, val checkpoint: Checkpoint, val isCheckpointUpdate: Boolean) : Action()
|
||||
|
||||
/**
|
||||
* Update only the [status] of the checkpoint with [id].
|
||||
*/
|
||||
data class UpdateFlowStatus(val id: StateMachineRunId, val status: Checkpoint.FlowStatus): Action()
|
||||
|
||||
/**
|
||||
* Remove the checkpoint corresponding to [id].
|
||||
*/
|
||||
@ -106,6 +111,11 @@ sealed class Action {
|
||||
val lastState: StateMachineState
|
||||
) : Action()
|
||||
|
||||
/**
|
||||
* Move the flow corresponding to [flowId] to paused.
|
||||
*/
|
||||
data class MoveFlowToPaused(val currentState: StateMachineState) : Action()
|
||||
|
||||
/**
|
||||
* Schedule [event] to self.
|
||||
*/
|
||||
|
@ -67,6 +67,8 @@ internal class ActionExecutorImpl(
|
||||
is Action.RetryFlowFromSafePoint -> executeRetryFlowFromSafePoint(action)
|
||||
is Action.ScheduleFlowTimeout -> scheduleFlowTimeout(action)
|
||||
is Action.CancelFlowTimeout -> cancelFlowTimeout(action)
|
||||
is Action.MoveFlowToPaused -> executeMoveFlowToPaused(action)
|
||||
is Action.UpdateFlowStatus -> executeUpdateFlowStatus(action)
|
||||
}
|
||||
}
|
||||
private fun executeReleaseSoftLocks(action: Action.ReleaseSoftLocks) {
|
||||
@ -99,6 +101,11 @@ internal class ActionExecutorImpl(
|
||||
}
|
||||
}
|
||||
|
||||
@Suspendable
|
||||
private fun executeUpdateFlowStatus(action: Action.UpdateFlowStatus) {
|
||||
checkpointStorage.updateStatus(action.id, action.status)
|
||||
}
|
||||
|
||||
@Suspendable
|
||||
private fun executePersistDeduplicationIds(action: Action.PersistDeduplicationFacts) {
|
||||
for (handle in action.deduplicationHandlers) {
|
||||
@ -191,6 +198,11 @@ internal class ActionExecutorImpl(
|
||||
stateMachineManager.removeFlow(action.flowId, action.removalReason, action.lastState)
|
||||
}
|
||||
|
||||
@Suspendable
|
||||
private fun executeMoveFlowToPaused(action: Action.MoveFlowToPaused) {
|
||||
stateMachineManager.moveFlowToPaused(action.currentState)
|
||||
}
|
||||
|
||||
@Suspendable
|
||||
@Throws(SQLException::class)
|
||||
private fun executeCreateTransaction() {
|
||||
|
@ -179,6 +179,13 @@ sealed class Event {
|
||||
override fun toString() = "WakeUpSleepyFlow"
|
||||
}
|
||||
|
||||
/**
|
||||
* Pause the flow.
|
||||
*/
|
||||
object Pause: Event() {
|
||||
override fun toString() = "Pause"
|
||||
}
|
||||
|
||||
/**
|
||||
* Terminate the specified [sessions], removing them from in-memory datastructures.
|
||||
*
|
||||
|
@ -19,7 +19,6 @@ import net.corda.core.utilities.contextLogger
|
||||
import net.corda.node.services.api.CheckpointStorage
|
||||
import net.corda.node.services.api.ServiceHubInternal
|
||||
import net.corda.node.services.messaging.DeduplicationHandler
|
||||
import net.corda.node.services.statemachine.FlowStateMachineImpl.Companion.currentStateMachine
|
||||
import net.corda.node.services.statemachine.transitions.StateMachine
|
||||
import net.corda.node.utilities.isEnabledTimedFlow
|
||||
import net.corda.nodeapi.internal.persistence.CordaPersistence
|
||||
@ -29,11 +28,16 @@ import java.util.concurrent.Semaphore
|
||||
|
||||
class Flow<A>(val fiber: FlowStateMachineImpl<A>, val resultFuture: OpenFuture<Any?>)
|
||||
|
||||
class NonResidentFlow(val runId: StateMachineRunId, val checkpoint: Checkpoint) {
|
||||
val externalEvents = mutableListOf<Event.DeliverSessionMessage>()
|
||||
data class NonResidentFlow(
|
||||
val runId: StateMachineRunId,
|
||||
var checkpoint: Checkpoint,
|
||||
val resultFuture: OpenFuture<Any?> = openFuture(),
|
||||
val resumable: Boolean = true
|
||||
) {
|
||||
val events = mutableListOf<ExternalEvent>()
|
||||
|
||||
fun addExternalEvent(message: Event.DeliverSessionMessage) {
|
||||
externalEvents.add(message)
|
||||
fun addExternalEvent(message: ExternalEvent) {
|
||||
events.add(message)
|
||||
}
|
||||
}
|
||||
|
||||
@ -66,18 +70,29 @@ class FlowCreator(
|
||||
}
|
||||
else -> nonResidentFlow.checkpoint
|
||||
}
|
||||
return createFlowFromCheckpoint(nonResidentFlow.runId, checkpoint)
|
||||
return createFlowFromCheckpoint(nonResidentFlow.runId, checkpoint, resultFuture = nonResidentFlow.resultFuture)
|
||||
}
|
||||
|
||||
@Suppress("LongParameterList")
|
||||
fun createFlowFromCheckpoint(
|
||||
runId: StateMachineRunId,
|
||||
oldCheckpoint: Checkpoint,
|
||||
reloadCheckpointAfterSuspendCount: Int? = null,
|
||||
lock: Semaphore = Semaphore(1)
|
||||
lock: Semaphore = Semaphore(1),
|
||||
resultFuture: OpenFuture<Any?> = openFuture(),
|
||||
firstRestore: Boolean = true
|
||||
): Flow<*>? {
|
||||
val checkpoint = oldCheckpoint.copy(status = Checkpoint.FlowStatus.RUNNABLE)
|
||||
val fiber = checkpoint.getFiberFromCheckpoint(runId) ?: return null
|
||||
val resultFuture = openFuture<Any?>()
|
||||
val fiber = oldCheckpoint.getFiberFromCheckpoint(runId, firstRestore)
|
||||
var checkpoint = oldCheckpoint
|
||||
if (fiber == null) {
|
||||
updateCompatibleInDb(runId, false)
|
||||
return null
|
||||
} else if (!oldCheckpoint.compatible) {
|
||||
updateCompatibleInDb(runId, true)
|
||||
checkpoint = checkpoint.copy(compatible = true)
|
||||
}
|
||||
checkpoint = checkpoint.copy(status = Checkpoint.FlowStatus.RUNNABLE)
|
||||
|
||||
fiber.logic.stateMachine = fiber
|
||||
verifyFlowLogicIsSuspendable(fiber.logic)
|
||||
fiber.transientValues = createTransientValues(runId, resultFuture)
|
||||
@ -92,6 +107,12 @@ class FlowCreator(
|
||||
return Flow(fiber, resultFuture)
|
||||
}
|
||||
|
||||
private fun updateCompatibleInDb(runId: StateMachineRunId, compatible: Boolean) {
|
||||
database.transaction {
|
||||
checkpointStorage.updateCompatible(runId, compatible)
|
||||
}
|
||||
}
|
||||
|
||||
@Suppress("LongParameterList")
|
||||
fun <A> createFlowFromLogic(
|
||||
flowId: StateMachineRunId,
|
||||
@ -135,36 +156,45 @@ class FlowCreator(
|
||||
return Flow(flowStateMachineImpl, resultFuture)
|
||||
}
|
||||
|
||||
private fun Checkpoint.getFiberFromCheckpoint(runId: StateMachineRunId): FlowStateMachineImpl<*>? {
|
||||
return when (this.flowState) {
|
||||
is FlowState.Unstarted -> {
|
||||
val logic = tryCheckpointDeserialize(this.flowState.frozenFlowLogic, runId) ?: return null
|
||||
FlowStateMachineImpl(runId, logic, scheduler)
|
||||
}
|
||||
is FlowState.Started -> tryCheckpointDeserialize(this.flowState.frozenFiber, runId) ?: return null
|
||||
// Places calling this function is rely on it to return null if the flow cannot be created from the checkpoint.
|
||||
else -> null
|
||||
}
|
||||
}
|
||||
|
||||
@Suppress("TooGenericExceptionCaught")
|
||||
private inline fun <reified T : Any> tryCheckpointDeserialize(bytes: SerializedBytes<T>, flowId: StateMachineRunId): T? {
|
||||
return try {
|
||||
bytes.checkpointDeserialize(context = checkpointSerializationContext)
|
||||
private fun Checkpoint.getFiberFromCheckpoint(runId: StateMachineRunId, firstRestore: Boolean): FlowStateMachineImpl<*>? {
|
||||
try {
|
||||
return when(flowState) {
|
||||
is FlowState.Unstarted -> {
|
||||
val logic = deserializeFlowState(flowState.frozenFlowLogic)
|
||||
FlowStateMachineImpl(runId, logic, scheduler)
|
||||
}
|
||||
is FlowState.Started -> deserializeFlowState(flowState.frozenFiber)
|
||||
// Places calling this function is rely on it to return null if the flow cannot be created from the checkpoint.
|
||||
else -> return null
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
if (reloadCheckpointAfterSuspend && currentStateMachine() != null) {
|
||||
if (reloadCheckpointAfterSuspend && FlowStateMachineImpl.currentStateMachine() != null) {
|
||||
logger.error(
|
||||
"Unable to deserialize checkpoint for flow $flowId. [reloadCheckpointAfterSuspend] is turned on, throwing exception",
|
||||
e
|
||||
"Unable to deserialize checkpoint for flow $runId. [reloadCheckpointAfterSuspend] is turned on, throwing exception",
|
||||
e
|
||||
)
|
||||
throw ReloadFlowFromCheckpointException(e)
|
||||
} else {
|
||||
logger.error("Unable to deserialize checkpoint for flow $flowId. Something is very wrong and this flow will be ignored.", e)
|
||||
null
|
||||
logSerializationError(firstRestore, runId, e)
|
||||
return null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private inline fun <reified T : Any> deserializeFlowState(bytes: SerializedBytes<T>): T {
|
||||
return bytes.checkpointDeserialize(context = checkpointSerializationContext)
|
||||
}
|
||||
|
||||
private fun logSerializationError(firstRestore: Boolean, flowId: StateMachineRunId, exception: Exception) {
|
||||
if (firstRestore) {
|
||||
logger.warn("Flow with id $flowId could not be restored from its checkpoint. Normally this means that a CorDapp has been" +
|
||||
" upgraded without draining the node. To run this flow restart the node after downgrading the CorDapp.", exception)
|
||||
} else {
|
||||
logger.error("Unable to deserialize fiber for flow $flowId. Something is very wrong and this flow will be ignored.", exception)
|
||||
}
|
||||
}
|
||||
|
||||
private fun verifyFlowLogicIsSuspendable(logic: FlowLogic<Any?>) {
|
||||
// Quasar requires (in Java 8) that at least the call method be annotated suspendable. Unfortunately, it's
|
||||
// easy to forget to add this when creating a new flow, so we check here to give the user a better error.
|
||||
@ -219,4 +249,4 @@ class FlowCreator(
|
||||
lock = lock
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package net.corda.node.services.statemachine
|
||||
import co.paralleluniverse.fibers.Fiber
|
||||
import co.paralleluniverse.fibers.FiberExecutorScheduler
|
||||
import co.paralleluniverse.fibers.instrument.JavaAgent
|
||||
import co.paralleluniverse.strands.channels.Channel
|
||||
import com.codahale.metrics.Gauge
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder
|
||||
import net.corda.core.concurrent.CordaFuture
|
||||
@ -18,7 +19,6 @@ import net.corda.core.internal.castIfPossible
|
||||
import net.corda.core.internal.concurrent.map
|
||||
import net.corda.core.internal.concurrent.mapError
|
||||
import net.corda.core.internal.concurrent.openFuture
|
||||
import net.corda.core.internal.mapNotNull
|
||||
import net.corda.core.internal.uncheckedCast
|
||||
import net.corda.core.messaging.DataFeed
|
||||
import net.corda.core.serialization.deserialize
|
||||
@ -55,7 +55,6 @@ import javax.annotation.concurrent.ThreadSafe
|
||||
import kotlin.collections.component1
|
||||
import kotlin.collections.component2
|
||||
import kotlin.collections.set
|
||||
import kotlin.streams.toList
|
||||
|
||||
/**
|
||||
* The StateMachineManagerImpl will always invoke the flow fibers on the given [AffinityExecutor], regardless of which
|
||||
@ -98,7 +97,7 @@ internal class SingleThreadedStateMachineManager(
|
||||
private val flowTimeoutScheduler = FlowTimeoutScheduler(innerState, scheduledFutureExecutor, serviceHub)
|
||||
private val ourSenderUUID = serviceHub.networkService.ourSenderUUID
|
||||
|
||||
private var checkpointSerializationContext: CheckpointSerializationContext? = null
|
||||
private lateinit var checkpointSerializationContext: CheckpointSerializationContext
|
||||
private lateinit var flowCreator: FlowCreator
|
||||
|
||||
override val flowHospital: StaffedFlowHospital = makeFlowHospital()
|
||||
@ -168,12 +167,11 @@ internal class SingleThreadedStateMachineManager(
|
||||
flowTimeoutScheduler::resetCustomTimeout
|
||||
)
|
||||
|
||||
val fibers = restoreFlowsFromCheckpoints()
|
||||
val (fibers, pausedFlows) = restoreFlowsFromCheckpoints()
|
||||
metrics.register("Flows.InFlight", Gauge<Int> { innerState.flows.size })
|
||||
|
||||
setFlowDefaultUncaughtExceptionHandler()
|
||||
|
||||
val pausedFlows = restoreNonResidentFlowsFromPausedCheckpoints()
|
||||
innerState.withLock {
|
||||
this.pausedFlows.putAll(pausedFlows)
|
||||
for ((id, flow) in pausedFlows) {
|
||||
@ -366,30 +364,31 @@ internal class SingleThreadedStateMachineManager(
|
||||
liveFibers.countUp()
|
||||
}
|
||||
|
||||
private fun restoreFlowsFromCheckpoints(): List<Flow<*>> {
|
||||
return checkpointStorage.getCheckpointsToRun().use {
|
||||
it.mapNotNull { (id, serializedCheckpoint) ->
|
||||
// If a flow is added before start() then don't attempt to restore it
|
||||
innerState.withLock { if (id in flows) return@mapNotNull null }
|
||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@mapNotNull null
|
||||
flowCreator.createFlowFromCheckpoint(id, checkpoint)
|
||||
}.toList()
|
||||
private fun restoreFlowsFromCheckpoints(): Pair<MutableMap<StateMachineRunId, Flow<*>>, MutableMap<StateMachineRunId, NonResidentFlow>> {
|
||||
val flows = mutableMapOf<StateMachineRunId, Flow<*>>()
|
||||
val pausedFlows = mutableMapOf<StateMachineRunId, NonResidentFlow>()
|
||||
checkpointStorage.getCheckpointsToRun().forEach Checkpoints@{(id, serializedCheckpoint) ->
|
||||
// If a flow is added before start() then don't attempt to restore it
|
||||
innerState.withLock { if (id in flows) return@Checkpoints }
|
||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@Checkpoints
|
||||
val flow = flowCreator.createFlowFromCheckpoint(id, checkpoint)
|
||||
if (flow == null) {
|
||||
// Set the flowState to paused so we don't waste memory storing it anymore.
|
||||
pausedFlows[id] = NonResidentFlow(id, checkpoint.copy(flowState = FlowState.Paused), resumable = false)
|
||||
} else {
|
||||
flows[id] = flow
|
||||
}
|
||||
}
|
||||
checkpointStorage.getPausedCheckpoints().forEach Checkpoints@{ (id, serializedCheckpoint) ->
|
||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@Checkpoints
|
||||
pausedFlows[id] = NonResidentFlow(id, checkpoint)
|
||||
}
|
||||
return Pair(flows, pausedFlows)
|
||||
}
|
||||
|
||||
private fun restoreNonResidentFlowsFromPausedCheckpoints(): Map<StateMachineRunId, NonResidentFlow> {
|
||||
return checkpointStorage.getPausedCheckpoints().use {
|
||||
it.mapNotNull { (id, serializedCheckpoint) ->
|
||||
// If a flow is added before start() then don't attempt to restore it
|
||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, id) ?: return@mapNotNull null
|
||||
id to NonResidentFlow(id, checkpoint)
|
||||
}.toList().toMap()
|
||||
}
|
||||
}
|
||||
|
||||
private fun resumeRestoredFlows(flows: List<Flow<*>>) {
|
||||
for (flow in flows) {
|
||||
addAndStartFlow(flow.fiber.id, flow)
|
||||
private fun resumeRestoredFlows(flows: Map<StateMachineRunId, Flow<*>>) {
|
||||
for ((id, flow) in flows.entries) {
|
||||
addAndStartFlow(id, flow)
|
||||
}
|
||||
}
|
||||
|
||||
@ -406,19 +405,27 @@ internal class SingleThreadedStateMachineManager(
|
||||
val flow = if (currentState.isAnyCheckpointPersisted) {
|
||||
// We intentionally grab the checkpoint from storage rather than relying on the one referenced by currentState. This is so that
|
||||
// we mirror exactly what happens when restarting the node.
|
||||
val serializedCheckpoint = database.transaction { checkpointStorage.getCheckpoint(flowId) }
|
||||
if (serializedCheckpoint == null) {
|
||||
logger.error("Unable to find database checkpoint for flow $flowId. Something is very wrong. The flow will not retry.")
|
||||
return
|
||||
}
|
||||
val checkpoint = database.transaction {
|
||||
val serializedCheckpoint = checkpointStorage.getCheckpoint(flowId)
|
||||
if (serializedCheckpoint == null) {
|
||||
logger.error("Unable to find database checkpoint for flow $flowId. Something is very wrong. The flow will not retry.")
|
||||
return@transaction null
|
||||
}
|
||||
|
||||
tryDeserializeCheckpoint(serializedCheckpoint, flowId)?.also {
|
||||
if (it.status == Checkpoint.FlowStatus.HOSPITALIZED) {
|
||||
checkpointStorage.updateStatus(flowId, Checkpoint.FlowStatus.RUNNABLE)
|
||||
}
|
||||
} ?: return@transaction null
|
||||
} ?: return
|
||||
|
||||
val checkpoint = tryDeserializeCheckpoint(serializedCheckpoint, flowId) ?: return
|
||||
// Resurrect flow
|
||||
flowCreator.createFlowFromCheckpoint(
|
||||
flowId,
|
||||
checkpoint,
|
||||
currentState.reloadCheckpointAfterSuspendCount,
|
||||
currentState.lock
|
||||
currentState.lock,
|
||||
firstRestore = false
|
||||
) ?: return
|
||||
} else {
|
||||
// Just flow initiation message
|
||||
@ -436,17 +443,56 @@ internal class SingleThreadedStateMachineManager(
|
||||
injectOldProgressTracker(currentState.flowLogic.progressTracker, flow.fiber.logic)
|
||||
addAndStartFlow(flowId, flow)
|
||||
}
|
||||
// Deliver all the external events from the old flow instance.
|
||||
val unprocessedExternalEvents = mutableListOf<ExternalEvent>()
|
||||
do {
|
||||
val event = oldFlowLeftOver.tryReceive()
|
||||
if (event is Event.GeneratedByExternalEvent) {
|
||||
unprocessedExternalEvents += event.deduplicationHandler.externalCause
|
||||
}
|
||||
} while (event != null)
|
||||
val externalEvents = currentState.pendingDeduplicationHandlers.map { it.externalCause } + unprocessedExternalEvents
|
||||
for (externalEvent in externalEvents) {
|
||||
deliverExternalEvent(externalEvent)
|
||||
extractAndScheduleEventsForRetry(oldFlowLeftOver, currentState)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all the [ExternalEvent] from this flows event queue and queue them (in the correct order) in the PausedFlow.
|
||||
* This differs from [extractAndScheduleEventsForRetry] which also extracts (and schedules) [Event.Pause]. This means that if there are
|
||||
* more events in the flows eventQueue then the flow won't pause again (after it is retried). These events are then scheduled (along
|
||||
* with any [ExistingSessionMessage] which arrive in the interim) when the flow is retried.
|
||||
*/
|
||||
private fun extractAndQueueExternalEventsForPausedFlow(
|
||||
currentEventQueue: Channel<Event>,
|
||||
currentPendingDeduplicationHandlers: List<DeduplicationHandler>,
|
||||
pausedFlow: NonResidentFlow
|
||||
) {
|
||||
pausedFlow.events += currentPendingDeduplicationHandlers.map{it.externalCause}
|
||||
do {
|
||||
val event = currentEventQueue.tryReceive()
|
||||
if (event is Event.GeneratedByExternalEvent) {
|
||||
pausedFlow.events.add(event.deduplicationHandler.externalCause)
|
||||
}
|
||||
} while (event != null)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extract all the incomplete deduplication handlers as well as the [ExternalEvent] and [Event.Pause] events from this flows event queue
|
||||
* [oldEventQueue]. Then schedule them (in the same order) for the new flow. This means that if a retried flow has a pause event
|
||||
* scheduled then the retried flow will eventually pause. The new flow will not retry again if future retry events have been scheduled.
|
||||
* When this method is called this flow must have been replaced by the new flow in [StateMachineInnerState.flows]. This method differs
|
||||
* from [extractAndQueueExternalEventsForPausedFlow] where (only) [externalEvents] are extracted and scheduled straight away.
|
||||
*/
|
||||
private fun extractAndScheduleEventsForRetry(oldEventQueue: Channel<Event>, currentState: StateMachineState) {
|
||||
val flow = innerState.withLock {
|
||||
flows[currentState.flowLogic.runId]
|
||||
}
|
||||
val events = mutableListOf<Event>()
|
||||
do {
|
||||
val event = oldEventQueue.tryReceive()
|
||||
if (event is Event.Pause || event is Event.GeneratedByExternalEvent) events.add(event)
|
||||
} while (event != null)
|
||||
|
||||
for (externalEvent in currentState.pendingDeduplicationHandlers) {
|
||||
deliverExternalEvent(externalEvent.externalCause)
|
||||
}
|
||||
for (event in events) {
|
||||
if (event is Event.GeneratedByExternalEvent) {
|
||||
deliverExternalEvent(event.deduplicationHandler.externalCause)
|
||||
} else {
|
||||
flow?.fiber?.scheduleEvent(event)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -485,7 +531,7 @@ internal class SingleThreadedStateMachineManager(
|
||||
val sender = serviceHub.networkMapCache.getPeerByLegalName(peer)
|
||||
if (sender != null) {
|
||||
when (sessionMessage) {
|
||||
is ExistingSessionMessage -> onExistingSessionMessage(sessionMessage, event.deduplicationHandler, sender)
|
||||
is ExistingSessionMessage -> onExistingSessionMessage(sessionMessage, sender, event)
|
||||
is InitialSessionMessage -> onSessionInit(sessionMessage, sender, event)
|
||||
}
|
||||
} else {
|
||||
@ -495,8 +541,13 @@ internal class SingleThreadedStateMachineManager(
|
||||
}
|
||||
}
|
||||
|
||||
private fun onExistingSessionMessage(sessionMessage: ExistingSessionMessage, deduplicationHandler: DeduplicationHandler, sender: Party) {
|
||||
private fun onExistingSessionMessage(
|
||||
sessionMessage: ExistingSessionMessage,
|
||||
sender: Party,
|
||||
externalEvent: ExternalEvent.ExternalMessageEvent
|
||||
) {
|
||||
try {
|
||||
val deduplicationHandler = externalEvent.deduplicationHandler
|
||||
val recipientId = sessionMessage.recipientSessionId
|
||||
val flowId = sessionToFlow[recipientId]
|
||||
if (flowId == null) {
|
||||
@ -515,7 +566,7 @@ internal class SingleThreadedStateMachineManager(
|
||||
innerState.withLock {
|
||||
flows[flowId]?.run { fiber.scheduleEvent(event) }
|
||||
// If flow is not running add it to the list of external events to be processed if/when the flow resumes.
|
||||
?: pausedFlows[flowId]?.run { addExternalEvent(event) }
|
||||
?: pausedFlows[flowId]?.run { addExternalEvent(externalEvent) }
|
||||
?: logger.info("Cannot find fiber corresponding to flow ID $flowId")
|
||||
}
|
||||
}
|
||||
@ -632,7 +683,16 @@ internal class SingleThreadedStateMachineManager(
|
||||
null
|
||||
}
|
||||
|
||||
val flow = flowCreator.createFlowFromLogic(flowId, invocationContext, flowLogic, flowStart, ourIdentity, existingCheckpoint, deduplicationHandler, ourSenderUUID)
|
||||
val flow = flowCreator.createFlowFromLogic(
|
||||
flowId,
|
||||
invocationContext,
|
||||
flowLogic,
|
||||
flowStart,
|
||||
ourIdentity,
|
||||
existingCheckpoint,
|
||||
deduplicationHandler,
|
||||
ourSenderUUID
|
||||
)
|
||||
val startedFuture = openFuture<Unit>()
|
||||
innerState.withLock {
|
||||
startedFutures[flowId] = startedFuture
|
||||
@ -650,9 +710,29 @@ internal class SingleThreadedStateMachineManager(
|
||||
flowTimeoutScheduler.cancel(flowId)
|
||||
}
|
||||
|
||||
override fun moveFlowToPaused(currentState: StateMachineState) {
|
||||
currentState.cancelFutureIfRunning()
|
||||
flowTimeoutScheduler.cancel(currentState.flowLogic.runId)
|
||||
innerState.withLock {
|
||||
val id = currentState.flowLogic.runId
|
||||
val flow = flows.remove(id)
|
||||
if (flow != null) {
|
||||
decrementLiveFibers()
|
||||
//Setting flowState = FlowState.Paused means we don't hold the frozen fiber in memory.
|
||||
val checkpoint = currentState.checkpoint.copy(status = Checkpoint.FlowStatus.PAUSED, flowState = FlowState.Paused)
|
||||
val pausedFlow = NonResidentFlow(id, checkpoint, flow.resultFuture)
|
||||
val eventQueue = flow.fiber.transientValues.eventQueue
|
||||
extractAndQueueExternalEventsForPausedFlow(eventQueue, currentState.pendingDeduplicationHandlers, pausedFlow)
|
||||
pausedFlows.put(id, pausedFlow)
|
||||
} else {
|
||||
logger.warn("Flow $id already removed before pausing")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun tryDeserializeCheckpoint(serializedCheckpoint: Checkpoint.Serialized, flowId: StateMachineRunId): Checkpoint? {
|
||||
return try {
|
||||
serializedCheckpoint.deserialize(checkpointSerializationContext!!)
|
||||
serializedCheckpoint.deserialize(checkpointSerializationContext)
|
||||
} catch (e: Exception) {
|
||||
if (reloadCheckpointAfterSuspend && currentStateMachine() != null) {
|
||||
logger.error(
|
||||
|
@ -104,6 +104,16 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging,
|
||||
*/
|
||||
private val flowsInHospital = ConcurrentHashMap<StateMachineRunId, FlowFiber>()
|
||||
|
||||
/**
|
||||
* Returns true if the flow is currently being treated in the hospital.
|
||||
* The differs to flows with a medical history (which can accessed via [StaffedFlowHospital.contains]).
|
||||
*/
|
||||
@VisibleForTesting
|
||||
internal fun flowInHospital(runId: StateMachineRunId): Boolean {
|
||||
// The .keys avoids https://youtrack.jetbrains.com/issue/KT-18053
|
||||
return runId in flowsInHospital.keys
|
||||
}
|
||||
|
||||
private val mutex = ThreadBox(object {
|
||||
/**
|
||||
* Contains medical history of every flow (a patient) that has entered the hospital. A flow can leave the hospital,
|
||||
|
@ -106,6 +106,7 @@ internal interface StateMachineManagerInternal {
|
||||
fun addSessionBinding(flowId: StateMachineRunId, sessionId: SessionId)
|
||||
fun removeSessionBindings(sessionIds: Set<SessionId>)
|
||||
fun removeFlow(flowId: StateMachineRunId, removalReason: FlowRemovalReason, lastState: StateMachineState)
|
||||
fun moveFlowToPaused(currentState: StateMachineState)
|
||||
fun retryFlowFromSafePoint(currentState: StateMachineState)
|
||||
fun scheduleFlowTimeout(flowId: StateMachineRunId)
|
||||
fun cancelFlowTimeout(flowId: StateMachineRunId)
|
||||
|
@ -62,6 +62,7 @@ class TopLevelTransition(
|
||||
is Event.ReloadFlowFromCheckpointAfterSuspend -> reloadFlowFromCheckpointAfterSuspendTransition()
|
||||
is Event.OvernightObservation -> overnightObservationTransition()
|
||||
is Event.WakeUpFromSleep -> wakeUpFromSleepTransition()
|
||||
is Event.Pause -> pausedFlowTransition()
|
||||
is Event.TerminateSessions -> terminateSessionsTransition(event)
|
||||
}
|
||||
}
|
||||
@ -368,6 +369,22 @@ class TopLevelTransition(
|
||||
}
|
||||
}
|
||||
|
||||
private fun pausedFlowTransition(): TransitionResult {
|
||||
return builder {
|
||||
if (!startingState.isFlowResumed) {
|
||||
actions.add(Action.CreateTransaction)
|
||||
}
|
||||
actions.addAll(
|
||||
arrayOf(
|
||||
Action.UpdateFlowStatus(context.id, Checkpoint.FlowStatus.PAUSED),
|
||||
Action.CommitTransaction,
|
||||
Action.MoveFlowToPaused(currentState)
|
||||
)
|
||||
)
|
||||
FlowContinuation.Abort
|
||||
}
|
||||
}
|
||||
|
||||
private fun terminateSessionsTransition(event: Event.TerminateSessions): TransitionResult {
|
||||
return builder {
|
||||
val sessions = event.sessions
|
||||
|
@ -799,8 +799,8 @@ class DBCheckpointStorageTests {
|
||||
val (extractedId, extractedCheckpoint) = checkpointStorage.getPausedCheckpoints().toList().single()
|
||||
assertEquals(id, extractedId)
|
||||
//We don't extract the result or the flowstate from a paused checkpoint
|
||||
assertEquals(null, extractedCheckpoint.serializedFlowState)
|
||||
assertEquals(null, extractedCheckpoint.result)
|
||||
assertNull(extractedCheckpoint.serializedFlowState)
|
||||
assertNull(extractedCheckpoint.result)
|
||||
|
||||
assertEquals(pausedCheckpoint.status, extractedCheckpoint.status)
|
||||
assertEquals(pausedCheckpoint.progressStep, extractedCheckpoint.progressStep)
|
||||
@ -890,6 +890,24 @@ class DBCheckpointStorageTests {
|
||||
}
|
||||
}
|
||||
|
||||
@Test(timeout = 300_000)
|
||||
fun `update only compatible`() {
|
||||
val (id, checkpoint) = newCheckpoint()
|
||||
val serializedFlowState = checkpoint.serializeFlowState()
|
||||
database.transaction {
|
||||
checkpointStorage.addCheckpoint(id, checkpoint, serializedFlowState, checkpoint.serializeCheckpointState())
|
||||
}
|
||||
database.transaction {
|
||||
checkpointStorage.updateCompatible(id, !checkpoint.compatible)
|
||||
}
|
||||
database.transaction {
|
||||
assertEquals(
|
||||
checkpoint.copy(compatible = !checkpoint.compatible),
|
||||
checkpointStorage.checkpoints().single().deserialize()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
data class IdAndCheckpoint(val id: StateMachineRunId, val checkpoint: Checkpoint)
|
||||
|
||||
private fun changeStatus(oldCheckpoint: Checkpoint, status: Checkpoint.FlowStatus): IdAndCheckpoint {
|
||||
|
Loading…
Reference in New Issue
Block a user