[CORDA-2522] Follow up changes to error reporting around failed flows (#5006)

* Follow up changes to error reporting around failed flows

* Have FinalityDoctor report stack trace

* Revert changes to the DumpHistoryOnErrorInterceptor
This commit is contained in:
JamesHR3 2019-04-12 16:12:28 +01:00 committed by Rick Parker
parent bf66fa67ce
commit 2da597a5b7
2 changed files with 8 additions and 7 deletions

View File

@ -107,17 +107,17 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging, private val
val (outcome, event, backOffForChronicCondition) = when (report.diagnosis) {
Diagnosis.DISCHARGE -> {
val backOff = calculateBackOffForChronicCondition(report, medicalHistory, currentState)
log.info("Flow ${flowFiber.id} error discharged from hospital (delay ${backOff.seconds}s) by ${report.by}")
log.info("Flow error discharged from hospital (delay ${backOff.seconds}s) by ${report.by} (error was ${report.error.message})")
Triple(Outcome.DISCHARGE, Event.RetryFlowFromSafePoint, backOff)
}
Diagnosis.OVERNIGHT_OBSERVATION -> {
log.info("Flow ${flowFiber.id} error kept for overnight observation by ${report.by}")
log.info("Flow error kept for overnight observation by ${report.by} (error was ${report.error.message})")
// We don't schedule a next event for the flow - it will automatically retry from its checkpoint on node restart
Triple(Outcome.OVERNIGHT_OBSERVATION, null, 0.seconds)
}
Diagnosis.NOT_MY_SPECIALTY -> {
// None of the staff care for these errors so we let them propagate
log.info("Flow ${flowFiber.id} error allowed to propagate")
log.info("Flow error allowed to propagate", report.error)
Triple(Outcome.UNTREATABLE, Event.StartErrorPropagation, 0.seconds)
}
}
@ -161,7 +161,7 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging, private val
.asSequence()
.mapIndexed { index, error ->
// Rely on the logging context to print details of the flow ID.
log.warn("Error ${index + 1} of ${errors.size}:", error)
log.info("Error ${index + 1} of ${errors.size}:", error)
val diagnoses: Map<Diagnosis, List<Staff>> = staff.groupBy { it.consult(flowFiber, currentState, error, medicalHistory) }
// We're only interested in the highest priority diagnosis for the error
val (diagnosis, by) = diagnoses.entries.minBy { it.key }!!
@ -307,7 +307,7 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging, private val
override fun consult(flowFiber: FlowFiber, currentState: StateMachineState, newError: Throwable, history: FlowMedicalHistory): Diagnosis {
return if (currentState.flowLogic is FinalityHandler || isFromReceiveFinalityFlow(newError)) {
log.warn("Flow ${flowFiber.id} failed to be finalised. Manual intervention may be required before retrying " +
"the flow by re-starting the node. State machine state: $currentState")
"the flow by re-starting the node. State machine state: $currentState", newError)
Diagnosis.OVERNIGHT_OBSERVATION
} else {
Diagnosis.NOT_MY_SPECIALTY

View File

@ -3,6 +3,7 @@ package net.corda.node.services.statemachine.interceptors
import co.paralleluniverse.fibers.Suspendable
import net.corda.core.flows.StateMachineRunId
import net.corda.core.utilities.contextLogger
import net.corda.core.utilities.debug
import net.corda.node.services.statemachine.ActionExecutor
import net.corda.node.services.statemachine.ErrorState
import net.corda.node.services.statemachine.Event
@ -42,9 +43,9 @@ class DumpHistoryOnErrorInterceptor(val delegate: TransitionExecutor) : Transiti
// Just if we decide to propagate, and not if just on the way to the hospital. Only log at debug level here - the flow transition
// information is often unhelpful in the logs, and the actual cause of the problem will be logged elsewhere.
if (nextState.checkpoint.errorState is ErrorState.Errored && nextState.checkpoint.errorState.propagating) {
log.debug("Flow ${fiber.id} errored, dumping all transitions:\n${record!!.joinToString("\n")}")
log.warn("Flow ${fiber.id} errored, dumping all transitions:\n${record!!.joinToString("\n")}")
for (error in nextState.checkpoint.errorState.errors) {
log.debug("Flow ${fiber.id} error", error.exception)
log.warn("Flow ${fiber.id} error", error.exception)
}
}