CORDA-2522 - Improve error reporting around failed flows (#5016)

* [CORDA-2522] Improve error reporting around failed flows (#5000)

* Improve error reporting around failed flows

* Fix an index to start from 1 when printed

* Address first set of review comments

(cherry picked from commit 24699cd7f499010f07d518381f1ea31b881311b3)

* [CORDA-2522] Follow up changes to error reporting around failed flows (#5006)

* Follow up changes to error reporting around failed flows

* Have FinalityDoctor report stack trace

* Revert changes to the DumpHistoryOnErrorInterceptor

(cherry picked from commit 2da597a5b7744e62888d0c1594814454c2d6ef70)
This commit is contained in:
JamesHR3 2019-04-15 10:25:18 +01:00 committed by Katelyn Baker
parent 0ba7b65ee8
commit ee884a92de
5 changed files with 16 additions and 9 deletions

View File

@ -292,7 +292,7 @@ abstract class TransactionVerificationException(val txId: SecureHash, message: S
class UntrustedAttachmentsException(val txId: SecureHash, val ids: List<SecureHash>) :
CordaException("Attempting to load untrusted transaction attachments: $ids. " +
"At this time these are not loadable because the DJVM sandbox has not yet been integrated. " +
"You will need to install that app version yourself, to whitelist it for use. " +
"You will need to manually install the CorDapp to whitelist it for use. " +
"Please follow the operational steps outlined in https://docs.corda.net/cordapp-build-systems.html#cordapp-contract-attachments to learn more and continue.")
/*

View File

@ -125,8 +125,12 @@ class AttachmentsClassLoader(attachments: List<Attachment>,
}
.map(Attachment::id)
if (untrusted.isNotEmpty())
if (untrusted.isNotEmpty()) {
log.warn("Cannot verify transaction $sampleTxId as the following attachment IDs are untrusted: $untrusted." +
"You will need to manually install the CorDapp to whitelist it for use. " +
"Please follow the operational steps outlined in https://docs.corda.net/cordapp-build-systems.html#cordapp-contract-attachments to learn more and continue.")
throw TransactionVerificationException.UntrustedAttachmentsException(sampleTxId, untrusted)
}
// Enforce the no-overlap and package ownership rules.
checkAttachments(attachments)

View File

@ -243,7 +243,7 @@ class FlowStateMachineImpl<R>(override val id: StateMachineRunId,
if(t.isUnrecoverable()) {
errorAndTerminate("Caught unrecoverable error from flow. Forcibly terminating the JVM, this might leave resources open, and most likely will.", t)
}
logger.info("Flow raised an error... sending it to flow hospital", t)
logger.info("Flow raised an error: ${t.message}. Sending it to flow hospital to be triaged.")
Try.Failure<R>(t)
}
val softLocksId = if (hasSoftLockedStates) logic.runId.uuid else null

View File

@ -107,17 +107,17 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging, private val
val (outcome, event, backOffForChronicCondition) = when (report.diagnosis) {
Diagnosis.DISCHARGE -> {
val backOff = calculateBackOffForChronicCondition(report, medicalHistory, currentState)
log.info("Flow ${flowFiber.id} error discharged from hospital (delay ${backOff.seconds}s) by ${report.by}")
log.info("Flow error discharged from hospital (delay ${backOff.seconds}s) by ${report.by} (error was ${report.error.message})")
Triple(Outcome.DISCHARGE, Event.RetryFlowFromSafePoint, backOff)
}
Diagnosis.OVERNIGHT_OBSERVATION -> {
log.info("Flow ${flowFiber.id} error kept for overnight observation by ${report.by}")
log.info("Flow error kept for overnight observation by ${report.by} (error was ${report.error.message})")
// We don't schedule a next event for the flow - it will automatically retry from its checkpoint on node restart
Triple(Outcome.OVERNIGHT_OBSERVATION, null, 0.seconds)
}
Diagnosis.NOT_MY_SPECIALTY -> {
// None of the staff care for these errors so we let them propagate
log.info("Flow ${flowFiber.id} error allowed to propagate")
log.info("Flow error allowed to propagate", report.error)
Triple(Outcome.UNTREATABLE, Event.StartErrorPropagation, 0.seconds)
}
}
@ -160,7 +160,8 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging, private val
return errors
.asSequence()
.mapIndexed { index, error ->
log.info("Flow ${flowFiber.id} has error [$index]", error)
// Rely on the logging context to print details of the flow ID.
log.info("Error ${index + 1} of ${errors.size}:", error)
val diagnoses: Map<Diagnosis, List<Staff>> = staff.groupBy { it.consult(flowFiber, currentState, error, medicalHistory) }
// We're only interested in the highest priority diagnosis for the error
val (diagnosis, by) = diagnoses.entries.minBy { it.key }!!
@ -306,7 +307,7 @@ class StaffedFlowHospital(private val flowMessaging: FlowMessaging, private val
override fun consult(flowFiber: FlowFiber, currentState: StateMachineState, newError: Throwable, history: FlowMedicalHistory): Diagnosis {
return if (currentState.flowLogic is FinalityHandler || isFromReceiveFinalityFlow(newError)) {
log.warn("Flow ${flowFiber.id} failed to be finalised. Manual intervention may be required before retrying " +
"the flow by re-starting the node. State machine state: $currentState")
"the flow by re-starting the node. State machine state: $currentState", newError)
Diagnosis.OVERNIGHT_OBSERVATION
} else {
Diagnosis.NOT_MY_SPECIALTY

View File

@ -3,6 +3,7 @@ package net.corda.node.services.statemachine.interceptors
import co.paralleluniverse.fibers.Suspendable
import net.corda.core.flows.StateMachineRunId
import net.corda.core.utilities.contextLogger
import net.corda.core.utilities.debug
import net.corda.node.services.statemachine.ActionExecutor
import net.corda.node.services.statemachine.ErrorState
import net.corda.node.services.statemachine.Event
@ -39,7 +40,8 @@ class DumpHistoryOnErrorInterceptor(val delegate: TransitionExecutor) : Transiti
(record ?: ArrayList()).apply { add(transitionRecord) }
}
// Just if we decide to propagate, and not if just on the way to the hospital.
// Only dump the transitions if we decide to propagate the error, not while the flow is merely on its way to the hospital. Only log at
// debug level here - the flow transition information is often unhelpful in the logs, and the actual cause of the problem will be logged elsewhere.
if (nextState.checkpoint.errorState is ErrorState.Errored && nextState.checkpoint.errorState.propagating) {
log.warn("Flow ${fiber.id} errored, dumping all transitions:\n${record!!.joinToString("\n")}")
for (error in nextState.checkpoint.errorState.errors) {