Do not black-list AMQP targets that suffer a handshake failure

This commit is contained in:
Chris Cochrane
2022-09-13 11:41:19 +01:00
parent 242d7d45c5
commit 5ca5b8d096
6 changed files with 183 additions and 26 deletions

View File

@ -58,7 +58,7 @@ internal class AMQPChannelHandler(private val serverMode: Boolean,
private var remoteCert: X509Certificate? = null
private var eventProcessor: EventProcessor? = null
private var suppressClose: Boolean = false
private var badCert: Boolean = false
private var connectionResult: ConnectionResult = ConnectionResult.NO_ERROR
private var localCert: X509Certificate? = null
private var requestedServerName: String? = null
@ -131,7 +131,7 @@ internal class AMQPChannelHandler(private val serverMode: Boolean,
val ch = ctx.channel()
logInfoWithMDC { "Closed client connection ${ch.id()} from $remoteAddress to ${ch.localAddress()}" }
if (!suppressClose) {
onClose(ch as SocketChannel, ConnectionChange(remoteAddress, remoteCert, false, badCert))
onClose(ch as SocketChannel, ConnectionChange(remoteAddress, remoteCert, false, connectionResult))
}
eventProcessor?.close()
ctx.fireChannelInactive()
@ -274,13 +274,13 @@ internal class AMQPChannelHandler(private val serverMode: Boolean,
val remoteX500Name = try {
CordaX500Name.build(remoteCert!!.subjectX500Principal)
} catch (ex: IllegalArgumentException) {
badCert = true
connectionResult = ConnectionResult.HANDSHAKE_FAILURE
logErrorWithMDC("Certificate subject not a valid CordaX500Name", ex)
ctx.close()
return
}
if (allowedRemoteLegalNames != null && remoteX500Name !in allowedRemoteLegalNames) {
badCert = true
connectionResult = ConnectionResult.HANDSHAKE_FAILURE
logErrorWithMDC("Provided certificate subject $remoteX500Name not in expected set $allowedRemoteLegalNames")
ctx.close()
return
@ -288,7 +288,7 @@ internal class AMQPChannelHandler(private val serverMode: Boolean,
logInfoWithMDC { "Handshake completed with subject: $remoteX500Name, requested server name: ${sslHandler.getRequestedServerName()}." }
createAMQPEngine(ctx)
onOpen(ctx.channel() as SocketChannel, ConnectionChange(remoteAddress, remoteCert, connected = true, badCert = false))
onOpen(ctx.channel() as SocketChannel, ConnectionChange(remoteAddress, remoteCert, connected = true, connectionResult = ConnectionResult.NO_ERROR))
}
private fun handleFailedHandshake(ctx: ChannelHandlerContext, evt: SslHandshakeCompletionEvent) {
@ -303,7 +303,7 @@ internal class AMQPChannelHandler(private val serverMode: Boolean,
// io.netty.handler.ssl.SslHandler.setHandshakeFailureTransportFailure()
cause is SSLException && (cause.message?.contains("writing TLS control frames") == true) -> logWarnWithMDC(cause.message!!)
cause is SSLException && (cause.message?.contains("internal_error") == true) -> logWarnWithMDC("Received internal_error during handshake")
else -> badCert = true
else -> connectionResult = ConnectionResult.HANDSHAKE_FAILURE
}
logWarnWithMDC("Handshake failure: ${evt.cause().message}")
if (log.isTraceEnabled) {

View File

@ -26,6 +26,7 @@ import rx.Observable
import rx.subjects.PublishSubject
import java.lang.Long.min
import java.net.InetSocketAddress
import java.time.Duration
import java.util.concurrent.TimeUnit
import java.util.concurrent.locks.ReentrantLock
import javax.net.ssl.KeyManagerFactory
@ -70,6 +71,7 @@ class AMQPClient(val targets: List<NetworkHostAndPort>,
private const val MAX_RETRY_INTERVAL = 60000L
private const val BACKOFF_MULTIPLIER = 2L
private val NUM_CLIENT_THREADS = Integer.getInteger(CORDA_AMQP_NUM_CLIENT_THREAD_PROP_NAME, 2)
private val handshakeRetryIntervals = List(5) { Duration.ofMinutes(5) }
}
private val lock = ReentrantLock()
@ -82,7 +84,9 @@ class AMQPClient(val targets: List<NetworkHostAndPort>,
private var targetIndex = 0
private var currentTarget: NetworkHostAndPort = targets.first()
private var retryInterval = MIN_RETRY_INTERVAL
private val badCertTargets = mutableSetOf<NetworkHostAndPort>()
private val handshakeFailureRetryTargets = mutableSetOf<NetworkHostAndPort>()
private var retryingHandshakeFailures = false
private var retryOffset = 0
@Volatile
private var amqpActive = false
@Volatile
@ -91,22 +95,67 @@ class AMQPClient(val targets: List<NetworkHostAndPort>,
val localAddressString: String
get() = clientChannel?.localAddress()?.toString() ?: "<unknownLocalAddress>"
private fun nextTarget() {
/*
Figure out the index of the next address to try to connect to.
The targets are scanned in round-robin order, starting with the entry just after the current targetIndex, and the
first one that is not in the excluded-target set is chosen. targetIndex is left at -1 when every target is excluded.
NOTE(review): this span comes from a diff rendered without +/- markers, so it appears to interleave pre-change and
post-change lines - confirm against the raw patch before treating it as compilable code.
*/
private fun setTargetIndex() {
val origIndex = targetIndex
targetIndex = -1
for (offset in 1..targets.size) {
val newTargetIndex = (origIndex + offset).rem(targets.size)
// NOTE(review): presumably the pre-change form of the condition on the line after it (badCertTargets vs
// handshakeFailureRetryTargets) - confirm which one is live.
if (targets[newTargetIndex] !in badCertTargets) {
if (targets[newTargetIndex] !in handshakeFailureRetryTargets ) {
targetIndex = newTargetIndex
break
}
}
// NOTE(review): looks like the old "halt retries" check from the previous nextTarget() body; nextTarget() performs
// its own targetIndex == -1 check after calling this function - confirm whether these three lines were removed.
if (targetIndex == -1) {
log.error("No targets have presented acceptable certificates for $allowedRemoteLegalNames. Halting retries")
return
}
/*
Set how long to wait until trying to connect to the next address.
While retryingHandshakeFailures is set, the wait is read from handshakeRetryIntervals at retryOffset (which is then
advanced), and becomes one day once that list has been used up. Otherwise the previous retryInterval is multiplied
by BACKOFF_MULTIPLIER and capped at MAX_RETRY_INTERVAL.
*/
private fun setTargetRetryInterval() {
retryInterval = if (retryingHandshakeFailures) {
if (retryOffset < handshakeRetryIntervals.size) {
// Post-increment: use the current slot, then move on so the next call reads the following slot.
handshakeRetryIntervals[retryOffset++].toMillis()
} else {
Duration.ofDays(1).toMillis()
}
} else {
min(MAX_RETRY_INTERVAL, retryInterval * BACKOFF_MULTIPLIER)
}
// NOTE(review): the two statements below look like pre-change lines from the old nextTarget() body, shown here
// because the diff was rendered without +/- markers. The second would overwrite the value computed above -
// confirm against the raw patch.
log.info("Retry connect to ${targets[targetIndex]}")
retryInterval = min(MAX_RETRY_INTERVAL, retryInterval * BACKOFF_MULTIPLIER)
}
/**
 * Called once a connection has been established: logs the target that was reached and puts all of the
 * retry bookkeeping back to its initial state, so that a later connection failure is retried quickly again.
 */
private fun successfullyConnected() {
    val connectedTarget = targets[targetIndex]
    log.info("Successfully connected to [$connectedTarget]; resetting the target connection-retry interval")
    // Back to the fast-reconnect schedule: no handshake-retry mode, first slot, minimum interval.
    retryOffset = 0
    retryInterval = MIN_RETRY_INTERVAL
    retryingHandshakeFailures = false
}
/**
 * Selects the next target to connect to and the delay before that attempt.
 *
 * When [setTargetIndex] finds no eligible target, the targets that previously failed their handshake are
 * released for another round and the client switches to the handshake-retry schedule. If there is nothing
 * to release, an error is logged and the function returns without choosing a target.
 */
private fun nextTarget() {
    setTargetIndex()
    val noEligibleTarget = targetIndex == -1
    if (noEligibleTarget) {
        if (handshakeFailureRetryTargets.isEmpty()) {
            log.error("Attempted connection to targets: $targets, but none of them have presented acceptable certificates" +
                    " for $allowedRemoteLegalNames. Halting retries.")
            return
        }
        log.info("Failed to connect to any targets. Retrying targets that previously failed to handshake.")
        // Forget the earlier handshake failures so that the second scan can pick a target again.
        handshakeFailureRetryTargets.clear()
        retryingHandshakeFailures = true
        setTargetIndex()
    }
    setTargetRetryInterval()
    log.info("Retry connect to ${targets[targetIndex]} in [$retryInterval] ms")
}
private val connectListener = object : ChannelFutureListener {
@ -212,7 +261,7 @@ class AMQPClient(val targets: List<NetworkHostAndPort>,
onOpen = { _, change ->
parent.run {
amqpActive = true
retryInterval = MIN_RETRY_INTERVAL // reset to fast reconnect if we connect properly
successfullyConnected()
_onConnection.onNext(change)
}
},
@ -220,9 +269,9 @@ class AMQPClient(val targets: List<NetworkHostAndPort>,
if (parent.amqpChannelHandler == amqpChannelHandler) {
parent.run {
_onConnection.onNext(change)
if (change.badCert) {
log.error("Blocking future connection attempts to $target due to bad certificate on endpoint")
badCertTargets += target
if (change.connectionResult == ConnectionResult.HANDSHAKE_FAILURE) {
log.warn("Handshake failure with $target target; will retry later")
handshakeFailureRetryTargets += target
}
if (started && amqpActive) {

View File

@ -3,8 +3,8 @@ package net.corda.nodeapi.internal.protonwrapper.netty
import java.net.InetSocketAddress
import java.security.cert.X509Certificate
/**
 * Value object describing a change in connection state: the remote address, the remote certificate (if any),
 * whether the connection is now up, and a result flag for the connection.
 *
 * NOTE(review): this span comes from a diff rendered without +/- markers. The class header, the `return`
 * statement and the closing brace each appear twice, presumably as the pre-change (`badCert: Boolean`) and
 * post-change (`connectionResult: ConnectionResult`) versions - confirm against the raw patch.
 */
data class ConnectionChange(val remoteAddress: InetSocketAddress, val remoteCert: X509Certificate?, val connected: Boolean, val badCert: Boolean) {
data class ConnectionChange(val remoteAddress: InetSocketAddress, val remoteCert: X509Certificate?, val connected: Boolean, val connectionResult: ConnectionResult) {
// Custom rendering: prints the certificate's subject DN rather than the whole certificate.
override fun toString(): String {
return "ConnectionChange remoteAddress: $remoteAddress connected state: $connected cert subject: ${remoteCert?.subjectDN} cert ok: ${!badCert}"
return "ConnectionChange remoteAddress: $remoteAddress connected state: $connected cert subject: ${remoteCert?.subjectDN} result: ${connectionResult}"
}
}
}

View File

@ -0,0 +1,6 @@
package net.corda.nodeapi.internal.protonwrapper.netty
/**
 * Outcome flag for a connection, with one value for "no error" and one for a failed handshake.
 */
enum class ConnectionResult {
// No error has been recorded for the connection.
NO_ERROR,
// The handshake with the remote endpoint failed.
HANDSHAKE_FAILURE
}