ENT-1625 Fix overly large footprint of checkpoint bandwidth histogram (#580)

* Upgrade metrics library and histogram implementation

* Only write to the "per second" checkpoint bandwidth histogram once per second, maximum.

* Bug fix

* Review feedback and fix unit test

* Fix IRS demo to use specific metrics version in spring boot
This commit is contained in:
Rick Parker 2018-03-21 09:52:29 +00:00 committed by GitHub
parent 5def901980
commit 614d2049ec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 27 additions and 21 deletions

View File

@@ -59,6 +59,7 @@ buildscript {
ext.bouncycastle_version = constants.getProperty("bouncycastleVersion")
ext.guava_version = constants.getProperty("guavaVersion")
ext.caffeine_version = constants.getProperty("caffeineVersion")
ext.metrics_version = constants.getProperty("metricsVersion")
ext.okhttp_version = '3.5.0'
ext.netty_version = '4.1.9.Final'
ext.typesafe_config_version = constants.getProperty("typesafeConfigVersion")

View File

@@ -18,3 +18,5 @@ jsr305Version=3.0.2
artifactoryPluginVersion=4.4.18
snakeYamlVersion=1.19
caffeineVersion=2.6.2
metricsVersion=3.2.5

View File

@@ -119,8 +119,8 @@ dependencies {
}
// Coda Hale's Metrics: for monitoring of key statistics
compile "io.dropwizard.metrics:metrics-core:3.1.2"
compile group: 'io.dropwizard.metrics', name: 'metrics-graphite', version: '3.1.2'
compile "io.dropwizard.metrics:metrics-core:$metrics_version"
compile "io.dropwizard.metrics:metrics-graphite:$metrics_version"
// JimFS: in memory java.nio filesystem. Used for test and simulation utilities.
compile "com.google.jimfs:jimfs:1.1"

View File

@@ -12,10 +12,7 @@ package net.corda.node.services.statemachine
import co.paralleluniverse.fibers.Fiber
import co.paralleluniverse.fibers.Suspendable
import com.codahale.metrics.Gauge
import com.codahale.metrics.Histogram
import com.codahale.metrics.MetricRegistry
import com.codahale.metrics.SlidingTimeWindowReservoir
import com.codahale.metrics.*
import net.corda.core.internal.concurrent.thenMatch
import net.corda.core.serialization.SerializationContext
import net.corda.core.serialization.SerializedBytes
@@ -30,6 +27,7 @@ import net.corda.nodeapi.internal.persistence.contextTransactionOrNull
import java.time.Duration
import java.time.Instant
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong
/**
* This is the bottom execution engine of flow side-effects.
@@ -47,23 +45,20 @@ class ActionExecutorImpl(
val log = contextLogger()
}
private class LatchedGauge : Gauge<Long> {
private var value: Long = 0
fun update(value: Long) {
this.value = value
}
/**
* This [Gauge] just reports the sum of the bytes checkpointed during the last second.
*/
private class LatchedGauge(private val reservoir: Reservoir) : Gauge<Long> {
override fun getValue(): Long {
val retVal = value
value = 0
return retVal
return reservoir.snapshot.values.sum()
}
}
private val checkpointingMeter = metrics.meter("Flows.Checkpointing Rate")
private val checkpointSizesThisSecond = SlidingTimeWindowReservoir(1, TimeUnit.SECONDS)
private val checkpointBandwidthHist = metrics.register("Flows.CheckpointVolumeBytesPerSecondHist", Histogram(SlidingTimeWindowReservoir(1, TimeUnit.DAYS)))
private val checkpointBandwidth = metrics.register("Flows.CheckpointVolumeBytesPerSecondCurrent", LatchedGauge())
private val lastBandwidthUpdate = AtomicLong(0)
private val checkpointBandwidthHist = metrics.register("Flows.CheckpointVolumeBytesPerSecondHist", Histogram(SlidingTimeWindowArrayReservoir(1, TimeUnit.DAYS)))
private val checkpointBandwidth = metrics.register("Flows.CheckpointVolumeBytesPerSecondCurrent", LatchedGauge(checkpointSizesThisSecond))
@Suspendable
override fun executeAction(fiber: FlowFiber, action: Action) {
@@ -107,9 +102,14 @@ class ActionExecutorImpl(
checkpointStorage.addCheckpoint(action.id, checkpointBytes)
checkpointingMeter.mark()
checkpointSizesThisSecond.update(checkpointBytes.size.toLong())
val checkpointVolume = checkpointSizesThisSecond.snapshot.values.sum()
checkpointBandwidthHist.update(checkpointVolume)
checkpointBandwidth.update(checkpointVolume)
var lastUpdateTime = lastBandwidthUpdate.get()
while (System.nanoTime() - lastUpdateTime > TimeUnit.SECONDS.toNanos(1)) {
if (lastBandwidthUpdate.compareAndSet(lastUpdateTime, System.nanoTime())) {
val checkpointVolume = checkpointSizesThisSecond.snapshot.values.sum()
checkpointBandwidthHist.update(checkpointVolume)
}
lastUpdateTime = lastBandwidthUpdate.get()
}
}
@Suspendable

View File

@@ -28,6 +28,7 @@ ext['artemis.version'] = "$artemis_version"
ext['hibernate.version'] = "$hibernate_version"
ext['selenium.version'] = "$selenium_version"
ext['jackson.version'] = "$jackson_version"
ext['dropwizard-metrics.version'] = "$metrics_version"
apply plugin: 'java'
apply plugin: 'kotlin'

View File

@@ -25,9 +25,11 @@ buildscript {
// causing the problems in runtime. Those can be changed by manipulating above properties
// See https://github.com/spring-gradle-plugins/dependency-management-plugin/blob/master/README.md#changing-the-value-of-a-version-property
// This has to be repeated here as otherwise the order of files does matter
// See a list here: https://github.com/spring-projects/spring-boot/blob/master/spring-boot-project/spring-boot-dependencies/pom.xml
ext['artemis.version'] = "$artemis_version"
ext['hibernate.version'] = "$hibernate_version"
ext['jackson.version'] = "$jackson_version"
ext['dropwizard-metrics.version'] = "$metrics_version"
apply plugin: 'java'

View File

@@ -30,7 +30,7 @@ dependencies {
cordaCompile project(':client:rpc')
cordaCompile project(':node-driver')
compile group: 'mysql', name: 'mysql-connector-java', version: '6.0.6'
compile group: 'io.dropwizard.metrics', name: 'metrics-graphite', version: '3.2.5'
compile "io.dropwizard.metrics:metrics-graphite:$metrics_version"
}