Mirror of https://github.com/zerotier/ZeroTierOne.git (synced 2024-12-18 20:47:53 +00:00)
Peer metrics (#1995)
* Adding peer metrics still need to be wired up for use
* per peer packet metrics
* Fix crash from bad instantiation of histogram
* separate alive & dead path counts
* Add peer metric update block
* add peer latency values in doPingAndKeepalive
* prevent deadlock
* peer latency histogram actually works now
* cleanup
* capture counts of packets to specific peers

Co-authored-by: Joseph Henry <joseph.henry@zerotier.com>
commit 74dc41c7c7 (parent 925599cab0)
@@ -28,15 +28,9 @@ namespace prometheus {
   /// a data race.
   template <typename Value_ = uint64_t>
   class Histogram : public Metric {
-
-    using BucketBoundaries = std::vector<Value_>;
-
-    const BucketBoundaries bucket_boundaries_;
-    std::vector<Counter<Value_>> bucket_counts_;
-    Gauge<Value_> sum_;
-
   public:
     using Value = Value_;
+    using BucketBoundaries = std::vector<Value_>;
     using Family = CustomFamily<Histogram<Value>>;

     static const Metric::Type static_type = Metric::Type::Histogram;
@@ -69,7 +63,7 @@ namespace prometheus {
         bucket_boundaries_.begin(),
         std::find_if(
           std::begin(bucket_boundaries_), std::end(bucket_boundaries_),
-          [value](const double boundary) { return boundary >= value; })));
+          [value](const Value boundary) { return boundary >= value; })));
       sum_.Increment(value);
       bucket_counts_[bucket_index].Increment();
     }
@@ -110,7 +104,7 @@ namespace prometheus {
         bucket.cumulative_count = cumulative_count;
         bucket.upper_bound = (i == bucket_boundaries_.size()
                                 ? std::numeric_limits<double>::infinity()
-                                : bucket_boundaries_[i]);
+                                : static_cast<double>(bucket_boundaries_[i]));
         metric.histogram.bucket.push_back(std::move(bucket));
       }
       metric.histogram.sample_count = cumulative_count;
@@ -119,6 +113,12 @@ namespace prometheus {
       return metric;
     }

+  private:
+    const BucketBoundaries bucket_boundaries_;
+    std::vector<Counter<Value_>> bucket_counts_;
+    Gauge<Value_> sum_;
+
+
   };

   /// \brief Return a builder to configure and register a Histogram metric.
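For reference, Observe() above picks the first bucket whose upper bound is greater than or equal to the observed value; anything beyond the last boundary lands in the implicit +Inf bucket. A minimal standalone sketch of that selection logic (plain C++, independent of the prometheus classes above; the bucket edges are the millisecond boundaries this commit uses for zt_peer_latency):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone illustration (not library code): the bucket index is the position of the
// first boundary >= value; an index equal to boundaries.size() means the +Inf bucket.
static std::size_t bucketIndexFor(uint64_t value, const std::vector<uint64_t> &boundaries)
{
	return static_cast<std::size_t>(std::distance(
		boundaries.begin(),
		std::find_if(boundaries.begin(), boundaries.end(),
			[value](const uint64_t boundary) { return boundary >= value; })));
}

int main()
{
	// The millisecond boundaries used for zt_peer_latency later in this commit.
	const std::vector<uint64_t> boundaries{1, 3, 6, 10, 30, 60, 100, 300, 600, 1000};
	std::printf("45 ms   -> bucket %zu (upper bound 60)\n", bucketIndexFor(45, boundaries));
	std::printf("5000 ms -> bucket %zu (+Inf)\n", bucketIndexFor(5000, boundaries));
	return 0;
}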
@@ -176,6 +176,22 @@ namespace ZeroTier {
 		prometheus::simpleapi::counter_family_t network_outgoing_packets
 		{ "zt_network_outgoing_packets", "number of outgoing packets per network" };

+		// PeerMetrics
+		prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency =
+			prometheus::Builder<prometheus::Histogram<uint64_t>>()
+				.Name("zt_peer_latency")
+				.Help("peer latency (ms)")
+				.Register(prometheus::simpleapi::registry);
+
+		prometheus::simpleapi::gauge_family_t peer_path_count
+		{ "zt_peer_path_count", "number of paths to peer" };
+		prometheus::simpleapi::counter_family_t peer_incoming_packets
+		{ "zt_peer_incoming_packets", "number of incoming packets from a peer" };
+		prometheus::simpleapi::counter_family_t peer_outgoing_packets
+		{ "zt_peer_outgoing_packets", "number of outgoing packets to a peer" };
+		prometheus::simpleapi::counter_family_t peer_packet_errors
+		{ "zt_peer_packet_errors" , "number of incoming packet errors from a peer" };
+
 		// General Controller Metrics
 		prometheus::simpleapi::gauge_metric_t network_count
 		{"controller_network_count", "number of networks the controller is serving"};
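Two registration idioms appear above: the simpleapi families are constructed directly from a name and help string, while the histogram family goes through the Builder chain. A hedged sketch of both, using hypothetical "zt_example_*" names that are not part of this commit (the include name is also an assumption):

#include <cstdint>

#include "Metrics.hpp" // assumed header name; it pulls in the prometheus types used here

namespace ZeroTier {
	namespace Metrics {
		// simpleapi style: name + help text, the wrapper handles registration.
		prometheus::simpleapi::counter_family_t example_packets
		{ "zt_example_packets", "hypothetical per-peer packet counter" };

		// Builder style, as used for zt_peer_latency above: needed for the custom
		// Histogram<uint64_t> family, registered into the same simpleapi registry.
		prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &example_latency =
			prometheus::Builder<prometheus::Histogram<uint64_t>>()
				.Name("zt_example_latency")
				.Help("hypothetical latency histogram (ms)")
				.Register(prometheus::simpleapi::registry);
	}
}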
@@ -107,6 +107,13 @@ namespace ZeroTier {
 		extern prometheus::simpleapi::counter_family_t network_incoming_packets;
 		extern prometheus::simpleapi::counter_family_t network_outgoing_packets;

+		// Peer Metrics
+		extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency;
+		extern prometheus::simpleapi::gauge_family_t peer_path_count;
+		extern prometheus::simpleapi::counter_family_t peer_incoming_packets;
+		extern prometheus::simpleapi::counter_family_t peer_outgoing_packets;
+		extern prometheus::simpleapi::counter_family_t peer_packet_errors;
+
 		// General Controller Metrics
 		extern prometheus::simpleapi::gauge_metric_t network_count;
 		extern prometheus::simpleapi::gauge_metric_t member_count;
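Each of these families is keyed by a node_id label, so every peer gets its own time series. A hedged usage sketch based on the calls node/Peer.cpp makes below; the helper function and the include name are assumptions for illustration, while the Add(), Observe() and increment calls mirror the commit:

#include <cstdint>
#include <string>
#include <vector>

#include "Metrics.hpp" // assumed header name for the extern declarations above

void recordPeerSample(const std::string &nodeIdLabel, uint64_t latencyMs)
{
	// One histogram series per peer, selected by the node_id label, with the same
	// millisecond bucket edges that Peer's constructor passes in.
	prometheus::Histogram<uint64_t> &lat = Metrics::peer_latency.Add(
		{{"node_id", nodeIdLabel}},
		std::vector<uint64_t>{1, 3, 6, 10, 30, 60, 100, 300, 600, 1000});
	lat.Observe(latencyMs);

	// simpleapi counters work the same way: Add() yields the labeled series,
	// which is then incremented just like _incoming_packet++ in Peer.cpp.
	prometheus::simpleapi::counter_metric_t incoming{
		Metrics::peer_incoming_packets.Add({{"node_id", nodeIdLabel}})};
	incoming++;
}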
node/Peer.cpp (107 changed lines)
@@ -28,11 +28,6 @@ namespace ZeroTier {

 static unsigned char s_freeRandomByteCounter = 0;

-char * peerIDString(const Identity &id) {
-	char out[16];
-	return id.address().toString(out);
-}
-
 Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) :
 	RR(renv),
 	_lastReceive(0),
@@ -55,7 +50,13 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
 	_directPathPushCutoffCount(0),
 	_echoRequestCutoffCount(0),
 	_localMultipathSupported(false),
-	_lastComputedAggregateMeanLatency(0)
+	_lastComputedAggregateMeanLatency(0),
+	_peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})},
+	_alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})},
+	_dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})},
+	_incoming_packet{Metrics::peer_incoming_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
+	_outgoing_packet{Metrics::peer_outgoing_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
+	_packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
 {
 	if (!myIdentity.agree(peerIdentity,_key)) {
 		throw ZT_EXCEPTION_INVALID_ARGUMENT;
@@ -96,7 +97,7 @@ void Peer::received(
 		default:
 			break;
 	}
-
+	_incoming_packet++;
 	recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);

 	if (trustEstablished) {
@@ -519,54 +520,70 @@ void Peer::performMultipathStateCheck(void *tPtr, int64_t now)
 unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 {
 	unsigned int sent = 0;
-	Mutex::Lock _l(_paths_m);
+	{
+		Mutex::Lock _l(_paths_m);

-	performMultipathStateCheck(tPtr, now);
+		performMultipathStateCheck(tPtr, now);

-	const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
-	if (sendFullHello) {
-		_lastSentFullHello = now;
-	}
+		const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
+		if (sendFullHello) {
+			_lastSentFullHello = now;
+		}

-	// Right now we only keep pinging links that have the maximum priority. The
-	// priority is used to track cluster redirections, meaning that when a cluster
-	// redirects us its redirect target links override all other links and we
-	// let those old links expire.
-	long maxPriority = 0;
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			maxPriority = std::max(_paths[i].priority,maxPriority);
-		} else {
-			break;
-		}
-	}
+		// Right now we only keep pinging links that have the maximum priority. The
+		// priority is used to track cluster redirections, meaning that when a cluster
+		// redirects us its redirect target links override all other links and we
+		// let those old links expire.
+		long maxPriority = 0;
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				maxPriority = std::max(_paths[i].priority,maxPriority);
+			} else {
+				break;
+			}
+		}

-	bool deletionOccurred = false;
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			// Clean expired and reduced priority paths
-			if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) {
-				if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) {
-					attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello);
-					_paths[i].p->sent(now);
-					sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
-				}
-			} else {
-				_paths[i] = _PeerPath();
-				deletionOccurred = true;
-			}
-		}
-		if (!_paths[i].p || deletionOccurred) {
-			for(unsigned int j=i;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
-				if (_paths[j].p && i != j) {
-					_paths[i] = _paths[j];
-					_paths[j] = _PeerPath();
-					break;
-				}
-			}
-			deletionOccurred = false;
-		}
-	}
+		bool deletionOccurred = false;
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				// Clean expired and reduced priority paths
+				if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) {
+					if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) {
+						attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello);
+						_paths[i].p->sent(now);
+						sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
+					}
+				} else {
+					_paths[i] = _PeerPath();
+					deletionOccurred = true;
+				}
+			}
+			if (!_paths[i].p || deletionOccurred) {
+				for(unsigned int j=i;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
+					if (_paths[j].p && i != j) {
+						_paths[i] = _paths[j];
+						_paths[j] = _PeerPath();
+						break;
+					}
+				}
+				deletionOccurred = false;
+			}
+		}

+		uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0;
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				if (_paths[i].p->alive(now)) {
+					alive_path_count_tmp++;
+				}
+				else {
+					dead_path_count_tmp++;
+				}
+			}
+		}
+		_alive_path_count = alive_path_count_tmp;
+		_dead_path_count = dead_path_count_tmp;
+	}
+	_peer_latency.Observe(latency(now));
 	return sent;
 }
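The notable structural change here: the path-maintenance body now runs inside its own block so the _paths_m lock is released before the latency histogram is updated (which lines up with the "prevent deadlock" item in the commit message), and alive/dead path totals are tallied into locals and published to the gauges once. A minimal sketch of that pattern, with illustrative names and std::mutex standing in for ZeroTier's Mutex:

#include <cstdint>
#include <mutex>

// Illustrative sketch only: plain integers and no-op calls stand in for the
// prometheus gauge and histogram members used by Peer.
struct ExamplePeer {
	std::mutex paths_m;        // analogous to _paths_m
	uint16_t alive_paths = 0;  // analogous to the _alive_path_count gauge
	uint16_t dead_paths = 0;   // analogous to the _dead_path_count gauge

	uint64_t latencyMs() { return 42; }  // stand-in for Peer::latency(now)
	void observeLatency(uint64_t) {}     // stand-in for _peer_latency.Observe()

	void pingAndKeepalive()
	{
		{
			std::lock_guard<std::mutex> lock(paths_m); // held only for path bookkeeping
			uint16_t alive_tmp = 0, dead_tmp = 0;
			// ... walk the path table and tally alive/dead paths into the locals ...
			alive_paths = alive_tmp;  // publish the totals once
			dead_paths = dead_tmp;
		}                                 // lock released here
		observeLatency(latencyMs());      // metric update happens outside the lock
	}
};

int main()
{
	ExamplePeer p;
	p.pingAndKeepalive();
	return 0;
}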
@@ -641,6 +658,7 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
 void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
 	uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
 {
+	_outgoing_packet++;
 	if (_localMultipathSupported && _bond) {
 		_bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
 	}
@@ -648,6 +666,7 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack

 void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
 {
+	_packet_errors++;
 	if (_localMultipathSupported && _bond) {
 		_bond->recordIncomingInvalidPacket(path);
 	}
@@ -598,6 +598,13 @@ private:
 	int32_t _lastComputedAggregateMeanLatency;

 	SharedPtr<Bond> _bond;
+
+	prometheus::Histogram<uint64_t> &_peer_latency;
+	prometheus::simpleapi::gauge_metric_t _alive_path_count;
+	prometheus::simpleapi::gauge_metric_t _dead_path_count;
+	prometheus::simpleapi::counter_metric_t _incoming_packet;
+	prometheus::simpleapi::counter_metric_t _outgoing_packet;
+	prometheus::simpleapi::counter_metric_t _packet_errors;
 };

 } // namespace ZeroTier