Metrics consolidation (#1997)

* Rename zt_packet_incoming -> zt_packet

Also consolidate zt_peer_packets into a single metric with tx and rx labels. Same for zt_tcp_data and zt_udp_data.

* Further collapse tcp & udp into metric labels for zt_data

* Fix zt_data metric description

* zt_peer_packets description fix

* Consolidate incoming/outgoing network packets to a single metric

* zt_packet_incoming_error -> zt_packet_error

* Disable peer metrics for central controllers

Can change in the future if needed, but given the traffic our controllers serve, that's going to be a *lot* of data.

* Disable peer metrics for controllers pt 2
This commit is contained in:
Grant Limberg 2023-05-04 11:12:55 -07:00 committed by GitHub
parent 74dc41c7c7
commit 00d55fc4b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 75 additions and 60 deletions

View File

@ -311,7 +311,7 @@ endif
ifeq ($(ZT_CONTROLLER),1) ifeq ($(ZT_CONTROLLER),1)
override CXXFLAGS+=-Wall -Wno-deprecated -std=c++17 -pthread $(INCLUDES) -DNDEBUG $(DEFS) override CXXFLAGS+=-Wall -Wno-deprecated -std=c++17 -pthread $(INCLUDES) -DNDEBUG $(DEFS)
override LDLIBS+=-Lext/libpqxx-7.7.3/install/ubuntu22.04/lib -lpqxx -lpq ext/hiredis-1.0.2/lib/ubuntu22.04/libhiredis.a ext/redis-plus-plus-1.3.3/install/ubuntu22.04/lib/libredis++.a -lssl -lcrypto override LDLIBS+=-Lext/libpqxx-7.7.3/install/ubuntu22.04/lib -lpqxx -lpq ext/hiredis-1.0.2/lib/ubuntu22.04/libhiredis.a ext/redis-plus-plus-1.3.3/install/ubuntu22.04/lib/libredis++.a -lssl -lcrypto
override DEFS+=-DZT_CONTROLLER_USE_LIBPQ override DEFS+=-DZT_CONTROLLER_USE_LIBPQ -DZT_NO_PEER_METRICS
override INCLUDES+=-I/usr/include/postgresql -Iext/libpqxx-7.7.3/install/ubuntu22.04/include -Iext/hiredis-1.0.2/include/ -Iext/redis-plus-plus-1.3.3/install/ubuntu22.04/include/sw/ override INCLUDES+=-I/usr/include/postgresql -Iext/libpqxx-7.7.3/install/ubuntu22.04/include -Iext/hiredis-1.0.2/include/ -Iext/redis-plus-plus-1.3.3/install/ubuntu22.04/include/sw/
endif endif

View File

@ -25,7 +25,7 @@ namespace ZeroTier {
namespace Metrics { namespace Metrics {
// Packet Type Counts // Packet Type Counts
prometheus::simpleapi::counter_family_t packets prometheus::simpleapi::counter_family_t packets
{ "zt_packet_incoming", "incoming packet type counts"}; { "zt_packet", "incoming packet type counts"};
// Incoming packets // Incoming packets
prometheus::simpleapi::counter_metric_t pkt_nop_in prometheus::simpleapi::counter_metric_t pkt_nop_in
@ -118,7 +118,7 @@ namespace ZeroTier {
// Packet Error Counts // Packet Error Counts
prometheus::simpleapi::counter_family_t packet_errors prometheus::simpleapi::counter_family_t packet_errors
{ "zt_packet_incoming_error", "incoming packet errors"}; { "zt_packet_error", "incoming packet errors"};
// Incoming Error Counts // Incoming Error Counts
prometheus::simpleapi::counter_metric_t pkt_error_obj_not_found_in prometheus::simpleapi::counter_metric_t pkt_error_obj_not_found_in
@ -157,25 +157,26 @@ namespace ZeroTier {
{ packet_errors.Add({{"error_type", "internal_server_error"}, {"direction", "tx"}}) }; { packet_errors.Add({{"error_type", "internal_server_error"}, {"direction", "tx"}}) };
// Data Sent/Received Metrics // Data Sent/Received Metrics
prometheus::simpleapi::counter_metric_t udp_send prometheus::simpleapi::counter_family_t data
{ "zt_udp_data_sent", "number of bytes ZeroTier has sent via UDP" }; { "zt_data", "number of bytes ZeroTier has transmitted or received" };
prometheus::simpleapi::counter_metric_t udp_recv prometheus::simpleapi::counter_metric_t udp_recv
{ "zt_udp_data_recv", "number of bytes ZeroTier has received via UDP" }; { data.Add({{"protocol","udp"},{"direction","rx"}}) };
prometheus::simpleapi::counter_metric_t udp_send
{ data.Add({{"protocol","udp"},{"direction","tx"}}) };
prometheus::simpleapi::counter_metric_t tcp_send prometheus::simpleapi::counter_metric_t tcp_send
{ "zt_tcp_data_sent", "number of bytes ZeroTier has sent via TCP" }; { data.Add({{"protocol","tcp"},{"direction", "tx"}}) };
prometheus::simpleapi::counter_metric_t tcp_recv prometheus::simpleapi::counter_metric_t tcp_recv
{ "zt_tcp_data_recv", "number of bytes ZeroTier has received via TCP" }; { data.Add({{"protocol","tcp"},{"direction", "rx"}}) };
// Network Metrics // Network Metrics
prometheus::simpleapi::gauge_metric_t network_num_joined prometheus::simpleapi::gauge_metric_t network_num_joined
{ "zt_num_networks", "number of networks this instance is joined to" }; { "zt_num_networks", "number of networks this instance is joined to" };
prometheus::simpleapi::gauge_family_t network_num_multicast_groups prometheus::simpleapi::gauge_family_t network_num_multicast_groups
{ "zt_network_multcast_groups_subscribed", "number of multicast groups networks are subscribed to" }; { "zt_network_multicast_groups_subscribed", "number of multicast groups networks are subscribed to" };
prometheus::simpleapi::counter_family_t network_incoming_packets prometheus::simpleapi::counter_family_t network_packets
{ "zt_network_incoming_packets", "number of incoming packets per network" }; { "zt_network_packets", "number of incoming/outgoing packets per network" };
prometheus::simpleapi::counter_family_t network_outgoing_packets
{ "zt_network_outgoing_packets", "number of outgoing packets per network" }; #ifndef ZT_NO_PEER_METRICS
// PeerMetrics // PeerMetrics
prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency = prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency =
prometheus::Builder<prometheus::Histogram<uint64_t>>() prometheus::Builder<prometheus::Histogram<uint64_t>>()
@ -185,12 +186,11 @@ namespace ZeroTier {
prometheus::simpleapi::gauge_family_t peer_path_count prometheus::simpleapi::gauge_family_t peer_path_count
{ "zt_peer_path_count", "number of paths to peer" }; { "zt_peer_path_count", "number of paths to peer" };
prometheus::simpleapi::counter_family_t peer_incoming_packets prometheus::simpleapi::counter_family_t peer_packets
{ "zt_peer_incoming_packets", "number of incoming packets from a peer" }; { "zt_peer_packets", "number of packets to/from a peer" };
prometheus::simpleapi::counter_family_t peer_outgoing_packets
{ "zt_peer_outgoing_packets", "number of outgoing packets to a peer" };
prometheus::simpleapi::counter_family_t peer_packet_errors prometheus::simpleapi::counter_family_t peer_packet_errors
{ "zt_peer_packet_errors" , "number of incoming packet errors from a peer" }; { "zt_peer_packet_errors" , "number of incoming packet errors from a peer" };
#endif
// General Controller Metrics // General Controller Metrics
prometheus::simpleapi::gauge_metric_t network_count prometheus::simpleapi::gauge_metric_t network_count

View File

@ -96,23 +96,24 @@ namespace ZeroTier {
extern prometheus::simpleapi::counter_metric_t pkt_error_internal_server_error_out; extern prometheus::simpleapi::counter_metric_t pkt_error_internal_server_error_out;
// Data Sent/Received Metrics // Data Sent/Received Metrics
extern prometheus::simpleapi::counter_family_t data;
extern prometheus::simpleapi::counter_metric_t udp_send; extern prometheus::simpleapi::counter_metric_t udp_send;
extern prometheus::simpleapi::counter_metric_t udp_recv; extern prometheus::simpleapi::counter_metric_t udp_recv;
extern prometheus::simpleapi::counter_metric_t tcp_send; extern prometheus::simpleapi::counter_metric_t tcp_send;
extern prometheus::simpleapi::counter_metric_t tcp_recv; extern prometheus::simpleapi::counter_metric_t tcp_recv;
// Network Metrics // Network Metrics
extern prometheus::simpleapi::gauge_metric_t network_num_joined; extern prometheus::simpleapi::gauge_metric_t network_num_joined;
extern prometheus::simpleapi::gauge_family_t network_num_multicast_groups; extern prometheus::simpleapi::gauge_family_t network_num_multicast_groups;
extern prometheus::simpleapi::counter_family_t network_incoming_packets; extern prometheus::simpleapi::counter_family_t network_packets;
extern prometheus::simpleapi::counter_family_t network_outgoing_packets;
#ifndef ZT_NO_PEER_METRICS
// Peer Metrics // Peer Metrics
extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency; extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency;
extern prometheus::simpleapi::gauge_family_t peer_path_count; extern prometheus::simpleapi::gauge_family_t peer_path_count;
extern prometheus::simpleapi::counter_family_t peer_incoming_packets; extern prometheus::simpleapi::counter_family_t peer_packets;
extern prometheus::simpleapi::counter_family_t peer_outgoing_packets;
extern prometheus::simpleapi::counter_family_t peer_packet_errors; extern prometheus::simpleapi::counter_family_t peer_packet_errors;
#endif
// General Controller Metrics // General Controller Metrics
extern prometheus::simpleapi::gauge_metric_t network_count; extern prometheus::simpleapi::gauge_metric_t network_count;

View File

@ -569,10 +569,10 @@ Network::Network(const RuntimeEnvironment *renv,void *tPtr,uint64_t nwid,void *u
_netconfFailure(NETCONF_FAILURE_NONE), _netconfFailure(NETCONF_FAILURE_NONE),
_portError(0), _portError(0),
_num_multicast_groups{Metrics::network_num_multicast_groups.Add({{"network_id", _nwidStr}})}, _num_multicast_groups{Metrics::network_num_multicast_groups.Add({{"network_id", _nwidStr}})},
_incoming_packets_accpeted{Metrics::network_incoming_packets.Add({{"network_id", _nwidStr},{"accepted","yes"}})}, _incoming_packets_accepted{Metrics::network_packets.Add({{"direction","rx"},{"network_id", _nwidStr},{"accepted","yes"}})},
_incoming_packets_dropped{Metrics::network_incoming_packets.Add({{"network_id", _nwidStr},{"accepted","no"}})}, _incoming_packets_dropped{Metrics::network_packets.Add({{"direction","rx"},{"network_id", _nwidStr},{"accepted","no"}})},
_outgoing_packets_accepted{Metrics::network_outgoing_packets.Add({{"network_id", _nwidStr},{"accepted","yes"}})}, _outgoing_packets_accepted{Metrics::network_packets.Add({{"direction","tx"},{"network_id", _nwidStr},{"accepted","yes"}})},
_outgoing_packets_dropped{Metrics::network_outgoing_packets.Add({{"network_id", _nwidStr},{"accepted","no"}})} _outgoing_packets_dropped{Metrics::network_packets.Add({{"direction","tx"},{"network_id", _nwidStr},{"accepted","no"}})}
{ {
for(int i=0;i<ZT_NETWORK_MAX_INCOMING_UPDATES;++i) { for(int i=0;i<ZT_NETWORK_MAX_INCOMING_UPDATES;++i) {
_incomingConfigChunks[i].ts = 0; _incomingConfigChunks[i].ts = 0;
@ -837,7 +837,7 @@ int Network::filterIncomingPacket(
} }
if (accept) { if (accept) {
_incoming_packets_accpeted++; _incoming_packets_accepted++;
if (cc) { if (cc) {
Packet outp(cc,RR->identity.address(),Packet::VERB_EXT_FRAME); Packet outp(cc,RR->identity.address(),Packet::VERB_EXT_FRAME);
outp.append(_id); outp.append(_id);

View File

@ -483,7 +483,7 @@ private:
AtomicCounter __refCount; AtomicCounter __refCount;
prometheus::simpleapi::gauge_metric_t _num_multicast_groups; prometheus::simpleapi::gauge_metric_t _num_multicast_groups;
prometheus::simpleapi::counter_metric_t _incoming_packets_accpeted; prometheus::simpleapi::counter_metric_t _incoming_packets_accepted;
prometheus::simpleapi::counter_metric_t _incoming_packets_dropped; prometheus::simpleapi::counter_metric_t _incoming_packets_dropped;
prometheus::simpleapi::counter_metric_t _outgoing_packets_accepted; prometheus::simpleapi::counter_metric_t _outgoing_packets_accepted;
prometheus::simpleapi::counter_metric_t _outgoing_packets_dropped; prometheus::simpleapi::counter_metric_t _outgoing_packets_dropped;

View File

@ -28,35 +28,37 @@ namespace ZeroTier {
static unsigned char s_freeRandomByteCounter = 0; static unsigned char s_freeRandomByteCounter = 0;
Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) : Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity)
RR(renv), : RR(renv)
_lastReceive(0), , _lastReceive(0)
_lastNontrivialReceive(0), , _lastNontrivialReceive(0)
_lastTriedMemorizedPath(0), , _lastTriedMemorizedPath(0)
_lastDirectPathPushSent(0), , _lastDirectPathPushSent(0)
_lastDirectPathPushReceive(0), , _lastDirectPathPushReceive(0)
_lastCredentialRequestSent(0), , _lastCredentialRequestSent(0)
_lastWhoisRequestReceived(0), , _lastWhoisRequestReceived(0)
_lastCredentialsReceived(0), , _lastCredentialsReceived(0)
_lastTrustEstablishedPacketReceived(0), , _lastTrustEstablishedPacketReceived(0)
_lastSentFullHello(0), , _lastSentFullHello(0)
_lastEchoCheck(0), , _lastEchoCheck(0)
_freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter), , _freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter)
_vProto(0), , _vProto(0)
_vMajor(0), , _vMajor(0)
_vMinor(0), , _vMinor(0)
_vRevision(0), , _vRevision(0)
_id(peerIdentity), , _id(peerIdentity)
_directPathPushCutoffCount(0), , _directPathPushCutoffCount(0)
_echoRequestCutoffCount(0), , _echoRequestCutoffCount(0)
_localMultipathSupported(false), , _localMultipathSupported(false)
_lastComputedAggregateMeanLatency(0), , _lastComputedAggregateMeanLatency(0)
_peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})}, #ifndef ZT_NO_PEER_METRICS
_alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})}, , _peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})}
_dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})}, , _alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})}
_incoming_packet{Metrics::peer_incoming_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}, , _dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})}
_outgoing_packet{Metrics::peer_outgoing_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}, , _incoming_packet{Metrics::peer_packets.Add({{"direction", "rx"},{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
_packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})} , _outgoing_packet{Metrics::peer_packets.Add({{"direction", "tx"},{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
, _packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
#endif
{ {
if (!myIdentity.agree(peerIdentity,_key)) { if (!myIdentity.agree(peerIdentity,_key)) {
throw ZT_EXCEPTION_INVALID_ARGUMENT; throw ZT_EXCEPTION_INVALID_ARGUMENT;
@ -97,7 +99,9 @@ void Peer::received(
default: default:
break; break;
} }
#ifndef ZT_NO_PEER_METRICS
_incoming_packet++; _incoming_packet++;
#endif
recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now); recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);
if (trustEstablished) { if (trustEstablished) {
@ -569,6 +573,7 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
deletionOccurred = false; deletionOccurred = false;
} }
} }
#ifndef ZT_NO_PEER_METRICS
uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0; uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) { if (_paths[i].p) {
@ -582,8 +587,11 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
} }
_alive_path_count = alive_path_count_tmp; _alive_path_count = alive_path_count_tmp;
_dead_path_count = dead_path_count_tmp; _dead_path_count = dead_path_count_tmp;
#endif
} }
#ifndef ZT_NO_PEER_METRICS
_peer_latency.Observe(latency(now)); _peer_latency.Observe(latency(now));
#endif
return sent; return sent;
} }
@ -658,7 +666,9 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId, void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now) uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
{ {
#ifndef ZT_NO_PEER_METRICS
_outgoing_packet++; _outgoing_packet++;
#endif
if (_localMultipathSupported && _bond) { if (_localMultipathSupported && _bond) {
_bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now); _bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
} }
@ -666,7 +676,9 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack
void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path) void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
{ {
#ifndef ZT_NO_PEER_METRICS
_packet_errors++; _packet_errors++;
#endif
if (_localMultipathSupported && _bond) { if (_localMultipathSupported && _bond) {
_bond->recordIncomingInvalidPacket(path); _bond->recordIncomingInvalidPacket(path);
} }

View File

@ -599,12 +599,14 @@ private:
SharedPtr<Bond> _bond; SharedPtr<Bond> _bond;
#ifndef ZT_NO_PEER_METRICS
prometheus::Histogram<uint64_t> &_peer_latency; prometheus::Histogram<uint64_t> &_peer_latency;
prometheus::simpleapi::gauge_metric_t _alive_path_count; prometheus::simpleapi::gauge_metric_t _alive_path_count;
prometheus::simpleapi::gauge_metric_t _dead_path_count; prometheus::simpleapi::gauge_metric_t _dead_path_count;
prometheus::simpleapi::counter_metric_t _incoming_packet; prometheus::simpleapi::counter_metric_t _incoming_packet;
prometheus::simpleapi::counter_metric_t _outgoing_packet; prometheus::simpleapi::counter_metric_t _outgoing_packet;
prometheus::simpleapi::counter_metric_t _packet_errors; prometheus::simpleapi::counter_metric_t _packet_errors;
#endif
}; };
} // namespace ZeroTier } // namespace ZeroTier