Fix for ab-mode link failure, QoS metrics (WIP)

This commit is contained in:
Joseph Henry 2022-06-08 10:01:54 -07:00
parent 75652ce667
commit 7ed5bde426
No known key found for this signature in database
GPG Key ID: C45B33FF5EBC9344
5 changed files with 177 additions and 16 deletions

View File

@ -407,8 +407,9 @@ void Bond::recordOutgoingPacket(const SharedPtr<Path>& path, uint64_t packetId,
_lastFrame = now;
}
if (shouldRecord) {
//_paths[pathIdx].expectingAckAsOf = now;
//_paths[pathIdx].totalBytesSentSinceLastAckRecieved += payloadLength;
//_paths[pathIdx].unackedBytes += payloadLength;
// Take note that we're expecting a VERB_ACK on this path as of a specific time
if (_paths[pathIdx].qosStatsOut.size() < ZT_QOS_MAX_PENDING_RECORDS) {
_paths[pathIdx].qosStatsOut[packetId] = now;
}
@ -443,10 +444,24 @@ void Bond::recordIncomingPacket(const SharedPtr<Path>& path, uint64_t packetId,
}
if (shouldRecord) {
if (_paths[pathIdx].qosStatsIn.size() < ZT_QOS_MAX_PENDING_RECORDS) {
// debug("Recording QoS information (table size = %d)", _paths[pathIdx].qosStatsIn.size());
_paths[pathIdx].qosStatsIn[packetId] = now;
++(_paths[pathIdx].packetsReceivedSinceLastQoS);
//_paths[pathIdx].packetValiditySamples.push(true);
}
else {
debug("QoS buffer full, will not record information");
}
/*
if (_paths[pathIdx].ackStatsIn.size() < ZT_ACK_MAX_PENDING_RECORDS) {
//debug("Recording ACK information (table size = %d)", _paths[pathIdx].ackStatsIn.size());
_paths[pathIdx].ackStatsIn[packetId] = payloadLength;
++(_paths[pathIdx].packetsReceivedSinceLastAck);
}
else {
debug("ACK buffer full, will not record information");
}
*/
}
}
}
@ -491,6 +506,16 @@ void Bond::receivedQoS(const SharedPtr<Path>& path, int64_t now, int count, uint
_paths[pathIdx].qosRecordSize.push(count);
}
void Bond::receivedAck(int pathIdx, int64_t now, int32_t ackedBytes)
{
/*
Mutex::Lock _l(_paths_m);
debug("received ACK of %d bytes on path %s, there are still %d un-acked bytes", ackedBytes, pathToStr(_paths[pathIdx].p).c_str(), _paths[pathIdx].unackedBytes);
_paths[pathIdx].lastAckReceived = now;
_paths[pathIdx].unackedBytes = (ackedBytes > _paths[pathIdx].unackedBytes) ? 0 : _paths[pathIdx].unackedBytes - ackedBytes;
*/
}
int32_t Bond::generateQoSPacket(int pathIdx, int64_t now, char* qosBuffer)
{
int32_t len = 0;
@ -743,12 +768,38 @@ void Bond::sendPATH_NEGOTIATION_REQUEST(void* tPtr, int pathIdx)
}
}
void Bond::sendACK(void* tPtr, int pathIdx, int64_t localSocket, const InetAddress& atAddress, int64_t now)
{
/*
Packet outp(_peer->_id.address(), RR->identity.address(), Packet::VERB_ACK);
int32_t bytesToAck = 0;
std::map<uint64_t, uint64_t>::iterator it = _paths[pathIdx].ackStatsIn.begin();
while (it != _paths[pathIdx].ackStatsIn.end()) {
bytesToAck += it->second;
++it;
}
debug("sending ACK of %d bytes on path %s (table size = %d)", bytesToAck, pathToStr(_paths[pathIdx].p).c_str(), _paths[pathIdx].ackStatsIn.size());
outp.append<uint32_t>(bytesToAck);
if (atAddress) {
outp.armor(_peer->key(), false, _peer->aesKeysIfSupported());
RR->node->putPacket(tPtr, localSocket, atAddress, outp.data(), outp.size());
}
else {
RR->sw->send(tPtr, outp, false);
}
_paths[pathIdx].ackStatsIn.clear();
_paths[pathIdx].packetsReceivedSinceLastAck = 0;
_paths[pathIdx].lastAckSent = now;
*/
}
void Bond::sendQOS_MEASUREMENT(void* tPtr, int pathIdx, int64_t localSocket, const InetAddress& atAddress, int64_t now)
{
int64_t _now = RR->node->now();
Packet outp(_peer->_id.address(), RR->identity.address(), Packet::VERB_QOS_MEASUREMENT);
char qosData[ZT_QOS_MAX_PACKET_SIZE];
int16_t len = generateQoSPacket(pathIdx, _now, qosData);
// debug("sending QOS via link %s (len=%d)", pathToStr(_paths[pathIdx].p).c_str(), len);
if (len) {
outp.append(qosData, len);
if (atAddress) {
@ -762,7 +813,6 @@ void Bond::sendQOS_MEASUREMENT(void* tPtr, int pathIdx, int64_t localSocket, con
_paths[pathIdx].lastQoSMeasurement = now;
_overheadBytes += outp.size();
}
// debug("send QOS via link %s (len=%d)", pathToStr(_paths[pathIdx].p).c_str(), len);
}
void Bond::processBackgroundBondTasks(void* tPtr, int64_t now)
@ -799,6 +849,12 @@ void Bond::processBackgroundBondTasks(void* tPtr, int64_t now)
if (_paths[i].needsToSendQoS(now, _qosSendInterval)) {
sendQOS_MEASUREMENT(tPtr, i, _paths[i].p->localSocket(), _paths[i].p->address(), now);
}
// ACK
/*
if (_paths[i].needsToSendAck(now, _ackSendInterval)) {
sendACK(tPtr, i, _paths[i].p->localSocket(), _paths[i].p->address(), now);
}
*/
}
}
}
@ -1095,6 +1151,20 @@ void Bond::estimatePathQuality(int64_t now)
log("Dropped %d QOS out-records", numDroppedQosOutRecords);
}
/*
for (unsigned int i = 0; i < ZT_MAX_PEER_NETWORK_PATHS; ++i) {
if (! _paths[i].p) {
continue;
}
// if ((now - _paths[i].lastAckReceived) > ackSendInterval) {
// debug("been a while since ACK");
// if (_paths[i].unackedBytes > 0) {
// _paths[i].unackedBytes / _paths[i].bytesSen
// }
// }
}
*/
it = _paths[i].qosStatsIn.begin();
int numDroppedQosInRecords = 0;
while (it != _paths[i].qosStatsIn.end()) {
@ -1238,6 +1308,7 @@ void Bond::dequeueNextActiveBackupPath(uint64_t now)
bool Bond::abForciblyRotateLink()
{
Mutex::Lock _l(_paths_m);
if (_policy == ZT_BOND_POLICY_ACTIVE_BACKUP) {
int prevPathIdx = _abPathIdx;
dequeueNextActiveBackupPath(RR->node->now());
@ -1366,12 +1437,18 @@ void Bond::processActiveBackupTasks(void* tPtr, int64_t now)
// Remove ineligible paths from the failover link queue
for (std::deque<int>::iterator it(_abFailoverQueue.begin()); it != _abFailoverQueue.end();) {
if (! _paths[(*it)].p) {
log("link is no longer valid, removing from failover queue (%zu links remain in queue)", _abFailoverQueue.size());
it = _abFailoverQueue.erase(it);
continue;
}
if (_paths[(*it)].p && ! _paths[(*it)].eligible) {
SharedPtr<Link> link = RR->bc->getLinkBySocket(_policyAlias, _paths[(*it)].p->localSocket());
it = _abFailoverQueue.erase(it);
if (link) {
log("link %s is ineligible, removing from failover queue (%zu links in queue)", pathToStr(_paths[_abPathIdx].p).c_str(), _abFailoverQueue.size());
log("link %s is ineligible, removing from failover queue (%zu links remain in queue)", pathToStr(_paths[_abPathIdx].p).c_str(), _abFailoverQueue.size());
}
continue;
}
else {
++it;
@ -1533,8 +1610,17 @@ void Bond::processActiveBackupTasks(void* tPtr, int64_t now)
if (prevActiveBackupPathIdx != _abPathIdx) {
_lastActiveBackupPathChange = now;
}
if (_abFailoverQueue.empty()) {
return; // No sense in continuing since there are no links to switch to
}
if (_abLinkSelectMethod == ZT_BOND_RESELECTION_POLICY_ALWAYS) {
SharedPtr<Link> abLink = getLink(_paths[_abPathIdx].p);
if (! _paths[_abFailoverQueue.front()].p) {
log("invalid link. not switching");
return;
}
SharedPtr<Link> abFailoverLink = getLink(_paths[_abFailoverQueue.front()].p);
if (abLink && ! abLink->primary() && _paths[_abFailoverQueue.front()].p && abFailoverLink && abFailoverLink->primary()) {
dequeueNextActiveBackupPath(now);
@ -1589,6 +1675,7 @@ void Bond::initTimers()
_lastPathNegotiationCheck = 0;
_lastPathNegotiationReceived = 0;
_lastQoSRateCheck = 0;
_lastAckRateCheck = 0;
_lastQualityEstimation = 0;
_lastBondStatusLog = 0;
_lastSummaryDump = 0;
@ -1621,10 +1708,6 @@ void Bond::setBondParameters(int policy, SharedPtr<Bond> templateBond, bool useT
_localUtility = 0;
_negotiatedPathIdx = 0;
// QOS Verb (and related checks)
_qosCutoffCount = 0;
// User preferences which may override the default bonding algorithm's behavior
_userHasSpecifiedPrimaryLink = false;
@ -1717,7 +1800,9 @@ void Bond::setBondParameters(int policy, SharedPtr<Bond> templateBond, bool useT
_monitorInterval = _failoverInterval / ZT_BOND_ECHOS_PER_FAILOVER_INTERVAL;
_qualityEstimationInterval = _failoverInterval * 2;
_qosSendInterval = _failoverInterval * 2;
_ackSendInterval = _failoverInterval * 2;
_qosCutoffCount = 0;
_ackCutoffCount = 0;
_defaultPathRefractoryPeriod = 8000;
}
@ -1736,7 +1821,7 @@ void Bond::setUserQualityWeights(float weights[], int len)
SharedPtr<Link> Bond::getLink(const SharedPtr<Path>& path)
{
return RR->bc->getLinkBySocket(_policyAlias, path->localSocket());
return ! path ? SharedPtr<Link>() : RR->bc->getLinkBySocket(_policyAlias, path->localSocket());
}
std::string Bond::pathToStr(const SharedPtr<Path>& path)

View File

@ -638,6 +638,15 @@ class Bond {
*/
void receivedQoS(const SharedPtr<Path>& path, int64_t now, int count, uint64_t* rx_id, uint16_t* rx_ts);
/**
* Process the contents of an inbound VERB_ACK to gather path quality observations.
*
* @param pathIdx Path over which packet was received
* @param now Current time
* @param ackedBytes Number of bytes ACKed by this VERB_ACK
*/
void receivedAck(int pathIdx, int64_t now, int32_t ackedBytes);
/**
* Generate the contents of a VERB_QOS_MEASUREMENT packet.
*
@ -879,6 +888,26 @@ class Bond {
*/
void processBackgroundBondTasks(void* tPtr, int64_t now);
/**
* Rate limit gate for VERB_ACK
*
* @param now Current time
* @return Whether the incoming packet should be rate-gated
*/
inline bool rateGateACK(const int64_t now)
{
_ackCutoffCount++;
int numToDrain = _lastAckRateCheck ? (now - _lastAckRateCheck) / ZT_ACK_DRAINAGE_DIVISOR : _ackCutoffCount;
_lastAckRateCheck = now;
if (_ackCutoffCount > numToDrain) {
_ackCutoffCount -= numToDrain;
}
else {
_ackCutoffCount = 0;
}
return (_ackCutoffCount < ZT_ACK_CUTOFF_LIMIT);
}
/**
* Rate limit gate for VERB_QOS_MEASUREMENT
*
@ -1204,7 +1233,11 @@ class Bond {
private:
struct NominatedPath {
NominatedPath()
: lastQoSMeasurement(0)
: lastAckSent(0)
, lastAckReceived(0)
, unackedBytes(0)
, packetsReceivedSinceLastAck(0)
, lastQoSMeasurement(0)
, lastThroughputEstimation(0)
, lastRefractoryUpdate(0)
, lastAliveToggle(0)
@ -1295,6 +1328,15 @@ class Bond {
return ((packetsReceivedSinceLastQoS >= ZT_QOS_TABLE_SIZE) || ((now - lastQoSMeasurement) > qosSendInterval)) && packetsReceivedSinceLastQoS;
}
/**
* @param now Current time
* @return Whether an ACK (VERB_ACK) packet needs to be emitted at this time
*/
inline bool needsToSendAck(int64_t now, int ackSendInterval)
{
return ((now - lastAckSent) >= ackSendInterval || (packetsReceivedSinceLastAck == ZT_QOS_TABLE_SIZE)) && packetsReceivedSinceLastAck;
}
/**
* Reset packet counters
*/
@ -1306,6 +1348,7 @@ class Bond {
std::map<uint64_t, uint64_t> qosStatsOut; // id:egress_time
std::map<uint64_t, uint64_t> qosStatsIn; // id:now
std::map<uint64_t, uint64_t> ackStatsIn; // id:now
RingBuffer<int, ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> qosRecordSize;
RingBuffer<float, ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> qosRecordLossSamples;
@ -1314,6 +1357,11 @@ class Bond {
RingBuffer<float, ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> throughputVarianceSamples;
RingBuffer<uint16_t, ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> latencySamples;
uint64_t lastAckSent;
uint64_t lastAckReceived;
uint64_t unackedBytes;
uint64_t packetsReceivedSinceLastAck;
uint64_t lastQoSMeasurement; // Last time that a VERB_QOS_MEASUREMENT was sent out on this path.
uint64_t lastThroughputEstimation; // Last time that the path's throughput was estimated.
uint64_t lastRefractoryUpdate; // The last time that the refractory period was updated.
@ -1513,7 +1561,9 @@ class Bond {
* Rate-limiting
*/
uint16_t _qosCutoffCount;
uint16_t _ackCutoffCount;
uint64_t _lastQoSRateCheck;
uint64_t _lastAckRateCheck;
uint16_t _pathNegotiationCutoffCount;
uint64_t _lastPathNegotiationReceived;

View File

@ -372,6 +372,11 @@
*/
#define ZT_QOS_TABLE_SIZE ((ZT_QOS_MAX_PACKET_SIZE * 8) / (64 + 16))
/**
* Maximum number of packets we monitor for ACK information at any given time
*/
#define ZT_ACK_MAX_PENDING_RECORDS (32 * 1024)
/**
* Maximum number of packets we monitor for QoS information at any given time
*/

View File

@ -88,6 +88,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr,int32_t f
peer->received(tPtr,_path,hops(),packetId(),payloadLength(),v,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
break;
case Packet::VERB_HELLO: r = _doHELLO(RR,tPtr,true); break;
case Packet::VERB_ACK : r = _doACK(RR,tPtr,peer); break;
case Packet::VERB_QOS_MEASUREMENT: r = _doQOS_MEASUREMENT(RR,tPtr,peer); break;
case Packet::VERB_ERROR: r = _doERROR(RR,tPtr,peer); break;
case Packet::VERB_OK: r = _doOK(RR,tPtr,peer); break;
@ -250,28 +251,47 @@ bool IncomingPacket::_doERROR(const RuntimeEnvironment *RR,void *tPtr,const Shar
return true;
}
bool IncomingPacket::_doQOS_MEASUREMENT(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
bool IncomingPacket::_doACK(const RuntimeEnvironment* RR, void* tPtr, const SharedPtr<Peer>& peer)
{
/*
SharedPtr<Bond> bond = peer->bond();
if (! bond || ! bond->rateGateACK(RR->node->now())) {
return true;
}
int32_t ackedBytes;
if (payloadLength() != sizeof(ackedBytes)) {
return true; // ignore
}
memcpy(&ackedBytes, payload(), sizeof(ackedBytes));
if (bond) {
bond->receivedAck(_path, RR->node->now(), Utils::ntoh(ackedBytes));
}
*/
return true;
}
bool IncomingPacket::_doQOS_MEASUREMENT(const RuntimeEnvironment* RR, void* tPtr, const SharedPtr<Peer>& peer)
{
SharedPtr<Bond> bond = peer->bond();
if (!bond || !bond->rateGateQoS(RR->node->now(), _path)) {
if (! bond || ! bond->rateGateQoS(RR->node->now(), _path)) {
return true;
}
if (payloadLength() > ZT_QOS_MAX_PACKET_SIZE || payloadLength() < ZT_QOS_MIN_PACKET_SIZE) {
return true; // ignore
return true; // ignore
}
const int64_t now = RR->node->now();
uint64_t rx_id[ZT_QOS_TABLE_SIZE];
uint16_t rx_ts[ZT_QOS_TABLE_SIZE];
char *begin = (char *)payload();
char *ptr = begin;
char* begin = (char*)payload();
char* ptr = begin;
int count = 0;
unsigned int len = payloadLength();
// Read packet IDs and latency compensation intervals for each packet tracked by this QoS packet
while (ptr < (begin + len) && (count < ZT_QOS_TABLE_SIZE)) {
memcpy((void*)&rx_id[count], ptr, sizeof(uint64_t));
ptr+=sizeof(uint64_t);
ptr += sizeof(uint64_t);
memcpy((void*)&rx_ts[count], ptr, sizeof(uint16_t));
ptr+=sizeof(uint16_t);
ptr += sizeof(uint16_t);
count++;
}
if (bond) {

View File

@ -116,6 +116,7 @@ private:
// been authenticated, decrypted, decompressed, and classified.
bool _doERROR(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
bool _doHELLO(const RuntimeEnvironment *RR,void *tPtr,const bool alreadyAuthenticated);
bool _doACK(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
bool _doQOS_MEASUREMENT(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
bool _doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
bool _doWHOIS(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);