diff --git a/node/Constants.hpp b/node/Constants.hpp index 1a47112a7..a2ba1c1a0 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -274,7 +274,7 @@ /** * No answer timeout to trigger dead path detection */ -#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 3000 +#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 2500 /** * Probation threshold after which a path becomes dead diff --git a/node/Path.hpp b/node/Path.hpp index 23e47408a..7e0a8ed1f 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -66,6 +66,8 @@ class Path public: Path() : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(), _localAddress(), @@ -76,6 +78,8 @@ public: Path(const InetAddress &localAddress,const InetAddress &addr) : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(addr), _localAddress(localAddress), @@ -98,10 +102,21 @@ public: * * @param t Time of send */ - inline void sent(uint64_t t) - { - _lastSend = t; - } + inline void sent(uint64_t t) { _lastSend = t; } + + /** + * Called when we've sent a ping or echo + * + * @param t Time of send + */ + inline void pinged(uint64_t t) { _lastPing = t; } + + /** + * Called when we send a NAT keepalive + * + * @param t Time of send + */ + inline void sentKeepalive(uint64_t t) { _lastKeepalive = t; } /** * Called when a packet is received from this remote path @@ -145,6 +160,16 @@ public: */ inline uint64_t lastSend() const throw() { return _lastSend; } + /** + * @return Time we last pinged or dead path checked this link + */ + inline uint64_t lastPing() const throw() { return _lastPing; } + + /** + * @return Time of last keepalive + */ + inline uint64_t lastKeepalive() const throw() { return _lastKeepalive; } + /** * @return Time of last receive from this path */ @@ -260,8 +285,10 @@ public: template inline void serialize(Buffer &b) const { - b.append((uint8_t)1); // version + b.append((uint8_t)2); // version b.append((uint64_t)_lastSend); + b.append((uint64_t)_lastPing); + b.append((uint64_t)_lastKeepalive); b.append((uint64_t)_lastReceived); _addr.serialize(b); _localAddress.serialize(b); @@ -273,9 +300,11 @@ public: inline unsigned int deserialize(const Buffer &b,unsigned int startAt = 0) { unsigned int p = startAt; - if (b[p++] != 1) + if (b[p++] != 2) throw std::invalid_argument("invalid serialized Path"); _lastSend = b.template at(p); p += 8; + _lastPing = b.template at(p); p += 8; + _lastKeepalive = b.template at(p); p += 8; _lastReceived = b.template at(p); p += 8; p += _addr.deserialize(b,p); p += _localAddress.deserialize(b,p); @@ -290,6 +319,8 @@ public: private: uint64_t _lastSend; + uint64_t _lastPing; + uint64_t _lastKeepalive; uint64_t _lastReceived; InetAddress _addr; InetAddress _localAddress; diff --git a/node/Peer.cpp b/node/Peer.cpp index a98d94c4b..aff610d52 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -168,7 +168,10 @@ void Peer::received( } else { uint64_t slotLRmin = 0xffffffffffffffffULL; for(unsigned int p=0;paddress().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); sendHELLO(p->localAddress(),p->address(),now); p->sent(now); - } else if (((now - p->lastSend()) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { + p->pinged(now); + } else if (((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { //TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); _natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying (meaningless) payloads RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf)); - p->sent(now); + p->sentKeepalive(now); } else { //TRACE("no PING or NAT keepalive: addr==%s reliable==%d %llums/%llums send/receive inactivity",p->address().toString().c_str(),(int)p->reliable(),now - p->lastSend(),now - p->lastReceived()); } @@ -339,6 +343,8 @@ bool Peer::resetWithinScope(InetAddress::IpScope scope,uint64_t now) unsigned int y = 0; while (x < np) { if (_paths[x].address().ipScope() == scope) { + // Resetting a path means sending a HELLO and then forgetting it. If we + // get OK(HELLO) then it will be re-learned. sendHELLO(_paths[x].localAddress(),_paths[x].address(),now); } else { _paths[y++] = _paths[x]; @@ -491,7 +497,22 @@ bool Peer::_checkPath(Path &p,const uint64_t now) if (!p.active(now)) return false; - if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { + /* Dead path detection: if we have sent something to this peer and have not + * yet received a reply, double check this path. The majority of outbound + * packets including Ethernet frames do generate some kind of reply either + * immediately or at some point in the near future. This will occasionally + * (every NO_ANSWER_TIMEOUT ms) check paths unnecessarily if traffic that + * does not generate a response is being sent such as multicast announcements + * or frames belonging to unidirectional UDP protocols, but the cost is very + * tiny and the benefit in reliability is very large. This takes care of many + * failure modes including crap NATs that forget links and spurious changes + * to physical network topology that cannot be otherwise detected. + * + * Each time we do this we increment a probation counter in the path. This + * counter is reset on any packet receive over this path. If it reaches the + * MAX_PROBATION threshold the path is considred dead. */ + + if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) && ((now - p.lastPing()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { @@ -499,9 +520,11 @@ bool Peer::_checkPath(Path &p,const uint64_t now) Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); outp.armor(_key,true); p.send(RR,outp.data(),outp.size(),now); + p.pinged(now); } else { sendHELLO(p.localAddress(),p.address(),now); p.sent(now); + p.pinged(now); } p.increaseProbation();