Introduced basic multipath support

This commit is contained in:
Joseph Henry 2018-05-01 16:32:15 -07:00
parent 836d897aec
commit 6a2ba4baca
17 changed files with 1314 additions and 45 deletions

View File

@ -422,6 +422,42 @@ enum ZT_ResultCode
*/
#define ZT_ResultCode_isFatal(x) ((((int)(x)) >= 100)&&(((int)(x)) < 1000))
/**
* The multipath algorithm in use by this node.
*/
enum ZT_MultipathMode
{
/**
* No active multipath.
*
* Traffic is merely sent over the strongest path. That being
* said, this mode will automatically failover in the event that a link goes down.
*/
ZT_MULTIPATH_NONE = 0,
/**
* Traffic is randomly distributed among all active paths.
*
* Will cease sending traffic over links that appear to be stale.
*/
ZT_MULTIPATH_RANDOM = 1,
/**
* Traffic is allocated across all active paths in proportion to their strength and
* reliability.
*
* Will cease sending traffic over links that appear to be stale.
*/
ZT_MULTIPATH_PROPORTIONALLY_BALANCED = 2,
/**
* Traffic is allocated across a user-defined interface/allocation
*
* Will cease sending traffic over links that appear to be stale.
*/
ZT_MULTIPATH_MANUALLY_BALANCED = 3
};
/**
* Status codes sent to status update callback when things happen
*/

View File

@ -267,6 +267,98 @@
*/
#define ZT_PING_CHECK_INVERVAL 5000
/**
* Length of interface name
*/
#define ZT_PATH_INTERFACE_NAME_SZ 16
/**
* How frequently to check for changes to the system's network interfaces. When
* the service decides to use this constant it's because we want to react more
* quickly to new interfaces that pop up or go down.
*/
#define ZT_MULTIPATH_BINDER_REFRESH_PERIOD 5000
/**
* Path choice history window size. This is used to keep track of which paths were
* previously selected so that we can maintain a target allocation over time.
*/
#define ZT_MULTIPATH_PROPORTION_WIN_SZ 128
/**
* Threshold for flow to be considered balanced.
*/
#define ZT_MULTIPATH_FLOW_BALANCE_THESHOLD 0.80
/**
* Number of samples to consider when computing path statistics
*/
#define ZT_PATH_QUALITY_METRIC_WIN_SZ 128
/**
* How often important path metrics are sampled (in ms). These metrics are later used
* for path quality estimates
*/
#define ZT_PATH_QUALITY_SAMPLE_INTERVAL 100
/**
* How often new path quality estimates are computed
*/
#define ZT_PATH_QUALITY_ESTIMATE_INTERVAL 100
/**
* How often we will sample packet latency. Should be at least greater than ZT_PING_CHECK_INVERVAL
* since we will record a 0 bit/s measurement if no valid latency measurement was made within this
* window of time.
*/
#define ZT_PATH_LATENCY_SAMPLE_INTERVAL ZT_PING_CHECK_INVERVAL * 2
/**
* Interval used for rate-limiting the computation of path quality estimates. Set at 0
* to compute as new packets arrive with no delay.
*/
#define ZT_PATH_QUALITY_COMPUTE_INTERVAL 0
/**
* Path error rate history window size. This is used to keep track of packet error
* measurements over a path's medium-term history.
*/
#define ZT_PATH_ERROR_HIST_WIN_SZ 10
/**
* The number of packet error measurements in each sample
*/
#define ZT_PATH_ERROR_SAMPLE_WIN_SZ 1024
/**
* How often a peer will prune its own paths. Pruning is important when multipath is
* enabled because we want to prevent the allocation algorithms from sending anything
* out on known dead paths. Additionally, quickly marking paths as dead helps when
* a new path is learned and needs to replace an older path.
*/
#define ZT_CLOSED_PATH_PRUNING_INTERVAL 1000
/**
* Datagram used to test link throughput. Contents are random.
*/
#define ZT_LINK_TEST_DATAGRAM_SZ 1024
/**
* Size of datagram expected as a reply to a link speed test
*/
#define ZT_LINK_TEST_DATAGRAM_RESPONSE_SZ 8
/**
* Time before a link test datagram is considered lost. Any corresponding
* timing records that would have been used to compute a RTT are purged.
*/
#define ZT_LINK_TEST_TIMEOUT 10000
/**
* How often the service tests the link throughput.
*/
#define ZT_LINK_SPEED_TEST_INTERVAL 1000
/**
* How frequently to send heartbeats over in-use paths
*/

View File

@ -80,6 +80,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
if (!trusted) {
if (!dearmor(peer->key())) {
RR->t->incomingPacketMessageAuthenticationFailure(tPtr,_path,packetId(),sourceAddress,hops(),"invalid MAC");
_path->recordPacket(false);
return true;
}
}
@ -89,6 +90,8 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
return true;
}
_path->recordPacket(true);
const Packet::Verb v = verb();
switch(v) {
//case Packet::VERB_NOP:
@ -446,7 +449,7 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedP
}
if (!hops())
_path->updateLatency((unsigned int)latency);
_path->updateLatency((unsigned int)latency, RR->node->now());
peer->setRemoteVersion(vProto,vMajor,vMinor,vRevision);

View File

@ -190,7 +190,7 @@ void Multicaster::send(
for(unsigned int i=0;i<multicastReplicatorCount;++i) {
const SharedPtr<Peer> p(RR->topology->getPeerNoCache(multicastReplicators[i]));
if ((p)&&(p->isAlive(now))) {
const SharedPtr<Path> pp(p->getBestPath(now,false));
const SharedPtr<Path> pp(p->getAppropriatePath(now,false));
if ((pp)&&(pp->latency() < bestMulticastReplicatorLatency)) {
bestMulticastReplicatorLatency = pp->latency();
bestMulticastReplicatorPath = pp;

View File

@ -234,7 +234,7 @@ public:
}
if ((!contacted)&&(_bestCurrentUpstream)) {
const SharedPtr<Path> up(_bestCurrentUpstream->getBestPath(_now,true));
const SharedPtr<Path> up(_bestCurrentUpstream->getAppropriatePath(_now,true));
if (up)
p->sendHELLO(_tPtr,up->localSocket(),up->address(),_now);
}
@ -465,7 +465,7 @@ ZT_PeerList *Node::peers() const
p->role = RR->topology->role(pi->second->identity().address());
std::vector< SharedPtr<Path> > paths(pi->second->paths(_now));
SharedPtr<Path> bestp(pi->second->getBestPath(_now,false));
SharedPtr<Path> bestp(pi->second->getAppropriatePath(_now,false));
p->pathCount = 0;
for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) {
ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage));

View File

@ -260,6 +260,9 @@ public:
inline const Address &remoteTraceTarget() const { return _remoteTraceTarget; }
inline Trace::Level remoteTraceLevel() const { return _remoteTraceLevel; }
inline void setMultipathMode(uint8_t mode) { _multipathMode = mode; }
inline uint8_t getMultipathMode() { return _multipathMode; }
private:
RuntimeEnvironment _RR;
RuntimeEnvironment *RR;
@ -284,6 +287,8 @@ private:
Address _remoteTraceTarget;
enum Trace::Level _remoteTraceLevel;
uint8_t _multipathMode;
volatile int64_t _now;
int64_t _lastPingCheck;
int64_t _lastHousekeepingRun;

View File

@ -39,6 +39,9 @@
#include "SharedPtr.hpp"
#include "AtomicCounter.hpp"
#include "Utils.hpp"
#include "RingBuffer.hpp"
#include "../osdep/Phy.hpp"
/**
* Maximum return value of preferenceRank()
@ -55,6 +58,7 @@ class RuntimeEnvironment;
class Path
{
friend class SharedPtr<Path>;
Phy<Path *> *_phy;
public:
/**
@ -93,22 +97,71 @@ public:
_lastOut(0),
_lastIn(0),
_lastTrustEstablishedPacketReceived(0),
_lastPathQualityComputeTime(0),
_localSocket(-1),
_latency(0xffff),
_addr(),
_ipScope(InetAddress::IP_SCOPE_NONE)
_ipScope(InetAddress::IP_SCOPE_NONE),
_currentPacketSampleCounter(0),
_meanPacketErrorRatio(0.0),
_meanLatency(0.0),
_lastLatencyUpdate(0),
_jitter(0.0),
_lastPathQualitySampleTime(0),
_lastComputedQuality(0.0),
_lastPathQualityEstimate(0),
_meanAge(0.0),
_meanThroughput(0.0),
_packetLossRatio(0)
{
memset(_ifname, 0, sizeof(_ifname));
memset(_addrString, 0, sizeof(_addrString));
_throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
_ageSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
_latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
_errSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
}
Path(const int64_t localSocket,const InetAddress &addr) :
_lastOut(0),
_lastIn(0),
_lastTrustEstablishedPacketReceived(0),
_lastPathQualityComputeTime(0),
_localSocket(localSocket),
_latency(0xffff),
_addr(addr),
_ipScope(addr.ipScope())
_ipScope(addr.ipScope()),
_currentPacketSampleCounter(0),
_meanPacketErrorRatio(0.0),
_meanLatency(0.0),
_lastLatencyUpdate(0),
_jitter(0.0),
_lastPathQualitySampleTime(0),
_lastComputedQuality(0.0),
_lastPathQualityEstimate(0),
_meanAge(0.0),
_meanThroughput(0.0),
_packetLossRatio(0)
{
memset(_ifname, 0, sizeof(_ifname));
memset(_addrString, 0, sizeof(_addrString));
_throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
_ageSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
_latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
_errSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
}
~Path()
{
delete _throughputSamples;
delete _ageSamples;
delete _latencySamples;
delete _errSamples;
_throughputSamples = NULL;
_ageSamples = NULL;
_latencySamples = NULL;
_errSamples = NULL;
}
/**
@ -147,12 +200,17 @@ public:
*
* @param l Measured latency
*/
inline void updateLatency(const unsigned int l)
inline void updateLatency(const unsigned int l, int64_t now)
{
unsigned int pl = _latency;
if (pl < 0xffff)
if (pl < 0xffff) {
_latency = (pl + l) / 2;
else _latency = l;
}
else {
_latency = l;
}
_lastLatencyUpdate = now;
_latencySamples->push(l);
}
/**
@ -240,11 +298,180 @@ public:
return (((age < (ZT_PATH_HEARTBEAT_PERIOD + 5000)) ? l : (l + 0xffff + age)) * (long)((ZT_INETADDRESS_MAX_SCOPE - _ipScope) + 1));
}
/**
* @return An estimate of path quality -- higher is better.
*/
inline float computeQuality(const int64_t now)
{
float latency_contrib = _meanLatency ? 1.0 / _meanLatency : 0;
float jitter_contrib = _jitter ? 1.0 / _jitter : 0;
float throughput_contrib = _meanThroughput ? _meanThroughput / 1000000 : 0; // in Mbps
float age_contrib = _meanAge > 0 ? (float)sqrt(_meanAge) : 1;
float error_contrib = 1.0 - _meanPacketErrorRatio;
float sum = (latency_contrib + jitter_contrib + throughput_contrib + error_contrib) / age_contrib;
_lastComputedQuality = sum * (long)((_ipScope) + 1);
return _lastComputedQuality;
}
/**
* Since quality estimates can become expensive we should cache the most recent result for traffic allocation
* algorithms which may need to reference this value multiple times through the course of their execution.
*/
inline float lastComputedQuality() {
return _lastComputedQuality;
}
/**
* @return A pointer to a cached copy of the human-readable name of the interface this Path's localSocket is bound to
*/
inline char *getName() { return _ifname; }
/**
* @return Estimated throughput in bps of this link
*/
inline uint64_t getThroughput() { return _phy->getThroughput((PhySocket *)((uintptr_t)_localSocket)); }
/**
* @return Packet delay varience
*/
inline float jitter() { return _jitter; }
/**
* @return Previously-computed mean latency
*/
inline float meanLatency() { return _meanLatency; }
/**
* @return Packet loss rate
*/
inline float packetLossRatio() { return _packetLossRatio; }
/**
* @return Mean packet error ratio
*/
inline float meanPacketErrorRatio() { return _meanPacketErrorRatio; }
/**
* @return Current packet error ratio (possibly incomplete sample set)
*/
inline float currentPacketErrorRatio() {
int errorsPerSample = 0;
for (int i=0; i<_currentPacketSampleCounter; i++) {
if (_packetValidity[i] == false) {
errorsPerSample++;
}
}
return (float)errorsPerSample / (float)ZT_PATH_ERROR_SAMPLE_WIN_SZ;
}
/**
* @return Whether the Path's local socket is in a CLOSED state
*/
inline bool isClosed() { return _phy->isClosed((PhySocket *)((uintptr_t)_localSocket)); }
/**
* @return The state of a Path's local socket
*/
inline int getState() { return _phy->getState((PhySocket *)((uintptr_t)_localSocket)); }
/**
* @return Whether this socket may have been erased by the virtual physical link layer
*/
inline bool isValidState() { return _phy->isValidState((PhySocket *)((uintptr_t)_localSocket)); }
/**
* @return Whether the path quality monitors have collected enough data to provide a quality value
* TODO: expand this
*/
inline bool monitorsReady() {
return _latencySamples->count() && _ageSamples->count() && _throughputSamples->count();
}
/**
* @return A pointer to a cached copy of the address string for this Path (For debugging only)
*/
inline char *getAddressString() { return _addrString; }
/**
* Handle path sampling, computation of quality estimates, and other periodic tasks
* @param now Current time
*/
inline void measureLink(int64_t now) {
// Sample path properties and store them in a continuously-revolving buffer
if (now - _lastPathQualitySampleTime > ZT_PATH_QUALITY_SAMPLE_INTERVAL) {
_lastPathQualitySampleTime = now;
_throughputSamples->push(getThroughput()); // Thoughtput in bits/s
_ageSamples->push(now - _lastIn); // Age (time since last received packet)
if (now - _lastLatencyUpdate > ZT_PATH_LATENCY_SAMPLE_INTERVAL) {
_lastLatencyUpdate = now;
// Record 0 bp/s. Since we're using this to detect possible packet loss
updateLatency(0, now);
}
}
// Compute statistical values for use in link quality estimates
if (now - _lastPathQualityComputeTime > ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
_lastPathQualityComputeTime = now;
// Cache Path address string
address().toString(_addrString);
_phy->getIfName((PhySocket *)((uintptr_t)_localSocket), _ifname, ZT_PATH_INTERFACE_NAME_SZ); // Cache Interface name
// Derived values
if (_throughputSamples->count()) {
_packetLossRatio = (float)_throughputSamples->zeroCount() / (float)_throughputSamples->count();
}
_meanThroughput = _throughputSamples->mean();
_meanAge = _ageSamples->mean();
_meanLatency = _latencySamples->mean();
// Jitter
// SEE: RFC 3393, RFC 4689
_jitter = _latencySamples->stddev();
_meanPacketErrorRatio = _errSamples->mean(); // Packet Error Ratio (PER)
}
// Periodically compute a path quality estimate
if (now - _lastPathQualityEstimate > ZT_PATH_QUALITY_ESTIMATE_INTERVAL) {
computeQuality(now);
}
}
/**
* Record whether a packet is considered invalid by MAC/compression/cipher checks. This
* could be an indication of a bit error. This function will keep a running counter of
* up to a given window size and with each counter overflow it will compute a mean error rate
* and store that in a continuously shifting sample window.
*
* @param isValid Whether the packet in question is considered invalid
*/
inline void recordPacket(bool isValid) {
if (_currentPacketSampleCounter < ZT_PATH_ERROR_SAMPLE_WIN_SZ) {
_packetValidity[_currentPacketSampleCounter] = isValid;
_currentPacketSampleCounter++;
}
else {
// Sample array is full, compute an mean and stick it in the ring buffer for trend analysis
_errSamples->push(currentPacketErrorRatio());
_currentPacketSampleCounter=0;
}
}
/**
* @return The mean age (in ms) of this link
*/
inline float meanAge() { return _meanAge; }
/**
* @return The mean throughput (in bits/s) of this link
*/
inline float meanThroughput() { return _meanThroughput; }
/**
* @return True if this path is alive (receiving heartbeats)
*/
inline bool alive(const int64_t now) const { return ((now - _lastIn) < (ZT_PATH_HEARTBEAT_PERIOD + 5000)); }
/**
* @return True if this path hasn't received a packet in a "significant" amount of time
*/
inline bool stale(const int64_t now) const { return ((now - _lastIn) > ZT_LINK_SPEED_TEST_INTERVAL * 10); }
/**
* @return True if this path needs a heartbeat
*/
@ -269,11 +496,39 @@ private:
volatile int64_t _lastOut;
volatile int64_t _lastIn;
volatile int64_t _lastTrustEstablishedPacketReceived;
volatile int64_t _lastPathQualityComputeTime;
int64_t _localSocket;
volatile unsigned int _latency;
InetAddress _addr;
InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often
AtomicCounter __refCount;
// Packet Error Ratio (PER)
int _packetValidity[ZT_PATH_ERROR_SAMPLE_WIN_SZ];
int _currentPacketSampleCounter;
volatile float _meanPacketErrorRatio;
// Latency and Jitter
volatile float _meanLatency;
int64_t _lastLatencyUpdate;
volatile float _jitter;
int64_t _lastPathQualitySampleTime;
float _lastComputedQuality;
int64_t _lastPathQualityEstimate;
float _meanAge;
float _meanThroughput;
// Circular buffers used to efficiently store large time series
RingBuffer<uint64_t> *_throughputSamples;
RingBuffer<uint32_t> *_latencySamples;
RingBuffer<uint64_t> *_ageSamples;
RingBuffer<float> *_errSamples;
float _packetLossRatio;
char _ifname[ZT_PATH_INTERFACE_NAME_SZ];
char _addrString[256];
};
} // namespace ZeroTier

View File

@ -35,6 +35,7 @@
#include "Packet.hpp"
#include "Trace.hpp"
#include "InetAddress.hpp"
#include "RingBuffer.hpp"
namespace ZeroTier {
@ -59,10 +60,14 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
_vRevision(0),
_id(peerIdentity),
_directPathPushCutoffCount(0),
_credentialsCutoffCount(0)
_credentialsCutoffCount(0),
_linkBalanceStatus(false),
_linkRedundancyStatus(false)
{
if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH))
throw ZT_EXCEPTION_INVALID_ARGUMENT;
_pathChoiceHist = new RingBuffer<int>(ZT_MULTIPATH_PROPORTION_WIN_SZ);
_flowBalanceHist = new RingBuffer<float>(ZT_MULTIPATH_PROPORTION_WIN_SZ);
}
void Peer::received(
@ -95,6 +100,18 @@ void Peer::received(
path->trustedPacketReceived(now);
}
if (RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) {
if ((now - _lastPathPrune) > ZT_CLOSED_PATH_PRUNING_INTERVAL) {
_lastPathPrune = now;
prunePaths();
}
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
_paths[i].p->measureLink(now);
}
}
}
if (hops == 0) {
// If this is a direct packet (no hops), update existing paths or learn new ones
@ -232,26 +249,246 @@ void Peer::received(
}
}
SharedPtr<Path> Peer::getBestPath(int64_t now,bool includeExpired) const
SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired)
{
Mutex::Lock _l(_paths_m);
unsigned int bestPath = ZT_MAX_PEER_NETWORK_PATHS;
long bestPathQuality = 2147483647;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
if ((includeExpired)||((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION)) {
const long q = _paths[i].p->quality(now) / _paths[i].priority;
if (q <= bestPathQuality) {
bestPathQuality = q;
bestPath = i;
/**
* Send traffic across the highest quality path only. This algorithm will still
* use the old path quality metric.
*/
if (RR->node->getMultipathMode() == ZT_MULTIPATH_NONE) {
long bestPathQuality = 2147483647;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p && _paths[i].p->isValidState()) {
if ((includeExpired)||((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION)) {
const long q = _paths[i].p->quality(now) / _paths[i].priority;
if (q <= bestPathQuality) {
bestPathQuality = q;
bestPath = i;
}
}
}
} else break;
} else break;
}
if (bestPath != ZT_MAX_PEER_NETWORK_PATHS) {
return _paths[bestPath].p;
}
return SharedPtr<Path>();
}
if ((now - _lastPathPrune) > ZT_CLOSED_PATH_PRUNING_INTERVAL) {
_lastPathPrune = now;
prunePaths();
}
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
_paths[i].p->measureLink(now);
}
}
/**
* Randomly distribute traffic across all paths
*
* Behavior:
* - If path DOWN: Stop randomly choosing that path
* - If path UP: Start randomly choosing that path
* - If all paths are unresponsive: randomly choose from all paths
*/
int numAlivePaths = 0;
int numStalePaths = 0;
if (RR->node->getMultipathMode() == ZT_MULTIPATH_RANDOM) {
int alivePaths[ZT_MAX_PEER_NETWORK_PATHS];
int stalePaths[ZT_MAX_PEER_NETWORK_PATHS];
memset(&alivePaths, -1, sizeof(alivePaths));
memset(&stalePaths, -1, sizeof(stalePaths));
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
if (_paths[i].p->isValidState()) {
if (_paths[i].p->alive(now)) {
alivePaths[numAlivePaths] = i;
numAlivePaths++;
}
else {
stalePaths[numStalePaths] = i;
numStalePaths++;
}
}
}
}
unsigned int r;
Utils::getSecureRandom(&r, 1);
if (numAlivePaths > 0) {
// pick a random out of the set deemed "alive"
int rf = (float)(r %= numAlivePaths);
return _paths[alivePaths[rf]].p;
}
else if(numStalePaths > 0) {
// resort to trying any non-expired path
int rf = (float)(r %= numStalePaths);
return _paths[stalePaths[rf]].p;
}
}
/**
* Proportionally allocate traffic according to dynamic path quality measurements
*/
if (RR->node->getMultipathMode() == ZT_MULTIPATH_PROPORTIONALLY_BALANCED) {
float relq[ZT_MAX_PEER_NETWORK_PATHS];
memset(&relq, 0, sizeof(relq));
float alloc[ZT_MAX_PEER_NETWORK_PATHS];
memset(&alloc, 0, sizeof(alloc));
// Survey
//
// Take a survey of all available link qualities. We use this to determine if we
// can skip this algorithm altogether and if not, to establish baseline for physical
// link quality used in later calculations.
//
// We find the min/max quality of our currently-active links so
// that we can form a relative scale to rank each link proportionally
// to each other link.
uint16_t alivePaths[ZT_MAX_PEER_NETWORK_PATHS];
uint16_t stalePaths[ZT_MAX_PEER_NETWORK_PATHS];
memset(&alivePaths, -1, sizeof(alivePaths));
memset(&stalePaths, -1, sizeof(stalePaths));
uint16_t numAlivePaths = 0;
uint16_t numStalePaths = 0;
float minQuality = 10000;
float maxQuality = -1;
float currQuality;
for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p && _paths[i].p->isValidState()) {
if (!_paths[i].p->monitorsReady()) {
// TODO: This should fix itself anyway but we should test whether forcing the use of a new path will
// aid in establishing flow balance more quickly.
}
// Compute quality here, going forward we will use lastComputedQuality()
currQuality = _paths[i].p->computeQuality(now);
if (!_paths[i].p->stale(now)) {
alivePaths[i] = currQuality;
numAlivePaths++;
}
else {
stalePaths[i] = currQuality;
numStalePaths++;
}
if (currQuality > maxQuality) {
maxQuality = currQuality;
bestPath = i;
}
if (currQuality < minQuality) {
minQuality = currQuality;
}
relq[i] = currQuality;
}
}
// Attempt to find an excuse not to use the rest of this algorithm
if (bestPath == ZT_MAX_PEER_NETWORK_PATHS || (numAlivePaths == 0 && numStalePaths == 0)) {
return SharedPtr<Path>();
} if (numAlivePaths == 1) {
return _paths[bestPath].p;
} if (numStalePaths == 1) {
return _paths[bestPath].p;
}
// Relative quality
//
// The strongest link will have a value of 1.0 whereas every other
// link will have a value which represents some fraction of the strongest link.
float totalRelativeQuality = 0;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p && _paths[i].p->isValidState()) {
relq[i] /= maxQuality ? maxQuality : 1;
totalRelativeQuality += relq[i];
}
}
// Convert the relative quality values into flow allocations.
// Additionally, determine whether each path in the flow is
// contributing more or less than its target allocation. If
// it is contributing more than required, don't allow it to be
// randomly selected for the next packet. If however the path
// needs to contribute more to the flow, we should record
float imbalance = 0;
float qualityScalingFactor = 1.0 / totalRelativeQuality;
for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
// Out of the last N packets to this peer, how many were sent by this path?
int numPktSentWithinWin = (int)_pathChoiceHist->countValue((float)i);
// Compute traffic allocation for each path in the flow
if (_paths[i].p && _paths[i].p->isValidState()) {
// Allocation
// This is the percentage of traffic we want to send over a given path
alloc[i] = relq[i] * qualityScalingFactor;
float currProportion = numPktSentWithinWin / (float)ZT_MULTIPATH_PROPORTION_WIN_SZ;
float targetProportion = alloc[i];
float diffProportion = currProportion - targetProportion;
// Imbalance
//
// This is the sum of the distances of each path's currently observed flow contributions
// from its most recent target allocation. In other words, this is a measure of how closely we
// are adhering to our desired allocations. It is worth noting that this value can be greater
// than 1.0 if a significant change to allocations is made by the algorithm, this will
// eventually correct itself.
imbalance += fabs(diffProportion);
if (diffProportion < 0) {
alloc[i] = targetProportion;
}
else {
alloc[i] = targetProportion;
}
}
}
// Compute and record current flow balance
float balance = 1.0 - imbalance;
if (balance >= ZT_MULTIPATH_FLOW_BALANCE_THESHOLD) {
if (!_linkBalanceStatus) {
_linkBalanceStatus = true;
RR->t->peerLinkBalanced(NULL,0,*this);
}
}
else {
if (_linkBalanceStatus) {
_linkBalanceStatus = false;
RR->t->peerLinkImbalanced(NULL,0,*this);
}
}
// Record the current flow balance. Later used for computing a mean flow balance value.
_flowBalanceHist->push(balance);
// Randomly choose path from allocated candidates
unsigned int r;
Utils::getSecureRandom(&r, 1);
float rf = (float)(r %= 100) / 100;
for(int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p && _paths[i].p->isValidState() && _paths[i].p->address().isV4()) {
if (alloc[i] > 0 && rf < alloc[i]) {
bestPath = i;
_pathChoiceHist->push(bestPath); // Record which path we chose
break;
}
if (alloc[i] > 0) {
rf -= alloc[i];
}
else {
rf -= alloc[i]*-1;
}
}
}
if (bestPath < ZT_MAX_PEER_NETWORK_PATHS) {
return _paths[bestPath].p;
}
return SharedPtr<Path>();
}
// Adhere to a user-defined interface/allocation scheme
if (RR->node->getMultipathMode() == ZT_MULTIPATH_MANUALLY_BALANCED) {
// TODO
}
if (bestPath != ZT_MAX_PEER_NETWORK_PATHS)
return _paths[bestPath].p;
return SharedPtr<Path>();
}
@ -477,16 +714,34 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
}
} else break;
}
while(j < ZT_MAX_PEER_NETWORK_PATHS) {
_paths[j].lr = 0;
_paths[j].p.zero();
_paths[j].priority = 1;
++j;
if (RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) {
while(j < ZT_MAX_PEER_NETWORK_PATHS) {
_paths[j].lr = 0;
_paths[j].p.zero();
_paths[j].priority = 1;
++j;
}
}
return sent;
}
unsigned int Peer::prunePaths()
{
Mutex::Lock _l(_paths_m);
unsigned int pruned = 0;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
if(_paths[i].p->isClosed() || !_paths[i].p->isValidState()) {
_paths[i].lr = 0;
_paths[i].p.zero();
_paths[i].priority = 1;
pruned++;
}
}
}
return pruned;
}
void Peer::clusterRedirect(void *tPtr,const SharedPtr<Path> &originatingPath,const InetAddress &remoteAddress,const int64_t now)
{
SharedPtr<Path> np(RR->topology->getPath(originatingPath->localSocket(),remoteAddress));

View File

@ -65,7 +65,13 @@ private:
Peer() {} // disabled to prevent bugs -- should not be constructed uninitialized
public:
~Peer() { Utils::burn(_key,sizeof(_key)); }
~Peer() {
Utils::burn(_key,sizeof(_key));
delete _pathChoiceHist;
delete _flowBalanceHist;
_pathChoiceHist = NULL;
_flowBalanceHist = NULL;
}
/**
* Construct a new peer
@ -145,20 +151,20 @@ public:
*/
inline bool sendDirect(void *tPtr,const void *data,unsigned int len,int64_t now,bool force)
{
SharedPtr<Path> bp(getBestPath(now,force));
SharedPtr<Path> bp(getAppropriatePath(now,force));
if (bp)
return bp->send(RR,tPtr,data,len,now);
return false;
}
/**
* Get the best current direct path
* Get the most appropriate direct path based on current multipath configuration
*
* @param now Current time
* @param includeExpired If true, include even expired paths
* @return Best current path or NULL if none
*/
SharedPtr<Path> getBestPath(int64_t now,bool includeExpired) const;
SharedPtr<Path> getAppropriatePath(int64_t now, bool includeExpired);
/**
* Send VERB_RENDEZVOUS to this and another peer via the best common IP scope and path
@ -212,6 +218,16 @@ public:
*/
unsigned int doPingAndKeepalive(void *tPtr,int64_t now);
/**
* Clear paths whose localSocket(s) are in a CLOSED state or have an otherwise INVALID state.
* This should be called frequently so that we can detect and remove unproductive or invalid paths.
*
* Under the hood this is done periodically based on ZT_CLOSED_PATH_PRUNING_INTERVAL.
*
* @return Number of paths that were pruned this round
*/
unsigned int prunePaths();
/**
* Process a cluster redirect sent by this peer
*
@ -270,9 +286,9 @@ public:
/**
* @return Latency in milliseconds of best path or 0xffff if unknown / no paths
*/
inline unsigned int latency(const int64_t now) const
inline unsigned int latency(const int64_t now)
{
SharedPtr<Path> bp(getBestPath(now,false));
SharedPtr<Path> bp(getAppropriatePath(now,false));
if (bp)
return bp->latency();
return 0xffff;
@ -289,7 +305,7 @@ public:
*
* @return Relay quality score computed from latency and other factors, lower is better
*/
inline unsigned int relayQuality(const int64_t now) const
inline unsigned int relayQuality(const int64_t now)
{
const uint64_t tsr = now - _lastReceive;
if (tsr >= ZT_PEER_ACTIVITY_TIMEOUT)
@ -515,6 +531,7 @@ private:
int64_t _lastCredentialsReceived;
int64_t _lastTrustEstablishedPacketReceived;
int64_t _lastSentFullHello;
int64_t _lastPathPrune;
uint16_t _vProto;
uint16_t _vMajor;
@ -530,6 +547,13 @@ private:
unsigned int _credentialsCutoffCount;
AtomicCounter __refCount;
RingBuffer<int> *_pathChoiceHist;
RingBuffer<float> *_flowBalanceHist;
bool _linkBalanceStatus;
bool _linkRedundancyStatus;
};
} // namespace ZeroTier

315
node/RingBuffer.hpp Normal file
View File

@ -0,0 +1,315 @@
/*
* ZeroTier One - Network Virtualization Everywhere
* Copyright (C) 2011-2018 ZeroTier, Inc. https://www.zerotier.com/
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* --
*
* You can be released from the requirements of the license by purchasing
* a commercial license. Buying such a license is mandatory as soon as you
* develop commercial closed-source software that incorporates or links
* directly against ZeroTier software without disclosing the source code
* of your own application.
*/
#ifndef ZT_RINGBUFFER_H
#define ZT_RINGBUFFER_H
#include <typeinfo>
#include <cstdint>
#include <stdlib.h>
#include <memory.h>
#include <algorithm>
#include <math.h>
namespace ZeroTier {
/**
* A revolving (ring) buffer.
*
* For fast handling of continuously-evolving variables (such as path quality metrics).
* Using this, we can maintain longer sliding historical windows for important path
* metrics without the need for potentially expensive calls to memcpy/memmove.
*
* Some basic statistical functionality is implemented here in an attempt
* to reduce the complexity of code needed to interact with this type of buffer.
*/
template <class T>
class RingBuffer
{
private:
T * buf;
size_t size;
size_t begin;
size_t end;
bool wrap;
public:
/**
* create a RingBuffer with space for up to size elements.
*/
explicit RingBuffer(size_t size)
: size(size),
begin(0),
end(0),
wrap(false)
{
buf = new T[size];
memset(buf, 0, sizeof(T) * size);
}
/**
* @return A pointer to the underlying buffer
*/
T* get_buf()
{
return buf + begin;
}
/**
* Adjust buffer index pointer as if we copied data in
* @param n Number of elements to copy in
* @return Number of elements we copied in
*/
size_t produce(size_t n)
{
n = std::min(n, getFree());
if (n == 0) {
return n;
}
const size_t first_chunk = std::min(n, size - end);
end = (end + first_chunk) % size;
if (first_chunk < n) {
const size_t second_chunk = n - first_chunk;
end = (end + second_chunk) % size;
}
if (begin == end) {
wrap = true;
}
return n;
}
/**
* Fast erase, O(1).
* Merely reset the buffer pointer, doesn't erase contents
*/
void reset()
{
consume(count());
}
/**
* adjust buffer index pointer as if we copied data out
* @param n Number of elements we copied from the buffer
* @return Number of elements actually available from the buffer
*/
size_t consume(size_t n)
{
n = std::min(n, count());
if (n == 0) {
return n;
}
if (wrap) {
wrap = false;
}
const size_t first_chunk = std::min(n, size - begin);
begin = (begin + first_chunk) % size;
if (first_chunk < n) {
const size_t second_chunk = n - first_chunk;
begin = (begin + second_chunk) % size;
}
return n;
}
/**
* @param data Buffer that is to be written to the ring
* @param n Number of elements to write to the buffer
*/
size_t write(const T * data, size_t n)
{
n = std::min(n, getFree());
if (n == 0) {
return n;
}
const size_t first_chunk = std::min(n, size - end);
memcpy(buf + end, data, first_chunk * sizeof(T));
end = (end + first_chunk) % size;
if (first_chunk < n) {
const size_t second_chunk = n - first_chunk;
memcpy(buf + end, data + first_chunk, second_chunk * sizeof(T));
end = (end + second_chunk) % size;
}
if (begin == end) {
wrap = true;
}
return n;
}
/**
* Place a single value on the buffer. If the buffer is full, consume a value first.
*
* @param value A single value to be placed in the buffer
*/
void push(const T value)
{
if (count() == size) {
consume(1);
}
write(&value, 1);
}
/**
* @param dest Destination buffer
* @param n Size (in terms of number of elements) of the destination buffer
* @return Number of elements read from the buffer
*/
size_t read(T * dest, size_t n)
{
n = std::min(n, count());
if (n == 0) {
return n;
}
if (wrap) {
wrap = false;
}
const size_t first_chunk = std::min(n, size - begin);
memcpy(dest, buf + begin, first_chunk * sizeof(T));
begin = (begin + first_chunk) % size;
if (first_chunk < n) {
const size_t second_chunk = n - first_chunk;
memcpy(dest + first_chunk, buf + begin, second_chunk * sizeof(T));
begin = (begin + second_chunk) % size;
}
return n;
}
/**
* Return how many elements are in the buffer, O(1).
*
* @return The number of elements in the buffer
*/
size_t count()
{
if (end == begin) {
return wrap ? size : 0;
}
else if (end > begin) {
return end - begin;
}
else {
return size + end - begin;
}
}
/**
* @return The number of slots that are unused in the buffer
*/
size_t getFree()
{
return size - count();
}
/**
* @return The arithmetic mean of the contents of the buffer
*/
T mean()
{
size_t iterator = begin;
T mean = 0;
for (int i=0; i<size; i++) {
iterator = (iterator + size - 1) % size;
mean += *(buf + iterator);
}
return count() ? mean / (T)count() : 0;
}
/**
* @return The sample standard deviation of the contents of the ring buffer
*/
T stddev()
{
size_t iterator = begin;
T cached_mean = mean();
if (size) {
T sum_of_squared_deviations = 0;
for (int i=0; i<size; i++) {
iterator = (iterator + size - 1) % size;
T deviation = (buf[i] - cached_mean);
T sdev = deviation*deviation;
sum_of_squared_deviations += sdev;
}
T variance = sum_of_squared_deviations / (size - 1);
T sd = sqrt(variance);
return sd;
}
return 0;
}
/**
* @return The number of elements of zero value, O(n)
*/
size_t zeroCount()
{
size_t iterator = begin;
size_t zeros = 0;
for (int i=0; i<size; i++) {
iterator = (iterator + size - 1) % size;
if (*(buf + iterator) == 0) {
zeros++;
}
}
return zeros;
}
/**
* @param value Value to match against in buffer
* @return The number of values held in the ring buffer which match a given value
*/
size_t countValue(T value)
{
size_t iterator = begin;
size_t count = 0;
for (int i=0; i<size; i++) {
iterator = (iterator + size - 1) % size;
if (*(buf + iterator) == value) {
count++;
}
}
return count;
}
/**
* Print the contents of the buffer
*/
void dump()
{
size_t iterator = begin;
for (int i=0; i<size; i++) {
iterator = (iterator + size - 1) % size;
if (typeid(T) == typeid(int)) {
// DEBUG_INFO("buf[%2zu]=%2d", iterator, (int)*(buf + iterator));
}
else {
// DEBUG_INFO("buf[%2zu]=%2f", iterator, (float)*(buf + iterator));
}
}
}
};
} // namespace ZeroTier
#endif

View File

@ -646,12 +646,12 @@ bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt)
const SharedPtr<Peer> peer(RR->topology->getPeer(tPtr,destination));
if (peer) {
viaPath = peer->getBestPath(now,false);
viaPath = peer->getAppropriatePath(now,false);
if (!viaPath) {
peer->tryMemorizedPath(tPtr,now); // periodically attempt memorized or statically defined paths, if any are known
const SharedPtr<Peer> relay(RR->topology->getUpstreamPeer());
if ( (!relay) || (!(viaPath = relay->getBestPath(now,false))) ) {
if (!(viaPath = peer->getBestPath(now,true)))
if ( (!relay) || (!(viaPath = relay->getAppropriatePath(now,false))) ) {
if (!(viaPath = peer->getAppropriatePath(now,true)))
return false;
}
}

View File

@ -299,7 +299,7 @@ public:
Address *a = (Address *)0;
SharedPtr<Peer> *p = (SharedPtr<Peer> *)0;
while (i.next(a,p)) {
const SharedPtr<Path> pp((*p)->getBestPath(now,false));
const SharedPtr<Path> pp((*p)->getAppropriatePath(now,false));
if (pp)
++cnt;
}

View File

@ -106,6 +106,26 @@ void Trace::peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,
}
}
void Trace::peerLinkNowRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath)
{
ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is fully redundant",peer.address().toInt(),networkId);
}
void Trace::peerLinkNoLongerRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath)
{
ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is no longer redundant",peer.address().toInt(),networkId);
}
void Trace::peerLinkBalanced(void *const tPtr,const uint64_t networkId,Peer &peer)
{
ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is balanced",peer.address().toInt(),networkId);
}
void Trace::peerLinkImbalanced(void *const tPtr,const uint64_t networkId,Peer &peer)
{
ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is unbalanced",peer.address().toInt(),networkId);
}
void Trace::peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId)
{
char tmp[128];

View File

@ -121,6 +121,12 @@ public:
void resettingPathsInScope(void *const tPtr,const Address &reporter,const InetAddress &reporterPhysicalAddress,const InetAddress &myPhysicalAddress,const InetAddress::IpScope scope);
void peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &path,const uint64_t packetId,const Packet::Verb verb);
void peerLinkNowRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);
void peerLinkNoLongerRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);
void peerLinkBalanced(void *const tPtr,const uint64_t networkId,Peer &peer);
void peerLinkImbalanced(void *const tPtr,const uint64_t networkId,Peer &peer);
void peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId);
void peerRedirected(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);

View File

@ -388,6 +388,7 @@ public:
_bindings[_bindingCount].udpSock = udps;
_bindings[_bindingCount].tcpListenSock = tcps;
_bindings[_bindingCount].address = ii->first;
phy.setIfName(udps, (char*)ii->second.c_str(), ii->second.length());
++_bindingCount;
}
} else {
@ -455,6 +456,20 @@ public:
return false;
}
/**
* Get a list of socket pointers for all bindings.
*
* @return A list of socket pointers for current bindings
*/
inline std::vector<PhySocket*> getBoundSockets()
{
std::vector<PhySocket*> sockets;
for (int i=0; i<ZT_BINDER_MAX_BINDINGS; i++) {
sockets.push_back(_bindings[i].udpSock);
}
return sockets;
}
private:
_Binding _bindings[ZT_BINDER_MAX_BINDINGS];
std::atomic<unsigned int> _bindingCount;

View File

@ -27,6 +27,8 @@
#ifndef ZT_PHY_HPP
#define ZT_PHY_HPP
#include "../osdep/OSUtils.hpp"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -86,6 +88,22 @@ namespace ZeroTier {
*/
typedef void PhySocket;
struct link_test_record
{
link_test_record(PhySocket *_s, uint64_t _id, uint64_t _egress_time, uint32_t _length) :
s(_s),
id(_id),
egress_time(_egress_time),
length(_length)
{
//
}
PhySocket *s;
uint64_t id;
uint64_t egress_time;
uint32_t length;
};
/**
* Simple templated non-blocking sockets implementation
*
@ -154,10 +172,17 @@ private:
struct PhySocketImpl
{
PhySocketImpl() :
throughput(0)
{
memset(ifname, 0, sizeof(ifname));
}
PhySocketType type;
ZT_PHY_SOCKFD_TYPE sock;
void *uptr; // user-settable pointer
ZT_PHY_SOCKADDR_STORAGE_TYPE saddr; // remote for TCP_OUT and TCP_IN, local for TCP_LISTEN, RAW, and UDP
char ifname[16];
uint64_t throughput;
};
std::list<PhySocketImpl> _socks;
@ -173,6 +198,7 @@ private:
bool _noDelay;
bool _noCheck;
std::vector<struct link_test_record*> link_test_records;
public:
/**
@ -249,6 +275,173 @@ public:
*/
static inline void** getuptr(PhySocket *s) throw() { return &(reinterpret_cast<PhySocketImpl *>(s)->uptr); }
/**
* @param s Socket object
* @param nameBuf Buffer to store name of interface which this Socket object is bound to
* @param buflen Length of buffer to copy name into
*/
static inline void getIfName(PhySocket *s, char *nameBuf, int buflen)
{
memcpy(nameBuf, reinterpret_cast<PhySocketImpl *>(s)->ifname, buflen);
}
/**
* @param s Socket object
* @param ifname Buffer containing name of interface that this Socket object is bound to
* @param len Length of name of interface
*/
static inline void setIfName(PhySocket *s, char *ifname, int len)
{
memcpy(&(reinterpret_cast<PhySocketImpl *>(s)->ifname), ifname, len);
}
/**
* Get result of most recent throughput test
*
* @param s Socket object
*/
inline uint64_t getThroughput(PhySocket *s)
{
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
return sws ? sws->throughput : 0;
}
/**
* Whether or not the socket object is in a closed state
*
* @param s Socket object
* @return true if socket is closed, false if otherwise
*/
inline bool isClosed(PhySocket *s)
{
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
return sws->type == ZT_PHY_SOCKET_CLOSED;
}
/**
* Get state of socket object
*
* @param s Socket object
* @return State of socket
*/
inline int getState(PhySocket *s)
{
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
return sws->type;
}
/**
* In the event that this socket is erased, we need a way to convey to the multipath logic
* that this path is no longer valid.
*
* @param s Socket object
* @return Whether the state of this socket is within an acceptable range of values
*/
inline bool isValidState(PhySocket *s)
{
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
return sws->type >= ZT_PHY_SOCKET_CLOSED && sws->type <= ZT_PHY_SOCKET_UNIX_LISTEN;
}
/**
* Send a datagram of a known size to a selected peer and record egress time. The peer
* shall eventually respond by echoing back a smaller datagram.
*
* @param s Socket object
* @param remoteAddress Address of remote peer to receive link test packet
* @param data Buffer containing random packet data
* @param len Length of packet data buffer
* @return Number of bytes successfully written to socket
*/
inline int test_link_speed(PhySocket *s, const struct sockaddr *to, void *data, uint32_t len) {
if (!reinterpret_cast<PhySocketImpl *>(s)) {
return 0;
}
uint64_t *buf = (uint64_t*)data;
uint64_t id = buf[0];
if (to->sa_family != AF_INET && to->sa_family != AF_INET6) {
return 0;
}
uint64_t egress_time = OSUtils::now();
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
#if defined(_WIN32) || defined(_WIN64)
return ((long)::sendto(sws->sock,reinterpret_cast<const char *>(data),len,0,to,(to->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)) == (long)len);
#else
int w = ::sendto(sws->sock,data,len,0,to,(to->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in));
#endif
if (w > 0) {
link_test_records.push_back(new link_test_record(s, id, egress_time, len));
}
return w;
}
/**
* Remove link speed test records which have timed-out and record a 0 bits/s measurement
*/
inline void refresh_link_speed_records()
{
for(int i=0;i<link_test_records.size();i++) {
if(OSUtils::now() - link_test_records[i]->egress_time > ZT_LINK_TEST_TIMEOUT) {
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(link_test_records[i]->s));
if (sws) {
sws->throughput = 0;
}
link_test_records.erase(link_test_records.begin() + i);
}
}
}
/**
* Upon receipt of a link speed test datagram we echo back only the identification portion
*
* @param s Socket object
* @param from Address of remote peer that sent this datagram
* @param data Buffer containing datagram's contents
* @param len Length of datagram
* @return Number of bytes successfully written to socket in response
*/
inline int respond_to_link_test(PhySocket *s,const struct sockaddr *from,void *data,unsigned long len) {
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
uint64_t *id = (uint64_t*)data;
#if defined(_WIN32) || defined(_WIN64)
return ((long)::sendto(sws->sock,reinterpret_cast<const char *>(data),len,0,from,(from->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)) == (long)len);
#else
int w = ::sendto(sws->sock,id,sizeof(id[0]),0,from,(from->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in));
#endif
return w;
}
/**
* Upon receipt of a response to our original link test datagram, correlate this new datagram with the record
* of the one we sent. Compute the transit time and update the throughput field of the relevant socket. This
* value will later be read by the path quality estimation logic located in Path.hpp.
*
* @param s Socket object
* @param from Address of remote peer that sent this datagram
* @param data Buffer containing datagram contents (ID of original link test datagram)
* @param len Length of datagram
* @return true if datagram correponded to previous record, false if otherwise
*/
inline bool handle_link_test_response(PhySocket *s,const struct sockaddr *from,void *data,unsigned long len) {
uint64_t *id = (uint64_t*)data;
for(int i=0;i<link_test_records.size();i++) {
if(link_test_records[i]->id == id[0]) {
float rtt = (OSUtils::now()-link_test_records[i]->egress_time) / (float)1000; // s
uint32_t sz = (link_test_records[i]->length) * 8; // bits
float transit_time = rtt / 2.0;
int64_t raw = sz / transit_time;
PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
if (sws) {
sws->throughput = raw;
}
delete link_test_records[i];
link_test_records.erase(link_test_records.begin() + i);
return true;
}
}
return false;
}
/**
* Cause poll() to stop waiting immediately
*
@ -985,7 +1178,7 @@ public:
ZT_PHY_SOCKFD_TYPE sock = s->sock; // if closed, s->sock becomes invalid as s is no longer dereferencable
if ((FD_ISSET(sock,&wfds))&&(FD_ISSET(sock,&_writefds))) {
try {
_handler->phyOnUnixWritable((PhySocket *)&(*s),&(s->uptr),false);
_handler->phyOnUnixWritable((PhySocket *)&(*s),&(s->uptr));
} catch ( ... ) {}
}
if (FD_ISSET(sock,&rfds)) {

View File

@ -37,6 +37,7 @@
#include "../version.h"
#include "../include/ZeroTierOne.h"
#include "../include/ZeroTierDebug.h"
#include "../node/Constants.hpp"
#include "../node/Mutex.hpp"
@ -417,6 +418,7 @@ public:
PhySocket *_localControlSocket6;
bool _updateAutoApply;
bool _allowTcpFallbackRelay;
unsigned int _multipathMode;
unsigned int _primaryPort;
volatile unsigned int _udpPortPickerCounter;
@ -455,6 +457,9 @@ public:
// Last potential sleep/wake event
uint64_t _lastRestart;
// Last time link throughput was tested
uint64_t _lastLinkSpeedTest;
// Deadline for the next background task service function
volatile int64_t _nextBackgroundTaskDeadline;
@ -818,6 +823,7 @@ public:
_lastRestart = clockShouldBe;
int64_t lastTapMulticastGroupCheck = 0;
int64_t lastBindRefresh = 0;
int64_t lastMultipathModeUpdate = 0;
int64_t lastUpdateCheck = clockShouldBe;
int64_t lastCleanedPeersDb = 0;
int64_t lastLocalInterfaceAddressCheck = (clockShouldBe - ZT_LOCAL_INTERFACE_CHECK_INTERVAL) + 15000; // do this in 15s to give portmapper time to configure and other things time to settle
@ -849,8 +855,9 @@ public:
_updater->apply();
}
// Refresh bindings in case device's interfaces have changed, and also sync routes to update any shadow routes (e.g. shadow default)
if (((now - lastBindRefresh) >= ZT_BINDER_REFRESH_PERIOD)||(restarted)) {
// Refresh bindings
int interfaceRefreshPeriod = _multipathMode ? ZT_MULTIPATH_BINDER_REFRESH_PERIOD : ZT_BINDER_REFRESH_PERIOD;
if (((now - lastBindRefresh) >= interfaceRefreshPeriod)||(restarted)) {
lastBindRefresh = now;
unsigned int p[3];
unsigned int pc = 0;
@ -867,6 +874,34 @@ public:
}
}
}
// Update multipath mode (if needed)
if (((now - lastMultipathModeUpdate) >= interfaceRefreshPeriod)||(restarted)) {
lastMultipathModeUpdate = now;
_node->setMultipathMode(_multipathMode);
}
// Test link speeds
// TODO: This logic should eventually find its way into the core or as part of a passive
// measure within the protocol.
if (_multipathMode && ((now - _lastLinkSpeedTest) >= ZT_LINK_SPEED_TEST_INTERVAL)) {
_phy.refresh_link_speed_records();
_lastLinkSpeedTest = now;
// Generate random data to fill UDP packet
uint64_t pktBuf[ZT_LINK_TEST_DATAGRAM_SZ / sizeof(uint64_t)];
Utils::getSecureRandom(pktBuf, ZT_LINK_TEST_DATAGRAM_SZ);
ZT_PeerList *pl = _node->peers();
// get bindings (specifically just the sockets)
std::vector<PhySocket*> sockets = _binder.getBoundSockets();
// interfaces
for (int i=0; i<ZT_BINDER_MAX_BINDINGS; i++) {
for(int j=0;j<pl->peerCount;++j) {
for (int k=0; k<(ZT_MAX_PEER_NETWORK_PATHS/4); k++) {
Utils::getSecureRandom(pktBuf, 8); // generate one random integer for unique id
_phy.test_link_speed(sockets[i], (struct sockaddr*)&(pl->peers[j].paths[k].address), pktBuf, ZT_LINK_TEST_DATAGRAM_SZ);
}
}
}
}
// Run background task processor in core if it's time to do so
int64_t dl = _nextBackgroundTaskDeadline;
@ -1190,6 +1225,7 @@ public:
json &settings = res["config"]["settings"];
settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay);
settings["multipathMode"] = OSUtils::jsonInt(settings["multipathMode"],_multipathMode);
#ifdef ZT_USE_MINIUPNPC
settings["portMappingEnabled"] = OSUtils::jsonBool(settings["portMappingEnabled"],true);
#else
@ -1518,6 +1554,11 @@ public:
_primaryPort = (unsigned int)OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
_allowTcpFallbackRelay = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],true);
_multipathMode = OSUtils::jsonInt(settings["multipathMode"],0);
if (_multipathMode != 0 && _allowTcpFallbackRelay) {
fprintf(stderr,"WARNING: multipathMode cannot be used with allowTcpFallbackRelay. Disabling allowTcpFallbackRelay");
_allowTcpFallbackRelay = false;
}
_portMappingEnabled = OSUtils::jsonBool(settings["portMappingEnabled"],true);
json &ignoreIfs = settings["interfacePrefixBlacklist"];
@ -1758,6 +1799,15 @@ public:
inline void phyOnDatagram(PhySocket *sock,void **uptr,const struct sockaddr *localAddr,const struct sockaddr *from,void *data,unsigned long len)
{
if (_multipathMode) {
// Handle link test packets (should eventually be moved into the protocol itself)
if (len == ZT_LINK_TEST_DATAGRAM_SZ) {
_phy.respond_to_link_test(sock, from, data, len);
}
if (len == ZT_LINK_TEST_DATAGRAM_RESPONSE_SZ) {
_phy.handle_link_test_response(sock, from, data, len);
}
}
if ((len >= 16)&&(reinterpret_cast<const InetAddress *>(from)->ipScope() == InetAddress::IP_SCOPE_GLOBAL))
_lastDirectReceiveFromGlobal = OSUtils::now();
const ZT_ResultCode rc = _node->processWirePacket(
@ -2007,7 +2057,7 @@ public:
inline void phyOnUnixAccept(PhySocket *sockL,PhySocket *sockN,void **uptrL,void **uptrN) {}
inline void phyOnUnixClose(PhySocket *sock,void **uptr) {}
inline void phyOnUnixData(PhySocket *sock,void **uptr,void *data,unsigned long len) {}
inline void phyOnUnixWritable(PhySocket *sock,void **uptr,bool lwip_invoked) {}
inline void phyOnUnixWritable(PhySocket *sock,void **uptr) {}
inline int nodeVirtualNetworkConfigFunction(uint64_t nwid,void **nuptr,enum ZT_VirtualNetworkConfigOperation op,const ZT_VirtualNetworkConfig *nwc)
{