/* * ZeroTier One - Network Virtualization Everywhere * Copyright (C) 2011-2015 ZeroTier, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * -- * * ZeroTier may be used and distributed under the terms of the GPLv3, which * are available at: http://www.gnu.org/licenses/gpl-3.0.html * * If you would like to embed ZeroTier into a commercial application or * redistribute it in a modified binary form, please contact ZeroTier Networks * LLC. Start here: http://www.zerotier.com/ */ #ifdef ZT_ENABLE_CLUSTER #include #include #include #include #include #include #include #include #include #include #include #include "../version.h" #include "Cluster.hpp" #include "RuntimeEnvironment.hpp" #include "MulticastGroup.hpp" #include "CertificateOfMembership.hpp" #include "Salsa20.hpp" #include "Poly1305.hpp" #include "Identity.hpp" #include "Topology.hpp" #include "Packet.hpp" #include "Switch.hpp" #include "Node.hpp" #include "Array.hpp" /** * Chunk size for allocating queue entries * * Queue entries are allocated in chunks of this many and are added to a pool. * ZT_CLUSTER_MAX_QUEUE_GLOBAL must be evenly divisible by this. */ #define ZT_CLUSTER_QUEUE_CHUNK_SIZE 32 /** * Maximum number of chunks to ever allocate * * This is a global sanity limit to prevent resource exhaustion attacks. It * works out to about 600mb of RAM. You'll never see this on a normal edge * node. We're unlikely to see this on a root server unless someone is DOSing * us. In that case cluster relaying will be affected but other functions * should continue to operate normally. */ #define ZT_CLUSTER_MAX_QUEUE_CHUNKS 8194 /** * Max data per queue entry * * If we ever support larger transport MTUs this must be increased. The plus * 16 is just a small margin and has no special meaning. */ #define ZT_CLUSTER_SEND_QUEUE_DATA_MAX (ZT_UDP_DEFAULT_PAYLOAD_MTU + 16) namespace ZeroTier { static inline double _dist3d(int x1,int y1,int z1,int x2,int y2,int z2) throw() { double dx = ((double)x2 - (double)x1); double dy = ((double)y2 - (double)y1); double dz = ((double)z2 - (double)z1); return sqrt((dx * dx) + (dy * dy) + (dz * dz)); } // An entry in _ClusterSendQueue struct _ClusterSendQueueEntry { uint64_t timestamp; Address fromPeerAddress; Address toPeerAddress; // if we ever support larger transport MTUs this must be increased unsigned char data[ZT_CLUSTER_SEND_QUEUE_DATA_MAX]; unsigned int len; bool unite; }; // A multi-index map with entry memory pooling -- this allows our queue to // be O(log(N)) and is complex enough that it makes the code a lot cleaner // to break it out from Cluster. class _ClusterSendQueue { public: _ClusterSendQueue() : _poolCount(0) {} ~_ClusterSendQueue() {} // memory is automatically freed when _chunks is destroyed inline void enqueue(uint64_t now,const Address &from,const Address &to,const void *data,unsigned int len,bool unite) { if (len > ZT_CLUSTER_SEND_QUEUE_DATA_MAX) return; Mutex::Lock _l(_lock); // Delete oldest queue entry for this sender if this enqueue() would take them over the per-sender limit { std::set< std::pair >::iterator qi(_bySrc.lower_bound(std::pair(from,(_ClusterSendQueueEntry *)0))); std::set< std::pair >::iterator oldest(qi); unsigned long countForSender = 0; while ((qi != _bySrc.end())&&(qi->first == from)) { if (qi->second->timestamp < oldest->second->timestamp) oldest = qi; ++countForSender; ++qi; } if (countForSender >= ZT_CLUSTER_MAX_QUEUE_PER_SENDER) { _byDest.erase(std::pair(oldest->second->toPeerAddress,oldest->second)); _pool[_poolCount++] = oldest->second; _bySrc.erase(oldest); } } _ClusterSendQueueEntry *e; if (_poolCount > 0) { e = _pool[--_poolCount]; } else { if (_chunks.size() >= ZT_CLUSTER_MAX_QUEUE_CHUNKS) return; // queue is totally full! _chunks.push_back(Array<_ClusterSendQueueEntry,ZT_CLUSTER_QUEUE_CHUNK_SIZE>()); e = &(_chunks.back().data[0]); for(unsigned int i=1;itimestamp = now; e->fromPeerAddress = from; e->toPeerAddress = to; memcpy(e->data,data,len); e->len = len; e->unite = unite; _bySrc.insert(std::pair(from,e)); _byDest.insert(std::pair(to,e)); } inline void expire(uint64_t now) { Mutex::Lock _l(_lock); for(std::set< std::pair >::iterator qi(_bySrc.begin());qi!=_bySrc.end();) { if ((now - qi->second->timestamp) > ZT_CLUSTER_QUEUE_EXPIRATION) { _byDest.erase(std::pair(qi->second->toPeerAddress,qi->second)); _pool[_poolCount++] = qi->second; _bySrc.erase(qi++); } else ++qi; } } /** * Get and dequeue entries for a given destination address * * After use these entries must be returned with returnToPool()! * * @param dest Destination address * @param results Array to fill with results * @param maxResults Size of results[] in pointers * @return Number of actual results returned */ inline unsigned int getByDest(const Address &dest,_ClusterSendQueueEntry **results,unsigned int maxResults) { unsigned int count = 0; Mutex::Lock _l(_lock); std::set< std::pair >::iterator qi(_byDest.lower_bound(std::pair(dest,(_ClusterSendQueueEntry *)0))); while ((qi != _byDest.end())&&(qi->first == dest)) { _bySrc.erase(std::pair(qi->second->fromPeerAddress,qi->second)); results[count++] = qi->second; if (count == maxResults) break; _byDest.erase(qi++); } return count; } /** * Return entries to pool after use * * @param entries Array of entries * @param count Number of entries */ inline void returnToPool(_ClusterSendQueueEntry **entries,unsigned int count) { Mutex::Lock _l(_lock); for(unsigned int i=0;i > _chunks; _ClusterSendQueueEntry *_pool[ZT_CLUSTER_QUEUE_CHUNK_SIZE * ZT_CLUSTER_MAX_QUEUE_CHUNKS]; unsigned long _poolCount; std::set< std::pair > _bySrc; std::set< std::pair > _byDest; Mutex _lock; }; Cluster::Cluster( const RuntimeEnvironment *renv, uint16_t id, const std::vector &zeroTierPhysicalEndpoints, int32_t x, int32_t y, int32_t z, void (*sendFunction)(void *,unsigned int,const void *,unsigned int), void *sendFunctionArg, int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *), void *addressToLocationFunctionArg) : RR(renv), _sendQueue(new _ClusterSendQueue()), _sendFunction(sendFunction), _sendFunctionArg(sendFunctionArg), _addressToLocationFunction(addressToLocationFunction), _addressToLocationFunctionArg(addressToLocationFunctionArg), _x(x), _y(y), _z(z), _id(id), _zeroTierPhysicalEndpoints(zeroTierPhysicalEndpoints), _members(new _Member[ZT_CLUSTER_MAX_MEMBERS]), _lastFlushed(0), _lastCleanedRemotePeers(0), _lastCleanedQueue(0) { uint16_t stmp[ZT_SHA512_DIGEST_LEN / sizeof(uint16_t)]; // Generate master secret by hashing the secret from our Identity key pair RR->identity.sha512PrivateKey(_masterSecret); // Generate our inbound message key, which is the master secret XORed with our ID and hashed twice memcpy(stmp,_masterSecret,sizeof(stmp)); stmp[0] ^= Utils::hton(id); SHA512::hash(stmp,stmp,sizeof(stmp)); SHA512::hash(stmp,stmp,sizeof(stmp)); memcpy(_key,stmp,sizeof(_key)); Utils::burn(stmp,sizeof(stmp)); } Cluster::~Cluster() { Utils::burn(_masterSecret,sizeof(_masterSecret)); Utils::burn(_key,sizeof(_key)); delete [] _members; delete _sendQueue; } void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len) { Buffer dmsg; { // FORMAT: <[16] iv><[8] MAC><... data> if ((len < 24)||(len > ZT_CLUSTER_MAX_MESSAGE_LENGTH)) return; // 16-byte IV: first 8 bytes XORed with key, last 8 bytes used as Salsa20 64-bit IV char keytmp[32]; memcpy(keytmp,_key,32); for(int i=0;i<8;++i) keytmp[i] ^= reinterpret_cast(msg)[i]; Salsa20 s20(keytmp,256,reinterpret_cast(msg) + 8); Utils::burn(keytmp,sizeof(keytmp)); // One-time-use Poly1305 key from first 32 bytes of Salsa20 keystream (as per DJB/NaCl "standard") char polykey[ZT_POLY1305_KEY_LEN]; memset(polykey,0,sizeof(polykey)); s20.encrypt12(polykey,polykey,sizeof(polykey)); // Compute 16-byte MAC char mac[ZT_POLY1305_MAC_LEN]; Poly1305::compute(mac,reinterpret_cast(msg) + 24,len - 24,polykey); // Check first 8 bytes of MAC against 64-bit MAC in stream if (!Utils::secureEq(mac,reinterpret_cast(msg) + 16,8)) return; // Decrypt! dmsg.setSize(len - 24); s20.decrypt12(reinterpret_cast(msg) + 24,const_cast(dmsg.data()),dmsg.size()); } if (dmsg.size() < 4) return; const uint16_t fromMemberId = dmsg.at(0); unsigned int ptr = 2; if (fromMemberId == _id) // sanity check: we don't talk to ourselves return; const uint16_t toMemberId = dmsg.at(ptr); ptr += 2; if (toMemberId != _id) // sanity check: message not for us? return; { // make sure sender is actually considered a member Mutex::Lock _l3(_memberIds_m); if (std::find(_memberIds.begin(),_memberIds.end(),fromMemberId) == _memberIds.end()) return; } try { while (ptr < dmsg.size()) { const unsigned int mlen = dmsg.at(ptr); ptr += 2; const unsigned int nextPtr = ptr + mlen; if (nextPtr > dmsg.size()) break; int mtype = -1; try { switch((StateMessageType)(mtype = (int)dmsg[ptr++])) { default: break; case CLUSTER_MESSAGE_ALIVE: { _Member &m = _members[fromMemberId]; Mutex::Lock mlck(m.lock); ptr += 7; // skip version stuff, not used yet m.x = dmsg.at(ptr); ptr += 4; m.y = dmsg.at(ptr); ptr += 4; m.z = dmsg.at(ptr); ptr += 4; ptr += 8; // skip local clock, not used m.load = dmsg.at(ptr); ptr += 8; m.peers = dmsg.at(ptr); ptr += 8; ptr += 8; // skip flags, unused #ifdef ZT_TRACE std::string addrs; #endif unsigned int physicalAddressCount = dmsg[ptr++]; m.zeroTierPhysicalEndpoints.clear(); for(unsigned int i=0;i 0) addrs.push_back(','); addrs.append(m.zeroTierPhysicalEndpoints.back().toString()); } #endif } #ifdef ZT_TRACE if ((RR->node->now() - m.lastReceivedAliveAnnouncement) >= ZT_CLUSTER_TIMEOUT) { TRACE("[%u] I'm alive! peers close to %d,%d,%d can be redirected to: %s",(unsigned int)fromMemberId,m.x,m.y,m.z,addrs.c_str()); } #endif m.lastReceivedAliveAnnouncement = RR->node->now(); } break; case CLUSTER_MESSAGE_HAVE_PEER: { Identity id; ptr += id.deserialize(dmsg,ptr); if (id) { RR->topology->saveIdentity(id); { Mutex::Lock _l(_remotePeers_m); _remotePeers[std::pair(id.address(),(unsigned int)fromMemberId)] = RR->node->now(); } _ClusterSendQueueEntry *q[16384]; // 16384 is "tons" unsigned int qc = _sendQueue->getByDest(id.address(),q,16384); for(unsigned int i=0;isendViaCluster(q[i]->fromPeerAddress,q[i]->toPeerAddress,q[i]->data,q[i]->len,q[i]->unite); _sendQueue->returnToPool(q,qc); TRACE("[%u] has %s (retried %u queued sends)",(unsigned int)fromMemberId,id.address().toString().c_str(),qc); } } break; case CLUSTER_MESSAGE_WANT_PEER: { const Address zeroTierAddress(dmsg.field(ptr,ZT_ADDRESS_LENGTH),ZT_ADDRESS_LENGTH); ptr += ZT_ADDRESS_LENGTH; SharedPtr peer(RR->topology->getPeerNoCache(zeroTierAddress)); if ( (peer) && (peer->hasClusterOptimalPath(RR->node->now())) ) { Buffer<1024> buf; peer->identity().serialize(buf); Mutex::Lock _l2(_members[fromMemberId].lock); _send(fromMemberId,CLUSTER_MESSAGE_HAVE_PEER,buf.data(),buf.size()); } } break; case CLUSTER_MESSAGE_REMOTE_PACKET: { const unsigned int plen = dmsg.at(ptr); ptr += 2; if (plen) { Packet remotep(dmsg.field(ptr,plen),plen); ptr += plen; //TRACE("remote %s from %s via %u (%u bytes)",Packet::verbString(remotep.verb()),remotep.source().toString().c_str(),fromMemberId,plen); switch(remotep.verb()) { case Packet::VERB_WHOIS: _doREMOTE_WHOIS(fromMemberId,remotep); break; case Packet::VERB_MULTICAST_GATHER: _doREMOTE_MULTICAST_GATHER(fromMemberId,remotep); break; default: break; // ignore things we don't care about across cluster } } } break; case CLUSTER_MESSAGE_PROXY_UNITE: { const Address localPeerAddress(dmsg.field(ptr,ZT_ADDRESS_LENGTH),ZT_ADDRESS_LENGTH); ptr += ZT_ADDRESS_LENGTH; const Address remotePeerAddress(dmsg.field(ptr,ZT_ADDRESS_LENGTH),ZT_ADDRESS_LENGTH); ptr += ZT_ADDRESS_LENGTH; const unsigned int numRemotePeerPaths = dmsg[ptr++]; InetAddress remotePeerPaths[256]; // size is 8-bit, so 256 is max for(unsigned int i=0;inode->now(); SharedPtr localPeer(RR->topology->getPeerNoCache(localPeerAddress)); if ((localPeer)&&(numRemotePeerPaths > 0)) { InetAddress bestLocalV4,bestLocalV6; localPeer->getBestActiveAddresses(now,bestLocalV4,bestLocalV6); InetAddress bestRemoteV4,bestRemoteV6; for(unsigned int i=0;iidentity.address(),Packet::VERB_RENDEZVOUS); rendezvousForLocal.append((uint8_t)0); remotePeerAddress.appendTo(rendezvousForLocal); Buffer<2048> rendezvousForRemote; remotePeerAddress.appendTo(rendezvousForRemote); rendezvousForRemote.append((uint8_t)Packet::VERB_RENDEZVOUS); rendezvousForRemote.addSize(2); // space for actual packet payload length rendezvousForRemote.append((uint8_t)0); // flags == 0 localPeerAddress.appendTo(rendezvousForRemote); bool haveMatch = false; if ((bestLocalV6)&&(bestRemoteV6)) { haveMatch = true; rendezvousForLocal.append((uint16_t)bestRemoteV6.port()); rendezvousForLocal.append((uint8_t)16); rendezvousForLocal.append(bestRemoteV6.rawIpData(),16); rendezvousForRemote.append((uint16_t)bestLocalV6.port()); rendezvousForRemote.append((uint8_t)16); rendezvousForRemote.append(bestLocalV6.rawIpData(),16); rendezvousForRemote.setAt(ZT_ADDRESS_LENGTH + 1,(uint16_t)(9 + 16)); } else if ((bestLocalV4)&&(bestRemoteV4)) { haveMatch = true; rendezvousForLocal.append((uint16_t)bestRemoteV4.port()); rendezvousForLocal.append((uint8_t)4); rendezvousForLocal.append(bestRemoteV4.rawIpData(),4); rendezvousForRemote.append((uint16_t)bestLocalV4.port()); rendezvousForRemote.append((uint8_t)4); rendezvousForRemote.append(bestLocalV4.rawIpData(),4); rendezvousForRemote.setAt(ZT_ADDRESS_LENGTH + 1,(uint16_t)(9 + 4)); } if (haveMatch) { { Mutex::Lock _l2(_members[fromMemberId].lock); _send(fromMemberId,CLUSTER_MESSAGE_PROXY_SEND,rendezvousForRemote.data(),rendezvousForRemote.size()); } RR->sw->send(rendezvousForLocal,true,0); } } } break; case CLUSTER_MESSAGE_PROXY_SEND: { const Address rcpt(dmsg.field(ptr,ZT_ADDRESS_LENGTH),ZT_ADDRESS_LENGTH); ptr += ZT_ADDRESS_LENGTH; const Packet::Verb verb = (Packet::Verb)dmsg[ptr++]; const unsigned int len = dmsg.at(ptr); ptr += 2; Packet outp(rcpt,RR->identity.address(),verb); outp.append(dmsg.field(ptr,len),len); ptr += len; RR->sw->send(outp,true,0); //TRACE("[%u] proxy send %s to %s length %u",(unsigned int)fromMemberId,Packet::verbString(verb),rcpt.toString().c_str(),len); } break; } } catch ( ... ) { TRACE("invalid message of size %u type %d (inner decode), discarding",mlen,mtype); // drop invalids } ptr = nextPtr; } } catch ( ... ) { TRACE("invalid message (outer loop), discarding"); // drop invalids } } void Cluster::broadcastHavePeer(const Identity &id) { Buffer<1024> buf; id.serialize(buf); Mutex::Lock _l(_memberIds_m); for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { Mutex::Lock _l2(_members[*mid].lock); _send(*mid,CLUSTER_MESSAGE_HAVE_PEER,buf.data(),buf.size()); } } void Cluster::sendViaCluster(const Address &fromPeerAddress,const Address &toPeerAddress,const void *data,unsigned int len,bool unite) { if (len > ZT_PROTO_MAX_PACKET_LENGTH) // sanity check return; const uint64_t now = RR->node->now(); uint64_t mostRecentTs = 0; unsigned int mostRecentMemberId = 0xffffffff; { Mutex::Lock _l2(_remotePeers_m); std::map< std::pair,uint64_t >::const_iterator rpe(_remotePeers.lower_bound(std::pair(toPeerAddress,0))); for(;;) { if ((rpe == _remotePeers.end())||(rpe->first.first != toPeerAddress)) break; else if (rpe->second > mostRecentTs) { mostRecentTs = rpe->second; mostRecentMemberId = rpe->first.second; } ++rpe; } } const uint64_t age = now - mostRecentTs; if (age >= (ZT_PEER_ACTIVITY_TIMEOUT / 3)) { const bool enqueueAndWait = ((age >= ZT_PEER_ACTIVITY_TIMEOUT)||(mostRecentMemberId > 0xffff)); // Poll everyone with WANT_PEER if the age of our most recent entry is // approaching expiration (or has expired, or does not exist). char tmp[ZT_ADDRESS_LENGTH]; toPeerAddress.copyTo(tmp,ZT_ADDRESS_LENGTH); { Mutex::Lock _l(_memberIds_m); for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { Mutex::Lock _l2(_members[*mid].lock); _send(*mid,CLUSTER_MESSAGE_WANT_PEER,tmp,ZT_ADDRESS_LENGTH); } } // If there isn't a good place to send via, then enqueue this for retrying // later and return after having broadcasted a WANT_PEER. if (enqueueAndWait) { TRACE("sendViaCluster %s -> %s enqueueing to wait for HAVE_PEER",fromPeerAddress.toString().c_str(),toPeerAddress.toString().c_str()); _sendQueue->enqueue(now,fromPeerAddress,toPeerAddress,data,len,unite); return; } } Buffer<1024> buf; if (unite) { InetAddress v4,v6; if (fromPeerAddress) { SharedPtr fromPeer(RR->topology->getPeerNoCache(fromPeerAddress)); if (fromPeer) fromPeer->getBestActiveAddresses(now,v4,v6); } uint8_t addrCount = 0; if (v4) ++addrCount; if (v6) ++addrCount; if (addrCount) { toPeerAddress.appendTo(buf); fromPeerAddress.appendTo(buf); buf.append(addrCount); if (v4) v4.serialize(buf); if (v6) v6.serialize(buf); } } { Mutex::Lock _l2(_members[mostRecentMemberId].lock); if (buf.size() > 0) _send(mostRecentMemberId,CLUSTER_MESSAGE_PROXY_UNITE,buf.data(),buf.size()); if (_members[mostRecentMemberId].zeroTierPhysicalEndpoints.size() > 0) { TRACE("sendViaCluster relaying %u bytes from %s to %s by way of %u",len,fromPeerAddress.toString().c_str(),toPeerAddress.toString().c_str(),(unsigned int)mostRecentMemberId); RR->node->putPacket(InetAddress(),_members[mostRecentMemberId].zeroTierPhysicalEndpoints.front(),data,len); } } } void Cluster::sendDistributedQuery(const Packet &pkt) { Buffer<4096> buf; buf.append((uint16_t)pkt.size()); buf.append(pkt.data(),pkt.size()); Mutex::Lock _l(_memberIds_m); for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { Mutex::Lock _l2(_members[*mid].lock); _send(*mid,CLUSTER_MESSAGE_REMOTE_PACKET,buf.data(),buf.size()); } } void Cluster::doPeriodicTasks() { const uint64_t now = RR->node->now(); if ((now - _lastFlushed) >= ZT_CLUSTER_FLUSH_PERIOD) { _lastFlushed = now; Mutex::Lock _l(_memberIds_m); for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { Mutex::Lock _l2(_members[*mid].lock); if ((now - _members[*mid].lastAnnouncedAliveTo) >= ((ZT_CLUSTER_TIMEOUT / 2) - 1000)) { _members[*mid].lastAnnouncedAliveTo = now; Buffer<2048> alive; alive.append((uint16_t)ZEROTIER_ONE_VERSION_MAJOR); alive.append((uint16_t)ZEROTIER_ONE_VERSION_MINOR); alive.append((uint16_t)ZEROTIER_ONE_VERSION_REVISION); alive.append((uint8_t)ZT_PROTO_VERSION); if (_addressToLocationFunction) { alive.append((int32_t)_x); alive.append((int32_t)_y); alive.append((int32_t)_z); } else { alive.append((int32_t)0); alive.append((int32_t)0); alive.append((int32_t)0); } alive.append((uint64_t)now); alive.append((uint64_t)0); // TODO: compute and send load average alive.append((uint64_t)RR->topology->countActive()); alive.append((uint64_t)0); // unused/reserved flags alive.append((uint8_t)_zeroTierPhysicalEndpoints.size()); for(std::vector::const_iterator pe(_zeroTierPhysicalEndpoints.begin());pe!=_zeroTierPhysicalEndpoints.end();++pe) pe->serialize(alive); _send(*mid,CLUSTER_MESSAGE_ALIVE,alive.data(),alive.size()); } _flush(*mid); } } if ((now - _lastCleanedRemotePeers) >= (ZT_PEER_ACTIVITY_TIMEOUT * 2)) { _lastCleanedRemotePeers = now; Mutex::Lock _l(_remotePeers_m); for(std::map< std::pair,uint64_t >::iterator rp(_remotePeers.begin());rp!=_remotePeers.end();) { if ((now - rp->second) >= ZT_PEER_ACTIVITY_TIMEOUT) _remotePeers.erase(rp++); else ++rp; } } if ((now - _lastCleanedQueue) >= ZT_CLUSTER_QUEUE_EXPIRATION) { _lastCleanedQueue = now; _sendQueue->expire(now); } } void Cluster::addMember(uint16_t memberId) { if ((memberId >= ZT_CLUSTER_MAX_MEMBERS)||(memberId == _id)) return; Mutex::Lock _l2(_members[memberId].lock); { Mutex::Lock _l(_memberIds_m); if (std::find(_memberIds.begin(),_memberIds.end(),memberId) != _memberIds.end()) return; _memberIds.push_back(memberId); std::sort(_memberIds.begin(),_memberIds.end()); } _members[memberId].clear(); // Generate this member's message key from the master and its ID uint16_t stmp[ZT_SHA512_DIGEST_LEN / sizeof(uint16_t)]; memcpy(stmp,_masterSecret,sizeof(stmp)); stmp[0] ^= Utils::hton(memberId); SHA512::hash(stmp,stmp,sizeof(stmp)); SHA512::hash(stmp,stmp,sizeof(stmp)); memcpy(_members[memberId].key,stmp,sizeof(_members[memberId].key)); Utils::burn(stmp,sizeof(stmp)); // Prepare q _members[memberId].q.clear(); char iv[16]; Utils::getSecureRandom(iv,16); _members[memberId].q.append(iv,16); _members[memberId].q.addSize(8); // room for MAC _members[memberId].q.append((uint16_t)_id); _members[memberId].q.append((uint16_t)memberId); } void Cluster::removeMember(uint16_t memberId) { Mutex::Lock _l(_memberIds_m); std::vector newMemberIds; for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { if (*mid != memberId) newMemberIds.push_back(*mid); } _memberIds = newMemberIds; } bool Cluster::findBetterEndpoint(InetAddress &redirectTo,const Address &peerAddress,const InetAddress &peerPhysicalAddress,bool offload) { if (_addressToLocationFunction) { // Pick based on location if it can be determined int px = 0,py = 0,pz = 0; if (_addressToLocationFunction(_addressToLocationFunctionArg,reinterpret_cast(&peerPhysicalAddress),&px,&py,&pz) == 0) { TRACE("no geolocation data for %s (geo-lookup is lazy/async so it may work next time)",peerPhysicalAddress.toIpString().c_str()); return false; } // Find member closest to this peer const uint64_t now = RR->node->now(); std::vector best; const double currentDistance = _dist3d(_x,_y,_z,px,py,pz); double bestDistance = (offload ? 2147483648.0 : currentDistance); unsigned int bestMember = _id; { Mutex::Lock _l(_memberIds_m); for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { _Member &m = _members[*mid]; Mutex::Lock _ml(m.lock); // Consider member if it's alive and has sent us a location and one or more physical endpoints to send peers to if ( ((now - m.lastReceivedAliveAnnouncement) < ZT_CLUSTER_TIMEOUT) && ((m.x != 0)||(m.y != 0)||(m.z != 0)) && (m.zeroTierPhysicalEndpoints.size() > 0) ) { const double mdist = _dist3d(m.x,m.y,m.z,px,py,pz); if (mdist < bestDistance) { bestDistance = mdist; bestMember = *mid; best = m.zeroTierPhysicalEndpoints; } } } } // Redirect to a closer member if it has a ZeroTier endpoint address in the same ss_family for(std::vector::const_iterator a(best.begin());a!=best.end();++a) { if (a->ss_family == peerPhysicalAddress.ss_family) { TRACE("%s at [%d,%d,%d] is %f from us but %f from %u, can redirect to %s",peerAddress.toString().c_str(),px,py,pz,currentDistance,bestDistance,bestMember,a->toString().c_str()); redirectTo = *a; return true; } } TRACE("%s at [%d,%d,%d] is %f from us, no better endpoints found",peerAddress.toString().c_str(),px,py,pz,currentDistance); return false; } else { // TODO: pick based on load if no location info? return false; } } void Cluster::status(ZT_ClusterStatus &status) const { const uint64_t now = RR->node->now(); memset(&status,0,sizeof(ZT_ClusterStatus)); status.myId = _id; { ZT_ClusterMemberStatus *const s = &(status.members[status.clusterSize++]); s->id = _id; s->alive = 1; s->x = _x; s->y = _y; s->z = _z; s->load = 0; // TODO s->peers = RR->topology->countActive(); for(std::vector::const_iterator ep(_zeroTierPhysicalEndpoints.begin());ep!=_zeroTierPhysicalEndpoints.end();++ep) { if (s->numZeroTierPhysicalEndpoints >= ZT_CLUSTER_MAX_ZT_PHYSICAL_ADDRESSES) // sanity check break; memcpy(&(s->zeroTierPhysicalEndpoints[s->numZeroTierPhysicalEndpoints++]),&(*ep),sizeof(struct sockaddr_storage)); } } { Mutex::Lock _l1(_memberIds_m); for(std::vector::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) { if (status.clusterSize >= ZT_CLUSTER_MAX_MEMBERS) // sanity check break; _Member &m = _members[*mid]; Mutex::Lock ml(m.lock); ZT_ClusterMemberStatus *const s = &(status.members[status.clusterSize++]); s->id = *mid; s->msSinceLastHeartbeat = (unsigned int)std::min((uint64_t)(~((unsigned int)0)),(now - m.lastReceivedAliveAnnouncement)); s->alive = (s->msSinceLastHeartbeat < ZT_CLUSTER_TIMEOUT) ? 1 : 0; s->x = m.x; s->y = m.y; s->z = m.z; s->load = m.load; s->peers = m.peers; for(std::vector::const_iterator ep(m.zeroTierPhysicalEndpoints.begin());ep!=m.zeroTierPhysicalEndpoints.end();++ep) { if (s->numZeroTierPhysicalEndpoints >= ZT_CLUSTER_MAX_ZT_PHYSICAL_ADDRESSES) // sanity check break; memcpy(&(s->zeroTierPhysicalEndpoints[s->numZeroTierPhysicalEndpoints++]),&(*ep),sizeof(struct sockaddr_storage)); } } } } void Cluster::_send(uint16_t memberId,StateMessageType type,const void *msg,unsigned int len) { if ((len + 3) > (ZT_CLUSTER_MAX_MESSAGE_LENGTH - (24 + 2 + 2))) // sanity check return; _Member &m = _members[memberId]; // assumes m.lock is locked! if ((m.q.size() + len + 3) > ZT_CLUSTER_MAX_MESSAGE_LENGTH) _flush(memberId); m.q.append((uint16_t)(len + 1)); m.q.append((uint8_t)type); m.q.append(msg,len); } void Cluster::_flush(uint16_t memberId) { _Member &m = _members[memberId]; // assumes m.lock is locked! if (m.q.size() > (24 + 2 + 2)) { // 16-byte IV + 8-byte MAC + 2 byte from-member-ID + 2 byte to-member-ID // Create key from member's key and IV char keytmp[32]; memcpy(keytmp,m.key,32); for(int i=0;i<8;++i) keytmp[i] ^= m.q[i]; Salsa20 s20(keytmp,256,m.q.field(8,8)); Utils::burn(keytmp,sizeof(keytmp)); // One-time-use Poly1305 key from first 32 bytes of Salsa20 keystream (as per DJB/NaCl "standard") char polykey[ZT_POLY1305_KEY_LEN]; memset(polykey,0,sizeof(polykey)); s20.encrypt12(polykey,polykey,sizeof(polykey)); // Encrypt m.q in place s20.encrypt12(reinterpret_cast(m.q.data()) + 24,const_cast(reinterpret_cast(m.q.data())) + 24,m.q.size() - 24); // Add MAC for authentication (encrypt-then-MAC) char mac[ZT_POLY1305_MAC_LEN]; Poly1305::compute(mac,reinterpret_cast(m.q.data()) + 24,m.q.size() - 24,polykey); memcpy(m.q.field(16,8),mac,8); // Send! _sendFunction(_sendFunctionArg,memberId,m.q.data(),m.q.size()); // Prepare for more m.q.clear(); char iv[16]; Utils::getSecureRandom(iv,16); m.q.append(iv,16); m.q.addSize(8); // room for MAC m.q.append((uint16_t)_id); // from member ID m.q.append((uint16_t)memberId); // to member ID } } void Cluster::_doREMOTE_WHOIS(uint64_t fromMemberId,const Packet &remotep) { if (remotep.payloadLength() >= ZT_ADDRESS_LENGTH) { Identity queried(RR->topology->getIdentity(Address(remotep.payload(),ZT_ADDRESS_LENGTH))); if (queried) { Buffer<1024> routp; remotep.source().appendTo(routp); routp.append((uint8_t)Packet::VERB_OK); routp.addSize(2); // space for length routp.append((uint8_t)Packet::VERB_WHOIS); routp.append(remotep.packetId()); queried.serialize(routp); routp.setAt(ZT_ADDRESS_LENGTH + 1,(uint16_t)(routp.size() - ZT_ADDRESS_LENGTH - 3)); TRACE("responding to remote WHOIS from %s @ %u with identity of %s",remotep.source().toString().c_str(),(unsigned int)fromMemberId,queried.address().toString().c_str()); Mutex::Lock _l2(_members[fromMemberId].lock); _send(fromMemberId,CLUSTER_MESSAGE_PROXY_SEND,routp.data(),routp.size()); } } } void Cluster::_doREMOTE_MULTICAST_GATHER(uint64_t fromMemberId,const Packet &remotep) { const uint64_t nwid = remotep.at(ZT_PROTO_VERB_MULTICAST_GATHER_IDX_NETWORK_ID); const MulticastGroup mg(MAC(remotep.field(ZT_PROTO_VERB_MULTICAST_GATHER_IDX_MAC,6),6),remotep.at(ZT_PROTO_VERB_MULTICAST_GATHER_IDX_ADI)); unsigned int gatherLimit = remotep.at(ZT_PROTO_VERB_MULTICAST_GATHER_IDX_GATHER_LIMIT); const Address remotePeerAddress(remotep.source()); if (gatherLimit) { Buffer routp; remotePeerAddress.appendTo(routp); routp.append((uint8_t)Packet::VERB_OK); routp.addSize(2); // space for length routp.append((uint8_t)Packet::VERB_MULTICAST_GATHER); routp.append(remotep.packetId()); routp.append(nwid); mg.mac().appendTo(routp); routp.append((uint32_t)mg.adi()); if (gatherLimit > ((ZT_CLUSTER_MAX_MESSAGE_LENGTH - 80) / 5)) gatherLimit = ((ZT_CLUSTER_MAX_MESSAGE_LENGTH - 80) / 5); if (RR->mc->gather(remotePeerAddress,nwid,mg,routp,gatherLimit)) { routp.setAt(ZT_ADDRESS_LENGTH + 1,(uint16_t)(routp.size() - ZT_ADDRESS_LENGTH - 3)); TRACE("responding to remote MULTICAST_GATHER from %s @ %u with %u bytes",remotePeerAddress.toString().c_str(),(unsigned int)fromMemberId,routp.size()); Mutex::Lock _l2(_members[fromMemberId].lock); _send(fromMemberId,CLUSTER_MESSAGE_PROXY_SEND,routp.data(),routp.size()); } } } } // namespace ZeroTier #endif // ZT_ENABLE_CLUSTER