From f19c3c51d3ca2bc886a9125aa9b187aa794b1676 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Thu, 2 Jul 2015 09:00:00 -0700 Subject: [PATCH 1/3] Revert slow non-SSE Salsa20 modification since it did not fix Android/ARM issue. Also update Salsa20 comments and clean up a bit. --- node/Salsa20.cpp | 44 +++++++++++++++++++++++--------------------- node/Salsa20.hpp | 2 +- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/node/Salsa20.cpp b/node/Salsa20.cpp index 2eb683810..ae8e18023 100644 --- a/node/Salsa20.cpp +++ b/node/Salsa20.cpp @@ -1,51 +1,53 @@ /* * Based on public domain code available at: http://cr.yp.to/snuffle.html * - * This therefore is public domain. + * Modifications and C-native SSE macro based SSE implementation by + * Adam Ierymenko . + * + * Since the original was public domain, this is too. */ -#include "Salsa20.hpp" #include "Constants.hpp" +#include "Salsa20.hpp" #define ROTATE(v,c) (((v) << (c)) | ((v) >> (32 - (c)))) #define XOR(v,w) ((v) ^ (w)) #define PLUS(v,w) ((uint32_t)((v) + (w))) +// Set up load/store macros with appropriate endianness (we don't use these in SSE mode) #ifndef ZT_SALSA20_SSE #if __BYTE_ORDER == __LITTLE_ENDIAN -/* We have a slower version of these macros for CPU/compiler combos that - * do not allow unaligned access to a uint32_t. Another solution would be - * to methodically require alignment across the code, but this is quicker - * for now. The culprit appears to be some Android-based ARM devices. 
*/ -#if 1 -#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) ) -static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) -{ - c[0] = (uint8_t)v; - c[1] = (uint8_t)(v >> 8); - c[2] = (uint8_t)(v >> 16); - c[3] = (uint8_t)(v >> 24); -} -#else +// Slow version that does not use type punning +//#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) ) +//static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); } + +// Fast version that just does 32-bit load/store #define U8TO32_LITTLE(p) (*((const uint32_t *)((const void *)(p)))) #define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = (v) -#endif -#else // big endian +#else // __BYTE_ORDER == __BIG_ENDIAN (we don't support anything else... does MIDDLE_ENDIAN even still exist?) #ifdef __GNUC__ + +// Use GNUC builtin bswap macros on big-endian machines if available #define U8TO32_LITTLE(p) __builtin_bswap32(*((const uint32_t *)((const void *)(p)))) #define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = __builtin_bswap32((v)) -#else // no bswap stuff... need to do it manually? -error need be; + +#else // no __GNUC__ + +// Otherwise do it the slow, manual way on BE machines +#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) ) +static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); } + #endif // __GNUC__ or not -#endif // little/big endian +#endif // __BYTE_ORDER little or big? 
#endif // !ZT_SALSA20_SSE +// Statically compute and define SSE constants #ifdef ZT_SALSA20_SSE class _s20sseconsts { diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp index 9631a6dba..3bb041ac4 100644 --- a/node/Salsa20.hpp +++ b/node/Salsa20.hpp @@ -78,7 +78,7 @@ public: } private: - volatile union { + union { #ifdef ZT_SALSA20_SSE __m128i v[4]; #endif // ZT_SALSA20_SSE From cfdf4e3a4928952124118be13f00af2528ab8b04 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Thu, 2 Jul 2015 09:13:56 -0700 Subject: [PATCH 2/3] GitHub issue #171 -- separate CFLAGS and CXXFLAGS in Linux makefile --- make-linux.mk | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/make-linux.mk b/make-linux.mk index cc8021d03..6881926b0 100644 --- a/make-linux.mk +++ b/make-linux.mk @@ -1,4 +1,23 @@ -# Pick clang or gcc, with preference for clang +# +# Makefile for ZeroTier One on Linux +# +# This is confirmed to work on distributions newer than CentOS 6 (the +# one used for reference builds) and on 32 and 64 bit x86 and ARM +# machines. It should also work on other 'normal' machines and recent +# distributions. Editing might be required for tiny devices or weird +# distros. +# +# Targets +# one: zerotier-one and symlinks (cli and idtool) +# all: builds 'one' +# selftest: zerotier-selftest +# debug: builds 'one' and 'selftest' with tracing and debug flags +# installer: ZeroTierOneInstaller-... 
and packages (if possible) +# official: builds 'one' and 'installer' +# clean: removes all built files, objects, other trash +# + +# Automagically pick clang or gcc, with preference for clang CC=$(shell if [ -e /usr/bin/clang ]; then echo clang; else echo gcc; fi) CXX=$(shell if [ -e /usr/bin/clang++ ]; then echo clang++; else echo g++; fi) @@ -25,24 +44,25 @@ endif ifeq ($(ZT_DEBUG),1) DEFS+=-DZT_TRACE CFLAGS=-Wall -g -pthread $(INCLUDES) $(DEFS) + CXXFLAGS=-Wall -g -pthread $(INCLUDES) $(DEFS) LDFLAGS= STRIP=echo # The following line enables optimization for the crypto code, since - # C25519 in particular is almost UNUSABLE in heavy testing without it. + # C25519 in particular is almost UNUSABLE in -O0 even on a 3ghz box! ext/lz4/lz4.o node/Salsa20.o node/SHA512.o node/C25519.o node/Poly1305.o: CFLAGS = -Wall -O2 -g -pthread $(INCLUDES) $(DEFS) else CFLAGS=-Wall -O3 -fPIE -fvisibility=hidden -fstack-protector -pthread $(INCLUDES) -DNDEBUG $(DEFS) + CXXFLAGS=-Wall -O3 -fPIE -fvisibility=hidden -fstack-protector -fno-rtti -pthread $(INCLUDES) -DNDEBUG $(DEFS) LDFLAGS=-pie -Wl,-z,relro,-z,now STRIP=strip --strip-all endif # Uncomment for gprof profile build #CFLAGS=-Wall -g -pg -pthread $(INCLUDES) $(DEFS) +#CXXFLAGS=-Wall -g -pg -pthread $(INCLUDES) $(DEFS) #LDFLAGS= #STRIP=echo -CXXFLAGS=$(CFLAGS) -fno-rtti - all: one one: $(OBJS) one.o @@ -62,7 +82,8 @@ clean: rm -rf *.o node/*.o controller/*.o osdep/*.o service/*.o ext/http-parser/*.o ext/lz4/*.o ext/json-parser/*.o zerotier-one zerotier-idtool zerotier-cli zerotier-selftest build-* ZeroTierOneInstaller-* *.deb *.rpm debug: FORCE - make -j 4 ZT_DEBUG=1 + make ZT_DEBUG=1 one + make ZT_DEBUG=1 selftest official: FORCE make -j 4 ZT_OFFICIAL_RELEASE=1 From 235d4aba9ab4f461eebfba66e659212d034bab48 Mon Sep 17 00:00:00 2001 From: Moritz Warning Date: Fri, 3 Jul 2015 20:18:19 +0200 Subject: [PATCH 3/3] allow environment variables to set/extend CC, CXX, CFLAGS and LDFLAGS --- make-freebsd.mk | 15 ++++++++------- 
make-linux.mk | 15 ++++++++------- make-mac.mk | 9 +++++---- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/make-freebsd.mk b/make-freebsd.mk index 90665e0e8..cc428ccbc 100644 --- a/make-freebsd.mk +++ b/make-freebsd.mk @@ -1,5 +1,5 @@ -CC=cc -CXX=c++ +CC?=cc +CXX?=c++ INCLUDES= DEFS= @@ -16,19 +16,20 @@ endif # "make debug" is a shortcut for this ifeq ($(ZT_DEBUG),1) DEFS+=-DZT_TRACE - CFLAGS=-Wall -g -pthread $(INCLUDES) $(DEFS) - LDFLAGS= + CFLAGS+=-Wall -g -pthread $(INCLUDES) $(DEFS) + LDFLAGS+= STRIP=echo # The following line enables optimization for the crypto code, since # C25519 in particular is almost UNUSABLE in heavy testing without it. ext/lz4/lz4.o node/Salsa20.o node/SHA512.o node/C25519.o node/Poly1305.o: CFLAGS = -Wall -O2 -g -pthread $(INCLUDES) $(DEFS) else - CFLAGS=-Wall -O3 -fPIE -fvisibility=hidden -fstack-protector -pthread $(INCLUDES) -DNDEBUG $(DEFS) - LDFLAGS=-pie -Wl,-z,relro,-z,now + CFLAGS?=-O3 -fstack-protector + CFLAGS+=-Wall -fPIE -fvisibility=hidden -fstack-protector -pthread $(INCLUDES) -DNDEBUG $(DEFS) + LDFLAGS+=-pie -Wl,-z,relro,-z,now STRIP=strip --strip-all endif -CXXFLAGS=$(CFLAGS) -fno-rtti +CXXFLAGS+=$(CFLAGS) -fno-rtti all: one diff --git a/make-linux.mk b/make-linux.mk index 6881926b0..18000c6c6 100644 --- a/make-linux.mk +++ b/make-linux.mk @@ -18,9 +18,8 @@ # # Automagically pick clang or gcc, with preference for clang -CC=$(shell if [ -e /usr/bin/clang ]; then echo clang; else echo gcc; fi) -CXX=$(shell if [ -e /usr/bin/clang++ ]; then echo clang++; else echo g++; fi) - +CC?=$(shell if [ -e /usr/bin/clang ]; then echo clang; else echo gcc; fi) +CXX?=$(shell if [ -e /usr/bin/clang++ ]; then echo clang++; else echo g++; fi) INCLUDES= DEFS= LIBS= @@ -43,16 +42,18 @@ endif # "make debug" is a shortcut for this ifeq ($(ZT_DEBUG),1) DEFS+=-DZT_TRACE - CFLAGS=-Wall -g -pthread $(INCLUDES) $(DEFS) - CXXFLAGS=-Wall -g -pthread $(INCLUDES) $(DEFS) + CFLAGS+=-Wall -g -pthread $(INCLUDES) $(DEFS) + CXXFLAGS+=-Wall 
-g -pthread $(INCLUDES) $(DEFS) LDFLAGS= STRIP=echo # The following line enables optimization for the crypto code, since # C25519 in particular is almost UNUSABLE in -O0 even on a 3ghz box! ext/lz4/lz4.o node/Salsa20.o node/SHA512.o node/C25519.o node/Poly1305.o: CFLAGS = -Wall -O2 -g -pthread $(INCLUDES) $(DEFS) else - CFLAGS=-Wall -O3 -fPIE -fvisibility=hidden -fstack-protector -pthread $(INCLUDES) -DNDEBUG $(DEFS) - CXXFLAGS=-Wall -O3 -fPIE -fvisibility=hidden -fstack-protector -fno-rtti -pthread $(INCLUDES) -DNDEBUG $(DEFS) + CFLAGS?=-O3 -fstack-protector + CFLAGS+=-Wall -fPIE -fvisibility=hidden -pthread $(INCLUDES) -DNDEBUG $(DEFS) + CXXFLAGS?=-O3 -fstack-protector + CXXFLAGS+=-Wall -fPIE -fvisibility=hidden -fno-rtti -pthread $(INCLUDES) -DNDEBUG $(DEFS) LDFLAGS=-pie -Wl,-z,relro,-z,now STRIP=strip --strip-all endif diff --git a/make-mac.mk b/make-mac.mk index 09fd1e36f..1bc842cee 100644 --- a/make-mac.mk +++ b/make-mac.mk @@ -1,5 +1,5 @@ -CC=clang -CXX=clang++ +CC?=clang +CXX?=clang++ INCLUDES=-I/usr/local/include DEFS= @@ -38,13 +38,14 @@ endif # Debug mode -- dump trace output, build binary with -g ifeq ($(ZT_DEBUG),1) DEFS+=-DZT_TRACE - CFLAGS=-Wall -g -pthread $(INCLUDES) $(DEFS) + CFLAGS+=-Wall -g -pthread $(INCLUDES) $(DEFS) STRIP=echo # The following line enables optimization for the crypto code, since # C25519 in particular is almost UNUSABLE in heavy testing without it. ext/lz4/lz4.o node/Salsa20.o node/SHA512.o node/C25519.o node/Poly1305.o: CFLAGS = -Wall -O2 -g -pthread $(INCLUDES) $(DEFS) else - CFLAGS=$(ARCH_FLAGS) -Wall -O3 -flto -fPIE -fvectorize -fstack-protector -pthread -mmacosx-version-min=10.7 -DNDEBUG -Wno-unused-private-field $(INCLUDES) $(DEFS) + CFLAGS?=-O3 -fstack-protector + CFLAGS+=$(ARCH_FLAGS) -Wall -flto -fPIE -fvectorize -pthread -mmacosx-version-min=10.7 -DNDEBUG -Wno-unused-private-field $(INCLUDES) $(DEFS) STRIP=strip endif