From 0bc10923a1fc0c1ac1058d5ba4044afa01bda474 Mon Sep 17 00:00:00 2001 From: travisladuke Date: Mon, 7 Aug 2023 12:42:03 -0700 Subject: [PATCH 01/12] Test that starting zerotier before internet works --- .github/workflows/validate-1m-linux.sh | 47 +++++++++++++++----------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/.github/workflows/validate-1m-linux.sh b/.github/workflows/validate-1m-linux.sh index 25d6bd473..c044245a0 100755 --- a/.github/workflows/validate-1m-linux.sh +++ b/.github/workflows/validate-1m-linux.sh @@ -71,26 +71,6 @@ main() { # Allow forwarding sysctl -w net.ipv4.ip_forward=1 - echo -e "\nPing from host to namespaces" - - ping -c 3 192.168.0.1 - ping -c 3 192.168.1.1 - - echo -e "\nPing from namespace to host" - - $NS1 ping -c 3 192.168.0.1 - $NS1 ping -c 3 192.168.0.1 - $NS2 ping -c 3 192.168.0.2 - $NS2 ping -c 3 192.168.0.2 - - echo -e "\nPing from ns1 to ns2" - - $NS1 ping -c 3 192.168.0.1 - - echo -e "\nPing from ns2 to ns1" - - $NS2 ping -c 3 192.168.0.1 - ################################################################################ # Memory Leak Check # ################################################################################ @@ -113,7 +93,34 @@ main() { ./zerotier-one node1 -p9996 -U >>node_1.log 2>&1 & # Second instance, not run in memory profiler + # Don't set up internet access until _after_ zerotier is running + # This has been a source of stuckness in the past. + $NS2 ip addr del 192.168.1.2/24 dev veth3 $NS2 sudo ./zerotier-one node2 -U -p9997 >>node_2.log 2>&1 & + sleep 1; + $NS2 ip addr add 192.168.1.2/24 dev veth3 + $NS2 ip route add default via 192.168.1.1 + + + echo -e "\nPing from host to namespaces" + + ping -c 3 192.168.0.1 + ping -c 3 192.168.1.1 + + echo -e "\nPing from namespace to host" + + $NS1 ping -c 3 192.168.0.1 + $NS1 ping -c 3 192.168.0.1 + $NS2 ping -c 3 192.168.0.2 + $NS2 ping -c 3 192.168.0.2 + + echo -e "\nPing from ns1 to ns2" + + $NS1 ping -c 3 192.168.0.1 + + echo -e "\nPing from ns2 to ns1" + + $NS2 ping -c 3 192.168.0.1 ################################################################################ # Online Check # From 14671009f89a881676289defa0e126d4f6f6bb5a Mon Sep 17 00:00:00 2001 From: travisladuke Date: Fri, 4 Aug 2023 16:12:36 -0700 Subject: [PATCH 02/12] Don't skip hellos when there are no paths available working on #2082 --- node/Node.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/node/Node.cpp b/node/Node.cpp index e2d5f7bf3..0657cbd0b 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -248,9 +248,15 @@ public: const std::vector *const alwaysContactEndpoints = _alwaysContact.get(p->address()); if (alwaysContactEndpoints) { - // Contact upstream peers as infrequently as possible ZT_PeerRole role = RR->topology->role(p->address()); + + // Contact upstream peers as infrequently as possible int roleBasedTimerScale = (role == ZT_PEER_ROLE_LEAF) ? 2 : 16; + + // Unless we don't any have paths to the roots, then we shouldn't wait a long time to contact them + bool hasPaths = p->paths(RR->node->now()).size() > 0; + roleBasedTimerScale = (role != ZT_PEER_ROLE_LEAF && !hasPaths) ? 
0 : roleBasedTimerScale; + if ((RR->node->now() - p->lastSentFullHello()) <= (ZT_PATH_HEARTBEAT_PERIOD * roleBasedTimerScale)) { return; } From 82a9122fc3fd58a1fb07777fc103f6789d9e534f Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Thu, 10 Aug 2023 09:58:47 -0700 Subject: [PATCH 03/12] Update validate-1m-linux.sh --- .github/workflows/validate-1m-linux.sh | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/validate-1m-linux.sh b/.github/workflows/validate-1m-linux.sh index c044245a0..1af38fada 100755 --- a/.github/workflows/validate-1m-linux.sh +++ b/.github/workflows/validate-1m-linux.sh @@ -173,6 +173,7 @@ main() { if [[ "$both_instances_online" != "true" ]]; then echo "One or more instances of ZeroTier failed to come online. Aborting test." + collect_zt_dump_files exit 1 fi @@ -283,16 +284,7 @@ main() { # Collect ZeroTier dump files # ################################################################################ - echo -e "\nCollecting ZeroTier dump files" - - node1_id=$($ZT1 -j status | jq -r .address) - node2_id=$($ZT2 -j status | jq -r .address) - - $ZT1 dump - mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node1_id.txt" - - $ZT2 dump - mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node2_id.txt" + collect_zt_dump_files ################################################################################ # Let ZeroTier idle long enough for various timers # @@ -468,4 +460,17 @@ check_exit_on_invalid_identity() { fi } +collect_zt_dump_files() { + echo -e "\nCollecting ZeroTier dump files" + + node1_id=$($ZT1 -j status | jq -r .address) + node2_id=$($ZT2 -j status | jq -r .address) + + $ZT1 dump + mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node1_id.txt" + + $ZT2 dump + mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node2_id.txt" + } + main "$@" From d826ddb2944151167c114bf9cd6b2cfff4eadcb5 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Thu, 10 Aug 2023 11:02:37 -0700 Subject: [PATCH 04/12] Save zt node log files on abort --- .github/workflows/validate-1m-linux.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/validate-1m-linux.sh b/.github/workflows/validate-1m-linux.sh index 1af38fada..de607f14d 100755 --- a/.github/workflows/validate-1m-linux.sh +++ b/.github/workflows/validate-1m-linux.sh @@ -174,7 +174,8 @@ main() { if [[ "$both_instances_online" != "true" ]]; then echo "One or more instances of ZeroTier failed to come online. Aborting test." 
collect_zt_dump_files - exit 1 + collect_zt_log_files + exit 0 fi echo -e "\nJoining networks" @@ -311,11 +312,10 @@ main() { time_test_end=$(date +%s) ################################################################################ - # Rename ZeroTier stdout/stderr logs # + # Copy ZeroTier stdout/stderr logs # ################################################################################ - mv node_1.log "$TEST_FILEPATH_PREFIX-node-log-$node1_id.txt" - mv node_2.log "$TEST_FILEPATH_PREFIX-node-log-$node2_id.txt" + collect_zt_log_files ################################################################################ # Generate report # @@ -471,6 +471,11 @@ collect_zt_dump_files() { $ZT2 dump mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node2_id.txt" - } +} + +collect_zt_log_files() { + cp node_1.log "$TEST_FILEPATH_PREFIX-node-log-$node1_id.txt" + cp node_2.log "$TEST_FILEPATH_PREFIX-node-log-$node2_id.txt" +} main "$@" From bb9ad5e41a7f2653e8c1302e9b9bfe4cad0027e1 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Thu, 10 Aug 2023 15:37:45 -0700 Subject: [PATCH 05/12] Separate test and summary step in validator script --- ...validate-1m-linux.sh => validate-linux.sh} | 162 ++++++++---------- .../{report.sh => validate-report.sh} | 9 +- .github/workflows/validate.yml | 8 +- 3 files changed, 83 insertions(+), 96 deletions(-) rename .github/workflows/{validate-1m-linux.sh => validate-linux.sh} (74%) rename .github/workflows/{report.sh => validate-report.sh} (67%) diff --git a/.github/workflows/validate-1m-linux.sh b/.github/workflows/validate-linux.sh similarity index 74% rename from .github/workflows/validate-1m-linux.sh rename to .github/workflows/validate-linux.sh index de607f14d..2e393f391 100755 --- a/.github/workflows/validate-1m-linux.sh +++ b/.github/workflows/validate-linux.sh @@ -3,26 +3,35 @@ # This test script joins Earth and pokes some stuff TEST_NETWORK=8056c2e21c000001 -RUN_LENGTH=60 +RUN_LENGTH=20 TEST_FINISHED=false ZTO_VER=$(git describe --tags $(git rev-list --tags --max-count=1)) ZTO_COMMIT=$(git rev-parse HEAD) ZTO_COMMIT_SHORT=$(git rev-parse --short HEAD) TEST_DIR_PREFIX="$ZTO_VER-$ZTO_COMMIT_SHORT-test-results" -EXIT_TEST_FAILED=0 + +TEST_OK=0 +TEST_FAIL=1 echo "Performing test on: $ZTO_VER-$ZTO_COMMIT_SHORT" TEST_FILEPATH_PREFIX="$TEST_DIR_PREFIX/$ZTO_COMMIT_SHORT" mkdir $TEST_DIR_PREFIX +# How long we will wait for ZT to come online before considering it a failure +MAX_WAIT_SECS=60 + ################################################################################ # Multi-node connectivity and performance test # ################################################################################ -main() { - echo -e "\nRunning test for $RUN_LENGTH seconds" + +test() { + + echo -e "\nPerforming pre-flight checks" check_exit_on_invalid_identity + echo -e "\nRunning test for $RUN_LENGTH seconds" + NS1="ip netns exec ns1" NS2="ip netns exec ns2" @@ -75,11 +84,11 @@ main() { # Memory Leak Check # ################################################################################ - FILENAME_MEMORY_LOG="$TEST_FILEPATH_PREFIX-memory.log" + export FILENAME_MEMORY_LOG="$TEST_FILEPATH_PREFIX-memory.log" echo -e "\nStarting a ZeroTier instance in each namespace..." - time_test_start=$(date +%s) + export time_test_start=$(date +%s) # Spam the CLI as ZeroTier is starting spam_cli 100 @@ -127,7 +136,6 @@ main() { ################################################################################ echo "Waiting for ZeroTier to come online before attempting test..." 
- MAX_WAIT_SECS="${MAX_WAIT_SECS:-120}" node1_online=false node2_online=false both_instances_online=false @@ -139,13 +147,13 @@ main() { node2_online="$($ZT2 -j info | jq '.online' 2>/dev/null)" echo "Checking for online status: try #$s, node1:$node1_online, node2:$node2_online" if [[ "$node1_online" == "true" ]]; then - time_zt_node1_online=$(date +%s) + export time_zt_node1_online=$(date +%s) fi if [[ "$node2_online" == "true" ]]; then - time_zt_node2_online=$(date +%s) + export time_zt_node2_online=$(date +%s) fi if [[ "$node2_online" == "true" && "$node1_online" == "true" ]]; then - both_instances_online=true + export both_instances_online=true break fi sleep 1 @@ -172,10 +180,7 @@ main() { $ZT2 status if [[ "$both_instances_online" != "true" ]]; then - echo "One or more instances of ZeroTier failed to come online. Aborting test." - collect_zt_dump_files - collect_zt_log_files - exit 0 + exit_test_and_generate_report $TEST_FAIL "one or more nodes failed to come online" fi echo -e "\nJoining networks" @@ -199,18 +204,14 @@ main() { $NS1 ping -c 16 $node2_ip4 >$PING12_FILENAME $NS2 ping -c 16 $node1_ip4 >$PING21_FILENAME - # Parse ping statistics - ping_loss_percent_1_to_2="${ping_loss_percent_1_to_2:-100.0}" - ping_loss_percent_2_to_1="${ping_loss_percent_2_to_1:-100.0}" - ping_loss_percent_1_to_2=$(cat $PING12_FILENAME | grep "packet loss" | awk '{print $6}' | sed 's/%//') ping_loss_percent_2_to_1=$(cat $PING21_FILENAME | grep "packet loss" | awk '{print $6}' | sed 's/%//') # Normalize loss value - ping_loss_percent_1_to_2=$(echo "scale=2; $ping_loss_percent_1_to_2/100.0" | bc) - ping_loss_percent_2_to_1=$(echo "scale=2; $ping_loss_percent_2_to_1/100.0" | bc) + export ping_loss_percent_1_to_2=$(echo "scale=2; $ping_loss_percent_1_to_2/100.0" | bc) + export ping_loss_percent_2_to_1=$(echo "scale=2; $ping_loss_percent_2_to_1/100.0" | bc) ################################################################################ # CLI Check # @@ -261,11 +262,9 @@ main() { # TODO: Validate JSON - ################################################################################ - # Performance Test # - ################################################################################ + # Performance Test - FILENAME_PERF_JSON="$TEST_FILEPATH_PREFIX-iperf.json" + export FILENAME_PERF_JSON="$TEST_FILEPATH_PREFIX-iperf.json" echo -e "\nBeginning performance test:" @@ -281,15 +280,7 @@ main() { cat $FILENAME_PERF_JSON - ################################################################################ - # Collect ZeroTier dump files # - ################################################################################ - - collect_zt_dump_files - - ################################################################################ - # Let ZeroTier idle long enough for various timers # - ################################################################################ + # Let ZeroTier idle long enough for various timers echo -e "\nIdling ZeroTier for $RUN_LENGTH seconds..." sleep $RUN_LENGTH @@ -301,25 +292,44 @@ main() { sleep 5 - ################################################################################ - # Stop test # - ################################################################################ + # Stop test echo -e "\nStopping memory check..." 
sudo pkill -15 -f valgrind sleep 10 - time_test_end=$(date +%s) + export time_test_end=$(date +%s) - ################################################################################ - # Copy ZeroTier stdout/stderr logs # - ################################################################################ + exit_test_and_generate_report $TEST_OK "completed test" +} - collect_zt_log_files +################################################################################ +# Generate report # +################################################################################ - ################################################################################ - # Generate report # - ################################################################################ +exit_test_and_generate_report() { + + echo "Exiting test with reason: $2 ($1)" + + # Collect ZeroTier dump files + + echo -e "\nCollecting ZeroTier dump files" + + node1_id=$($ZT1 -j status | jq -r .address) + node2_id=$($ZT2 -j status | jq -r .address) + + $ZT1 dump + mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node1_id.txt" + + $ZT2 dump + mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node2_id.txt" + + # Copy ZeroTier stdout/stderr logs + + cp node_1.log "$TEST_FILEPATH_PREFIX-node-log-$node1_id.txt" + cp node_2.log "$TEST_FILEPATH_PREFIX-node-log-$node2_id.txt" + + # Generate report cat $FILENAME_MEMORY_LOG @@ -328,9 +338,7 @@ main() { POSSIBLY_LOST=$(xmlstarlet sel -t -v '/valgrindoutput/error/xwhat' \ $FILENAME_MEMORY_LOG | grep "possibly" | awk '{print $1;}') - ################################################################################ - # Generate coverage report artifact and summary # - ################################################################################ + # Generate coverage report artifact and summary FILENAME_COVERAGE_JSON="$TEST_FILEPATH_PREFIX-coverage.json" FILENAME_COVERAGE_HTML="$TEST_FILEPATH_PREFIX-coverage.html" @@ -350,22 +358,20 @@ main() { COVERAGE_LINE_TOTAL="${COVERAGE_LINE_TOTAL:-0}" COVERAGE_LINE_PERCENT="${COVERAGE_LINE_PERCENT:-0}" - ################################################################################ - # Default values # - ################################################################################ + # Default values DEFINITELY_LOST="${DEFINITELY_LOST:-0}" POSSIBLY_LOST="${POSSIBLY_LOST:-0}" + ping_loss_percent_1_to_2="${ping_loss_percent_1_to_2:-100.0}" + ping_loss_percent_2_to_1="${ping_loss_percent_2_to_1:-100.0}" - ################################################################################ - # Summarize and emit json for trend reporting # - ################################################################################ + # Summarize and emit json for trend reporting FILENAME_SUMMARY="$TEST_FILEPATH_PREFIX-summary.json" time_length_test=$((time_test_end - time_test_start)) - time_length_zt_node1_online=$((time_zt_node1_online - time_zt_start)) - time_length_zt_node2_online=$((time_zt_node2_online - time_zt_start)) + time_to_node1_online=$((time_zt_node1_online - time_zt_start)) + time_to_node2_online=$((time_zt_node2_online - time_zt_start)) #time_length_zt_join=$((time_zt_join_end-time_zt_join_start)) #time_length_zt_leave=$((time_zt_leave_end-time_zt_leave_start)) #time_length_zt_can_still_ping=$((time_zt_can_still_ping-time_zt_leave_start)) @@ -377,25 +383,20 @@ main() { "commit":"$ZTO_COMMIT", "arch_m":"$(uname -m)", "arch_a":"$(uname -a)", + "binary_size":"$(stat -c %s zerotier-one)" "time_length_test":$time_length_test, - 
"time_length_zt_node1_online":$time_length_zt_node1_online, - "time_length_zt_node2_online":$time_length_zt_node2_online, + "time_to_node1_online":$time_to_node1_online, + "time_to_node2_online":$time_to_node2_online, "num_possible_bytes_lost": $POSSIBLY_LOST, "num_definite_bytes_lost": $DEFINITELY_LOST, - "num_incorrect_settings": $POSSIBLY_LOST, "num_bad_formattings": $POSSIBLY_LOST, - "percent_coverage_branches": $POSSIBLY_LOST, "coverage_lines_covered": $COVERAGE_LINE_COVERED, "coverage_lines_total": $COVERAGE_LINE_TOTAL, "coverage_lines_percent": $COVERAGE_LINE_PERCENT, "ping_loss_percent_1_to_2": $ping_loss_percent_1_to_2, "ping_loss_percent_2_to_1": $ping_loss_percent_2_to_1, - "mean_latency_ping_random": $POSSIBLY_LOST, - "mean_latency_ping_netns": $POSSIBLY_LOST, - "mean_pdv_random": $POSSIBLY_LOST, - "mean_pdv_netns": $POSSIBLY_LOST, - "mean_perf_netns": $POSSIBLY_LOST, - "exit_test_failed": $EXIT_TEST_FAILED + "test_exit_code": $1, + "test_exit_reason":"$2" } EOF ) @@ -436,6 +437,10 @@ spam_cli() { done } +################################################################################ +# Check for proper exit on load of invalid identity # +################################################################################ + check_exit_on_invalid_identity() { echo "Checking ZeroTier exits on invalid identity..." mkdir -p $(pwd)/exit_test @@ -447,35 +452,14 @@ check_exit_on_invalid_identity() { $ZT1 & my_pid=$! - echo "Waiting 5 secons" + echo "Waiting 5 seconds" sleep 5 # check if process is running kill -0 $my_pid if [ $? -eq 0 ]; then - EXIT_TEST_FAILED=1 - echo "Exit test FAILED: Process still running after being fed an invalid identity" - else - echo "Exit test PASSED" + exit_test_and_generate_report $TEST_FAIL "Exit test FAILED: Process still running after being fed an invalid identity" fi } -collect_zt_dump_files() { - echo -e "\nCollecting ZeroTier dump files" - - node1_id=$($ZT1 -j status | jq -r .address) - node2_id=$($ZT2 -j status | jq -r .address) - - $ZT1 dump - mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node1_id.txt" - - $ZT2 dump - mv zerotier_dump.txt "$TEST_FILEPATH_PREFIX-node-dump-$node2_id.txt" -} - -collect_zt_log_files() { - cp node_1.log "$TEST_FILEPATH_PREFIX-node-log-$node1_id.txt" - cp node_2.log "$TEST_FILEPATH_PREFIX-node-log-$node2_id.txt" -} - -main "$@" +test "$@" diff --git a/.github/workflows/report.sh b/.github/workflows/validate-report.sh similarity index 67% rename from .github/workflows/report.sh rename to .github/workflows/validate-report.sh index c79139544..3ae4e1a16 100755 --- a/.github/workflows/report.sh +++ b/.github/workflows/validate-report.sh @@ -5,6 +5,8 @@ ################################################################################ DEFINITELY_LOST=$(cat *test-results/*summary.json | jq .num_definite_bytes_lost) +EXIT_CODE=$(cat *test-results/*summary.json | jq .exit_code) +EXIT_REASON=$(cat *test-results/*summary.json | jq .exit_reason) cat *test-results/*summary.json @@ -14,8 +16,9 @@ if [[ "$DEFINITELY_LOST" -gt 0 ]]; then exit 1 fi -EXIT_TEST_FAILED=$(cat *test-results/*summary.json | jq .exit_test_failed) +# Catch-all for other non-zero exit codes -if [[ "$EXIT_TEST_FAILED" -gt 0 ]]; then +if [[ "$EXIT_CODE" -gt 0 ]]; then + echo "Test failed: $EXIT_REASON" exit 1 -fi +fi \ No newline at end of file diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index c95d8e599..bb362fb8c 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -40,8 +40,8 @@ jobs: run: | 
sudo apt install -y valgrind xmlstarlet gcovr iperf3 tree
          make one ZT_COVERAGE=1 ZT_TRACE=1
-         sudo chmod +x ./.github/workflows/validate-1m-linux.sh
-         sudo ./.github/workflows/validate-1m-linux.sh
+         sudo chmod +x ./.github/workflows/validate-linux.sh
+         sudo ./.github/workflows/validate-linux.sh

     - name: Archive test results
       uses: actions/upload-artifact@v3

     - name: final-report
       run: |
-        sudo chmod +x ./.github/workflows/report.sh
-        sudo ./.github/workflows/report.sh
+        sudo chmod +x ./.github/workflows/validate-report.sh
+        sudo ./.github/workflows/validate-report.sh

From b071d6bbecff3b64f77d31727f336b596dffc7fd Mon Sep 17 00:00:00 2001
From: travisladuke
Date: Fri, 14 Jul 2023 13:45:45 -0700
Subject: [PATCH 06/12] Don't apply default route until zerotier is "online"

I was running into issues with restarting the zerotier service while
"full tunnel" mode is enabled.

When zerotier first boots, it gets network state from the cache on
disk, so it immediately applies all the routes it knew about before it
shut down. The network config may have changed in that time. If it has,
then your default route points via a route you are blocked from talking
on, so you can't fetch the current network config, and your internet
does not work.

Other options include:
- don't use cached network state on boot
- find a better criterion than "online"
---
 service/OneService.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/service/OneService.cpp b/service/OneService.cpp
index ba321282c..0a9842a7d 100644
--- a/service/OneService.cpp
+++ b/service/OneService.cpp
@@ -2617,8 +2617,9 @@ public:
                 r->second->sync();
             }
             for(std::map< InetAddress, SharedPtr >::iterator r(n.managedRoutes().begin());r!=n.managedRoutes().end();++r) {
-                if (r->second->via())
+                if (r->second->via() && (!r->second->target().isDefaultRoute() || _node->online())) {
                     r->second->sync();
+                }
             }
         }

From 424e2761503d8a507c1391645fe9d3002593103c Mon Sep 17 00:00:00 2001
From: Joseph Henry
Date: Fri, 11 Aug 2023 10:14:29 -0700
Subject: [PATCH 07/12] Fix node time-to-online counter in validator script

---
 .github/workflows/validate-linux.sh | 50 +++++++++++++----------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/validate-linux.sh b/.github/workflows/validate-linux.sh
index 2e393f391..abe318600 100755
--- a/.github/workflows/validate-linux.sh
+++ b/.github/workflows/validate-linux.sh
@@ -3,7 +3,7 @@
 # This test script joins Earth and pokes some stuff
 TEST_NETWORK=8056c2e21c000001
-RUN_LENGTH=20
+RUN_LENGTH=30
 TEST_FINISHED=false
 ZTO_VER=$(git describe --tags $(git rev-list --tags --max-count=1))
 ZTO_COMMIT=$(git rev-parse HEAD)
 ZTO_COMMIT_SHORT=$(git rev-parse --short HEAD)
 TEST_DIR_PREFIX="$ZTO_VER-$ZTO_COMMIT_SHORT-test-results"
@@ -18,7 +18,7 @@ TEST_FILEPATH_PREFIX="$TEST_DIR_PREFIX/$ZTO_COMMIT_SHORT"
 mkdir $TEST_DIR_PREFIX

 # How long we will wait for ZT to come online before considering it a failure
-MAX_WAIT_SECS=60
+MAX_WAIT_SECS=30

 ################################################################################
 # Multi-node connectivity and performance test #
 ################################################################################
@@ -110,7 +110,6 @@ test() {
     $NS2 ip addr add 192.168.1.2/24 dev veth3
     $NS2 ip route add default via 192.168.1.1
-
     echo -e "\nPing from host to namespaces"

     ping -c 3 192.168.0.1
     ping -c 3 192.168.1.1
@@ -142,18 +141,13 @@ test() {
     time_zt_node1_start=$(date +%s)
     time_zt_node2_start=$(date +%s)

-    for ((s = 0; s <= MAX_WAIT_SECS; s++)); do
+    for ((s = 0; s <= $MAX_WAIT_SECS; s++)); do
         node1_online="$($ZT1 -j info | jq '.online' 2>/dev/null)"
         node2_online="$($ZT2 -j info | jq '.online' 2>/dev/null)"
         echo "Checking for online status: try #$s,
node1:$node1_online, node2:$node2_online" - if [[ "$node1_online" == "true" ]]; then - export time_zt_node1_online=$(date +%s) - fi - if [[ "$node2_online" == "true" ]]; then - export time_zt_node2_online=$(date +%s) - fi if [[ "$node2_online" == "true" && "$node1_online" == "true" ]]; then export both_instances_online=true + export time_to_both_nodes_online=$(date +%s) break fi sleep 1 @@ -167,16 +161,16 @@ test() { tree node2 echo -e "\n\nRunning ZeroTier processes:" - echo -e "\nNode 1:" + echo -e "\nNode 1:\n" $NS1 ps aux | grep zerotier-one - echo -e "\nNode 2:" + echo -e "\nNode 2:\n" $NS2 ps aux | grep zerotier-one echo -e "\n\nStatus of each instance:" - echo -e "\n\nNode 1:" + echo -e "\n\nNode 1:\n" $ZT1 status - echo -e "\n\nNode 2:" + echo -e "\n\nNode 2:\n" $ZT2 status if [[ "$both_instances_online" != "true" ]]; then @@ -292,14 +286,6 @@ test() { sleep 5 - # Stop test - - echo -e "\nStopping memory check..." - sudo pkill -15 -f valgrind - sleep 10 - - export time_test_end=$(date +%s) - exit_test_and_generate_report $TEST_OK "completed test" } @@ -309,6 +295,12 @@ test() { exit_test_and_generate_report() { + echo -e "\nStopping memory check..." + sudo pkill -15 -f valgrind + sleep 10 + + time_test_end=$(date +%s) + echo "Exiting test with reason: $2 ($1)" # Collect ZeroTier dump files @@ -364,14 +356,17 @@ exit_test_and_generate_report() { POSSIBLY_LOST="${POSSIBLY_LOST:-0}" ping_loss_percent_1_to_2="${ping_loss_percent_1_to_2:-100.0}" ping_loss_percent_2_to_1="${ping_loss_percent_2_to_1:-100.0}" + time_to_both_nodes_online="${time_to_both_nodes_online:--1}" # Summarize and emit json for trend reporting FILENAME_SUMMARY="$TEST_FILEPATH_PREFIX-summary.json" time_length_test=$((time_test_end - time_test_start)) - time_to_node1_online=$((time_zt_node1_online - time_zt_start)) - time_to_node2_online=$((time_zt_node2_online - time_zt_start)) + if [[ $time_to_both_nodes_online != -1 ]]; + then + time_to_both_nodes_online=$((time_to_both_nodes_online - time_test_start)) + fi #time_length_zt_join=$((time_zt_join_end-time_zt_join_start)) #time_length_zt_leave=$((time_zt_leave_end-time_zt_leave_start)) #time_length_zt_can_still_ping=$((time_zt_can_still_ping-time_zt_leave_start)) @@ -383,10 +378,9 @@ exit_test_and_generate_report() { "commit":"$ZTO_COMMIT", "arch_m":"$(uname -m)", "arch_a":"$(uname -a)", - "binary_size":"$(stat -c %s zerotier-one)" + "binary_size":"$(stat -c %s zerotier-one)", "time_length_test":$time_length_test, - "time_to_node1_online":$time_to_node1_online, - "time_to_node2_online":$time_to_node2_online, + "time_to_both_nodes_online":$time_to_both_nodes_online, "num_possible_bytes_lost": $POSSIBLY_LOST, "num_definite_bytes_lost": $DEFINITELY_LOST, "num_bad_formattings": $POSSIBLY_LOST, @@ -403,6 +397,8 @@ EOF echo $summary >$FILENAME_SUMMARY cat $FILENAME_SUMMARY + + exit 0 } ################################################################################ From a6d5c452d5b54d7e0479570cb40f110be0717f10 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Fri, 11 Aug 2023 10:35:42 -0700 Subject: [PATCH 08/12] Export variables so that they are accessible by exit function --- .github/workflows/validate-linux.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/validate-linux.sh b/.github/workflows/validate-linux.sh index abe318600..61670d670 100755 --- a/.github/workflows/validate-linux.sh +++ b/.github/workflows/validate-linux.sh @@ -32,12 +32,12 @@ test() { echo -e "\nRunning test for $RUN_LENGTH seconds" - NS1="ip netns exec ns1" 
-    NS2="ip netns exec ns2"
+    export NS1="ip netns exec ns1"
+    export NS2="ip netns exec ns2"

-    ZT1="$NS1 ./zerotier-cli -p9996 -D$(pwd)/node1"
+    export ZT1="$NS1 ./zerotier-cli -p9996 -D$(pwd)/node1"
     # Specify custom port on one node to ensure that feature works
-    ZT2="$NS2 ./zerotier-cli -p9997 -D$(pwd)/node2"
+    export ZT2="$NS2 ./zerotier-cli -p9997 -D$(pwd)/node2"

     echo -e "\nSetting up network namespaces..."
     echo "Setting up ns1"

From b81ad9a84dc14f0e03cebf4f39a7be0dafb8b43a Mon Sep 17 00:00:00 2001
From: travisladuke
Date: Wed, 2 Aug 2023 15:07:09 -0700
Subject: [PATCH 09/12] Fix PortMapper issue on ZeroTier startup

See issue #2082.

We use a call to libnatpmp::initnatpmp to make sure the computer has
working network sockets before we go into the main NAT-PMP/UPnP logic,
with a basic exponential delay of up to 30 seconds.
---
 osdep/PortMapper.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/osdep/PortMapper.cpp b/osdep/PortMapper.cpp
index 0027ed69c..fce985bf4 100644
--- a/osdep/PortMapper.cpp
+++ b/osdep/PortMapper.cpp
@@ -79,6 +79,7 @@ public:
     throw()
     {
         int mode = 0; // 0 == NAT-PMP, 1 == UPnP
+        int retrytime = 500;
 #ifdef ZT_PORTMAPPER_TRACE
         fprintf(stderr,"PortMapper: started for UDP port %d" ZT_EOL_S,localPort);
 #endif
         while (run) {
+            // use initnatpmp to check if we can bind a port at all
+            natpmp_t _natpmp;
+            int result = initnatpmp(&_natpmp,0,0);
+            if (result !=0 ) {
+                closenatpmp(&_natpmp);
+#ifdef ZT_PORTMAPPER_TRACE
+                PM_TRACE("PortMapper: init failed %d. You might not have any IP addresses yet. Trying again in %d" ZT_EOL_S, retrytime);
+#endif
+                Thread::sleep(retrytime);
+                retrytime = retrytime * 2;
+                if (retrytime > ZT_PORTMAPPER_REFRESH_DELAY / 10) {
+                    retrytime = ZT_PORTMAPPER_REFRESH_DELAY / 10;
+                }
+                continue;
+            } else {
+                closenatpmp(&_natpmp);
+                retrytime = 500;
+            }
         // ---------------------------------------------------------------------
         // NAT-PMP mode (preferred)
         // ---------------------------------------------------------------------
@@ -172,6 +191,7 @@ public:
 #ifdef ZT_PORTMAPPER_TRACE
             PM_TRACE("PortMapper: NAT-PMP: request failed, switching to UPnP mode" ZT_EOL_S);
 #endif
+            continue;
         }
     }
     // ---------------------------------------------------------------------
@@ -293,6 +313,7 @@ public:
 #ifdef ZT_PORTMAPPER_TRACE
             PM_TRACE("PortMapper: upnpDiscover failed, returning to NAT-PMP mode: %d" ZT_EOL_S,upnpError);
 #endif
+            break;
         }
     }
     // ---------------------------------------------------------------------

From f2060e0c7622f3899d6de93d50b5ffae27145322 Mon Sep 17 00:00:00 2001
From: travisladuke
Date: Fri, 11 Aug 2023 08:41:13 -0700
Subject: [PATCH 10/12] testing

---
 osdep/PortMapper.cpp | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/osdep/PortMapper.cpp b/osdep/PortMapper.cpp
index fce985bf4..519aba917 100644
--- a/osdep/PortMapper.cpp
+++ b/osdep/PortMapper.cpp
@@ -14,7 +14,7 @@
 #ifdef ZT_USE_MINIUPNPC

 // Uncomment to dump debug messages
-//#define ZT_PORTMAPPER_TRACE 1
+#define ZT_PORTMAPPER_TRACE 1

 #ifdef __ANDROID__
 #include
@@ -87,23 +87,25 @@ public:
         while (run) {
-            // use initnatpmp to check if we can bind a port at all
-            natpmp_t _natpmp;
-            int result = initnatpmp(&_natpmp,0,0);
-            if (result !=0 ) {
-                closenatpmp(&_natpmp);
+            {
+                // use initnatpmp to check if we can bind a port at all
+                natpmp_t _natpmp;
+                int result = initnatpmp(&_natpmp,0,0);
+                if (result == NATPMP_ERR_CANNOTGETGATEWAY || result == NATPMP_ERR_SOCKETERROR) {
+                    closenatpmp(&_natpmp);
 #ifdef
ZT_PORTMAPPER_TRACE
-                PM_TRACE("PortMapper: init failed %d. You might not have any IP addresses yet. Trying again in %d" ZT_EOL_S, retrytime);
+                PM_TRACE("PortMapper: init failed %d. You might not have an internet connection yet. Trying again in %d" ZT_EOL_S, result, retrytime);
 #endif
-            Thread::sleep(retrytime);
-            retrytime = retrytime * 2;
-            if (retrytime > ZT_PORTMAPPER_REFRESH_DELAY / 10) {
-                retrytime = ZT_PORTMAPPER_REFRESH_DELAY / 10;
+                Thread::sleep(retrytime);
+                retrytime = retrytime * 2;
+                if (retrytime > ZT_PORTMAPPER_REFRESH_DELAY / 10) {
+                    retrytime = ZT_PORTMAPPER_REFRESH_DELAY / 10;
+                }
+                continue;
+            } else {
+                closenatpmp(&_natpmp);
+                retrytime = 500;
             }
-            continue;
-        } else {
-            closenatpmp(&_natpmp);
-            retrytime = 500;
         }
         // ---------------------------------------------------------------------
         // NAT-PMP mode (preferred)
         // ---------------------------------------------------------------------
@@ -313,7 +315,6 @@ public:
 #ifdef ZT_PORTMAPPER_TRACE
             PM_TRACE("PortMapper: upnpDiscover failed, returning to NAT-PMP mode: %d" ZT_EOL_S,upnpError);
 #endif
-            break;
         }
     }
     // ---------------------------------------------------------------------

From d976a9f5a0c12f961cf778d386aa9a55d907db6a Mon Sep 17 00:00:00 2001
From: travisladuke
Date: Fri, 11 Aug 2023 12:17:06 -0700
Subject: [PATCH 11/12] Comment out PortMapper debug

This got left turned on in a confusing merge previously.
---
 osdep/PortMapper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/osdep/PortMapper.cpp b/osdep/PortMapper.cpp
index 519aba917..1228a95a2 100644
--- a/osdep/PortMapper.cpp
+++ b/osdep/PortMapper.cpp
@@ -14,7 +14,7 @@
 #ifdef ZT_USE_MINIUPNPC

 // Uncomment to dump debug messages
-#define ZT_PORTMAPPER_TRACE 1
+//#define ZT_PORTMAPPER_TRACE 1

 #ifdef __ANDROID__
 #include

From 1d095e81d94ebe29ab086d84b6401accde2964fa Mon Sep 17 00:00:00 2001
From: travisladuke
Date: Fri, 11 Aug 2023 11:06:25 -0700
Subject: [PATCH 12/12] fix macos default route again

See commit fb6af1971.

* Fix network DNS on macOS

Adding the DNS settings to System Configuration causes an extra route
to be added, which breaks the IPv4 default route. We figured out a
weird System Configuration setting that works around it; previously we
couldn't figure out how to fix this in SystemConfiguration, so here we
are.

We also moved the DNS setter to before the syncIps logic to help with a
race condition. Re-joining a network with default route enabled did not
always work before this change.
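A quick way to observe the symptom described above is to compare the macOS
routing and DNS state before and after enabling full-tunnel mode. The commands
below are ordinary macOS diagnostics added here only as an illustration; they
are not part of this patch, and the grep filter is just one way to narrow the
output.

    # Illustrative only; standard macOS tools, not part of this patch.
    netstat -rn -f inet | grep default   # list IPv4 default route(s) and their interfaces
    route -n get default                 # show which gateway/interface currently wins
    scutil --dns | head -n 25            # show the DNS configuration macOS is actually using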
--- osdep/MacDNSHelper.mm | 5 ++- osdep/ManagedRoute.cpp | 71 +++++++++++++++++++++++++----------------- 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/osdep/MacDNSHelper.mm b/osdep/MacDNSHelper.mm index aab244f74..5e170e172 100644 --- a/osdep/MacDNSHelper.mm +++ b/osdep/MacDNSHelper.mm @@ -107,7 +107,6 @@ void MacDNSHelper::removeDNS(uint64_t nwid) bool MacDNSHelper::addIps4(uint64_t nwid, const MAC mac, const char *dev, const std::vector& addrs) { const char* ipStr = {0}; - const char* ipStr2 = {0}; char buf2[256] = {0}; bool hasV4 = false; @@ -116,7 +115,6 @@ bool MacDNSHelper::addIps4(uint64_t nwid, const MAC mac, const char *dev, const hasV4 = true; ipStr = addrs[i].toIpString(buf2); - ipStr2 = addrs[i].toIpString(buf2); break; } @@ -141,7 +139,8 @@ bool MacDNSHelper::addIps4(uint64_t nwid, const MAC mac, const char *dev, const CFStringRef cfdev = CFStringCreateWithCString(NULL, dev, kCFStringEncodingUTF8); CFStringRef cfserver = CFStringCreateWithCString(NULL, "127.0.0.1", kCFStringEncodingUTF8); - CFStringRef cfrouter = CFStringCreateWithCString(NULL, ipStr2, kCFStringEncodingUTF8); + // using the ip from the zerotier network breaks routing on the mac + CFStringRef cfrouter = CFStringCreateWithCString(NULL, "127.0.0.1", kCFStringEncodingUTF8); const int SIZE = 4; CFStringRef keys[SIZE]; diff --git a/osdep/ManagedRoute.cpp b/osdep/ManagedRoute.cpp index 1af9f6a93..a06ba11a3 100644 --- a/osdep/ManagedRoute.cpp +++ b/osdep/ManagedRoute.cpp @@ -252,7 +252,7 @@ static std::vector<_RTE> _getRTEs(const InetAddress &target,bool contains) static void _routeCmd(const char *op,const InetAddress &target,const InetAddress &via,const char *ifscope,const char *localInterface) { - //char f1[1024],f2[1024]; printf("%s %s %s %s %s\n",op,target.toString(f1),via.toString(f2),ifscope,localInterface); + // char f1[1024],f2[1024]; printf("cmd %s %s %s %s %s\n",op,target.toString(f1),via.toString(f2),ifscope,localInterface); long p = (long)fork(); if (p > 0) { int exitcode = -1; @@ -479,6 +479,9 @@ bool ManagedRoute::sync() if (hasRoute) { break; } } + // char buf[255]; + // fprintf(stderr, "hasRoute %d %s\n", !!hasRoute, _target.toString(buf)); + if (!hasRoute) { if (_target && _target.netmaskBits() == 0) { @@ -486,46 +489,58 @@ bool ManagedRoute::sync() char newSystemDevice[128]; newSystemDevice[0] = (char)0; - // Find system default route that this route should override - // We need to put it back when default route is turned off - for(std::vector<_RTE>::iterator r(rtes.begin());r!=rtes.end();++r) { - if (r->via) { - if ( !_systemVia && r->isDefault == 1 && (strcmp(r->device,_device) != 0) ) { - - newSystemVia = r->via; - Utils::scopy(newSystemDevice,sizeof(newSystemDevice),r->device); - } - } - } - - if (!newSystemVia) { return false; } - - // Get device corresponding to route if we don't have that already - if ((newSystemVia)&&(!newSystemDevice[0])) { - rtes = _getRTEs(newSystemVia,true); + // If macos has a network hiccup, it deletes what _systemVia we had set. + // Then we don't know how to set the default route again. + // So use the one we had set previously. Don't overwrite it. 
+ if (!_systemVia) { + // Find system default route that this route should override + // We need to put it back when default route is turned off for(std::vector<_RTE>::iterator r(rtes.begin());r!=rtes.end();++r) { - if ( (r->device[0]) && (strcmp(r->device,_device) != 0) && r->target.netmaskBits() != 0) { - Utils::scopy(newSystemDevice,sizeof(newSystemDevice),r->device); - break; + if (r->via) { + if ( !_systemVia && r->isDefault == 1 && (strcmp(r->device,_device) != 0) ) { + + newSystemVia = r->via; + Utils::scopy(newSystemDevice,sizeof(newSystemDevice),r->device); + } } } + if (newSystemVia) { _systemVia = newSystemVia; } } - if (!newSystemDevice[0]) { return false; } - // update the system via in case it changed out from under us - // while we were in default route mode + // char buf1[255], buf2[255]; + // fprintf(stderr, "_systemVia %s new %s\n", _systemVia.toString(buf1), newSystemVia.toString(buf2)); + if (!_systemVia) { return false; } - _systemVia = newSystemVia; - Utils::scopy(_systemDevice,sizeof(_systemDevice),newSystemDevice); + if (!_systemDevice[0]) { + // Get device corresponding to route if we don't have that already + if ((newSystemVia)&&(!newSystemDevice[0])) { + rtes = _getRTEs(newSystemVia,true); + for(std::vector<_RTE>::iterator r(rtes.begin());r!=rtes.end();++r) { + if ( (r->device[0]) && (strcmp(r->device,_device) != 0) && r->target.netmaskBits() != 0) { + Utils::scopy(newSystemDevice,sizeof(newSystemDevice),r->device); + break; + } + } + } - // Do the actual default route commands + if (newSystemDevice[0]) { + Utils::scopy(_systemDevice,sizeof(_systemDevice),newSystemDevice); + } + } + // fprintf(stderr, "_systemDevice %s new %s\n", _systemDevice, newSystemDevice); + if (!_systemDevice[0]) { return false; } + + + // Do Default Route route commands _routeCmd("delete",_target,_systemVia,(const char *)0,(const char *)0); _routeCmd("add",_target,_via,(const char *)0,(const char *)0); _routeCmd("add",_target,_systemVia,_systemDevice,(const char *)0); + _applied[_target] = true; + } else { - // Do the actual route commands + // Do Non-Default route commands _applied[_target] = true; _routeCmd("add",leftt,_via,(const char *)0,(_via) ? (const char *)0 : _device); }
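For readers unfamiliar with the macOS side, the three _routeCmd() calls in the
default-route branch above translate, roughly, into route(8) invocations like
the sketch below. The addresses and the interface name are invented for
illustration; the real values come from _target, _via, _systemVia and
_systemDevice at runtime, and the exact argument layout is built by
_routeCmd() itself.

    # Rough illustration only; invented example values, not part of this patch.
    route -q delete default 192.168.1.1              # drop the current system default (_systemVia)
    route -q add default 10.147.17.1                 # install the ZeroTier default route (_via)
    route -q add -ifscope en0 default 192.168.1.1    # keep the old default, scoped to the physical interface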