checker: don't let failures in add-lease affect checker results. Closes #875.

Mutable servermap updates and the immutable checker, when run with
add_lease=True, send both the do-you-have-block and add-lease commands in
parallel, to avoid an extra round trip time. Many older servers have problems
with add-lease and raise various exceptions, which don't generally matter.
The client-side code was catching+ignoring some of them, but unrecognized
exceptions were passed through to the DYHB code, concealing the DYHB results
from the checker, making it think the server had no shares.

The fix is to separate the code paths. Both commands are sent at the same
time, but the errback path from add-lease is handled separately. Known
exceptions are ignored, the others (both unknown-remote and all-local) are
logged (log.WEIRD, which will trigger an Incident), but neither will affect
the DYHB results.

The add-lease message is sent first, and we know that the server handles them
synchronously. So when the checker is done, we can be sure that all the
add-lease messages have been retired. This makes life easier for unit tests.
This commit is contained in:
Brian Warner 2009-12-29 15:01:08 -08:00
parent a50fdca6ab
commit 794e32738f
3 changed files with 122 additions and 42 deletions

View File

@ -71,33 +71,14 @@ class Checker(log.PrefixingLogMixin):
that we want to track and report whether or not each server
responded.)"""
d = server.callRemote("get_buckets", storageindex)
if self._add_lease:
renew_secret = self._get_renewal_secret(serverid)
cancel_secret = self._get_cancel_secret(serverid)
d2 = server.callRemote("add_lease", storageindex,
renew_secret, cancel_secret)
dl = defer.DeferredList([d, d2], consumeErrors=True)
def _done(res):
[(get_success, get_result),
(addlease_success, addlease_result)] = res
# ignore remote IndexError on the add_lease call. Propagate
# local errors and remote non-IndexErrors
if addlease_success:
return get_result
if not addlease_result.check(RemoteException):
# Propagate local errors
return addlease_result
if addlease_result.value.failure.check(IndexError):
# tahoe=1.3.0 raised IndexError on non-existant
# buckets, which we ignore
return get_result
# propagate remote errors that aren't IndexError, including
# the unfortunate internal KeyError bug that <1.3.0 had.
return addlease_result
dl.addCallback(_done)
d = dl
d2.addErrback(self._add_lease_failed, serverid, storageindex)
d = server.callRemote("get_buckets", storageindex)
def _wrap_results(res):
return (res, serverid, True)
@ -111,6 +92,38 @@ class Checker(log.PrefixingLogMixin):
d.addCallbacks(_wrap_results, _trap_errs)
return d
def _add_lease_failed(self, f, peerid, storage_index):
    """Errback for the parallel add-lease message.

    Swallow known-harmless server-side failures and log everything else;
    never re-raise, so an add-lease problem can never contaminate the
    DYHB/checker results (ticket #875).

    Older versions of Tahoe didn't handle the add-lease message very
    well: <=1.1.0 throws a NameError because it doesn't implement
    remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
    (which is most of them, since we send add-lease to everybody,
    before we know whether or not they have any shares for us), and
    1.2.0 throws KeyError even on known buckets due to an internal bug
    in the latency-measuring code. We want to ignore the known-harmless
    errors and log the others; in particular we want to log any local
    errors caused by coding problems.
    """
    if not f.check(RemoteException):
        # local errors are cause for alarm
        log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
                peerid=idlib.shortnodeid_b2a(peerid),
                f_value=str(f.value),
                failure=f,
                level=log.WEIRD, umid="hEGuQg")
        return
    if f.value.failure.check(KeyError, IndexError, NameError):
        # this may ignore a bit too much, but that only hurts us
        # during debugging
        return
    # an unrecognized remote exception: log it (WEIRD triggers an Incident)
    self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
             peerid=idlib.shortnodeid_b2a(peerid),
             f_value=str(f.value),
             failure=f,
             level=log.WEIRD, umid="atbAxw")
def _download_and_verify(self, serverid, sharenum, bucket):
"""Start an attempt to download and verify every block in this bucket
and return a deferred that will eventually fire once the attempt

View File

@ -536,32 +536,19 @@ class ServermapUpdater:
return d
def _do_read(self, ss, peerid, storage_index, shnums, readv):
    """Send slot_readv (and, when add-lease is enabled, a parallel
    add-lease message) to a single server.

    Returns only the slot_readv Deferred: the add-lease result is routed
    to _add_lease_failed and deliberately kept out of the readv path, so
    a buggy server's add-lease exception can never hide real DYHB/readv
    results from the servermap updater (ticket #875).
    """
    # NOTE(review): the rendered hunk had the removed and added lines
    # interleaved (two slot_readv calls, dead code after an early return);
    # this is the coherent post-change implementation.
    if self._add_lease:
        # send an add-lease message in parallel. The results are handled
        # separately. This is sent before the slot_readv() so that we can
        # be sure the add_lease is retired by the time slot_readv comes
        # back (this relies upon our knowledge that the server code for
        # add_lease is synchronous).
        renew_secret = self._node.get_renewal_secret(peerid)
        cancel_secret = self._node.get_cancel_secret(peerid)
        d2 = ss.callRemote("add_lease", storage_index,
                           renew_secret, cancel_secret)
        # we ignore success
        d2.addErrback(self._add_lease_failed, peerid, storage_index)
    d = ss.callRemote("slot_readv", storage_index, shnums, readv)
    return d
def _got_results(self, datavs, peerid, readsize, stuff, started):
@ -756,6 +743,37 @@ class ServermapUpdater:
self._status.set_privkey_from(peerid)
def _add_lease_failed(self, f, peerid, storage_index):
    """Errback for the parallel add-lease message.

    Swallow known-harmless server-side failures and log everything else;
    never re-raise, so an add-lease problem can never disturb the
    servermap-update results (ticket #875).

    Older versions of Tahoe didn't handle the add-lease message very
    well: <=1.1.0 throws a NameError because it doesn't implement
    remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
    (which is most of them, since we send add-lease to everybody,
    before we know whether or not they have any shares for us), and
    1.2.0 throws KeyError even on known buckets due to an internal bug
    in the latency-measuring code. We want to ignore the known-harmless
    errors and log the others; in particular we want to log any local
    errors caused by coding problems.
    """
    if not f.check(RemoteException):
        # local errors are cause for alarm
        log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
                peerid=idlib.shortnodeid_b2a(peerid),
                f_value=str(f.value),
                failure=f,
                level=log.WEIRD, umid="ZWh6HA")
        return
    if f.value.failure.check(KeyError, IndexError, NameError):
        # this may ignore a bit too much, but that only hurts us
        # during debugging
        return
    # an unrecognized remote exception: log it (WEIRD triggers an Incident)
    self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
             peerid=idlib.shortnodeid_b2a(peerid),
             f_value=str(f.value),
             failure=f,
             level=log.WEIRD, umid="iqg3mw")
def _query_failed(self, f, peerid):
if not self._running:
return

View File

@ -4,6 +4,9 @@ from twisted.trial import unittest
from allmydata import check_results, uri
from allmydata.web import check_results as web_check_results
from allmydata.storage_client import StorageFarmBroker, NativeStorageClientDescriptor
from allmydata.monitor import Monitor
from allmydata.test.no_network import GridTestMixin
from allmydata.immutable.upload import Data
from common_web import WebRenderingMixin
class FakeClient:
@ -269,3 +272,49 @@ class WebResultsRendering(unittest.TestCase, WebRenderingMixin):
d.addCallback(_got_lit_results)
return d
class AddLease(GridTestMixin, unittest.TestCase):
    # test for #875, in which failures in the add-lease call cause
    # false-negatives in the checker
    def test_875(self):
        """Checker results must stay healthy even when the server's
        add_lease raises; the add-lease errback path is separate from
        the DYHB path, so its failures are logged but ignored."""
        self.basedir = "checker/AddLease/875"
        self.set_up_grid(num_servers=1)
        c0 = self.g.clients[0]
        self.uris = {}
        DATA = "data" * 100
        # stash one immutable and one mutable file to run the checker on
        d = c0.upload(Data(DATA, convergence=""))
        def _stash_immutable(ur):
            self.imm = c0.create_node_from_uri(ur.uri)
        d.addCallback(_stash_immutable)
        d.addCallback(lambda ign: c0.create_mutable_file("contents"))
        def _stash_mutable(node):
            self.mut = node
        d.addCallback(_stash_mutable)

        def _check_cr(cr, which):
            # 'which' labels the failing sub-case in the assertion message
            self.failUnless(cr.is_healthy(), which)

        # these two should work normally
        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "immutable-normal")
        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "mutable-normal")

        really_did_break = []
        # now break the server's remote_add_lease call
        def _break_add_lease(ign):
            def broken_add_lease(*args, **kwargs):
                # record the call so we can prove the breakage was exercised
                really_did_break.append(1)
                raise KeyError("intentional failure, should be ignored")
            assert self.g.servers_by_number[0].remote_add_lease
            self.g.servers_by_number[0].remote_add_lease = broken_add_lease
        d.addCallback(_break_add_lease)
        # and confirm that the files still look healthy
        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "mutable-broken")
        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "immutable-broken")
        # sanity-check that the broken add_lease was actually invoked
        d.addCallback(lambda ign: self.failUnless(really_did_break))
        return d