diff --git a/src/allmydata/immutable/checker.py b/src/allmydata/immutable/checker.py
index 6e4b89e05..a4c95afb3 100644
--- a/src/allmydata/immutable/checker.py
+++ b/src/allmydata/immutable/checker.py
@@ -71,33 +71,14 @@ class Checker(log.PrefixingLogMixin):
         that we want to track and report whether or not each server
         responded.)"""
 
-        d = server.callRemote("get_buckets", storageindex)
         if self._add_lease:
             renew_secret = self._get_renewal_secret(serverid)
             cancel_secret = self._get_cancel_secret(serverid)
             d2 = server.callRemote("add_lease", storageindex,
                                    renew_secret, cancel_secret)
-            dl = defer.DeferredList([d, d2], consumeErrors=True)
-            def _done(res):
-                [(get_success, get_result),
-                 (addlease_success, addlease_result)] = res
-                # ignore remote IndexError on the add_lease call. Propagate
-                # local errors and remote non-IndexErrors
-                if addlease_success:
-                    return get_result
-                if not addlease_result.check(RemoteException):
-                    # Propagate local errors
-                    return addlease_result
-                if addlease_result.value.failure.check(IndexError):
-                    # tahoe=1.3.0 raised IndexError on non-existant
-                    # buckets, which we ignore
-                    return get_result
-                # propagate remote errors that aren't IndexError, including
-                # the unfortunate internal KeyError bug that <1.3.0 had.
-                return addlease_result
-            dl.addCallback(_done)
-            d = dl
+            d2.addErrback(self._add_lease_failed, serverid, storageindex)
+        d = server.callRemote("get_buckets", storageindex)
 
         def _wrap_results(res):
             return (res, serverid, True)
 
@@ -111,6 +92,38 @@ class Checker(log.PrefixingLogMixin):
         d.addCallbacks(_wrap_results, _trap_errs)
         return d
 
+    def _add_lease_failed(self, f, peerid, storage_index):
+        # Older versions of Tahoe didn't handle the add-lease message very
+        # well: <=1.1.0 throws a NameError because it doesn't implement
+        # remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
+        # (which is most of them, since we send add-lease to everybody,
+        # before we know whether or not they have any shares for us), and
+        # 1.2.0 throws KeyError even on known buckets due to an internal bug
+        # in the latency-measuring code.
+
+        # we want to ignore the known-harmless errors and log the others. In
+        # particular we want to log any local errors caused by coding
+        # problems.
+
+        if f.check(RemoteException):
+            if f.value.failure.check(KeyError, IndexError, NameError):
+                # this may ignore a bit too much, but that only hurts us
+                # during debugging
+                return
+            self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
+                     peerid=idlib.shortnodeid_b2a(peerid),
+                     f_value=str(f.value),
+                     failure=f,
+                     level=log.WEIRD, umid="atbAxw")
+            return
+        # local errors are cause for alarm
+        log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
+                peerid=idlib.shortnodeid_b2a(peerid),
+                f_value=str(f.value),
+                failure=f,
+                level=log.WEIRD, umid="hEGuQg")
+
+
     def _download_and_verify(self, serverid, sharenum, bucket):
         """Start an attempt to download and verify every block in this bucket
         and return a deferred that will eventually fire once the attempt
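The shape of this change is easy to miss in the hunk: the old code lashed the get_buckets and add_lease Deferreds together with a DeferredList, so a mishandled add_lease failure could surface as a checker failure (the false negatives of #875). The new code gives the add_lease Deferred a private errback and never lets it touch d. Below is a minimal, self-contained sketch of that fire-and-forget pattern, using hypothetical fake_* stand-ins for the real callRemote calls:

```python
from twisted.internet import defer

def fake_add_lease():
    # stands in for server.callRemote("add_lease", ...); fails the way a
    # buggy pre-1.3.1 server would
    return defer.fail(KeyError("intentional failure"))

def fake_get_buckets():
    # stands in for server.callRemote("get_buckets", ...)
    return defer.succeed({0: "bucket"})

def query(add_lease=True):
    if add_lease:
        d2 = fake_add_lease()
        # the real code logs unexpected failures here; the point is that
        # d2's failure is consumed and can never reach the caller
        d2.addErrback(lambda f: None)
    return fake_get_buckets()  # only this Deferred drives checker results

def _check(buckets):
    assert buckets == {0: "bucket"}  # the add_lease failure did not propagate

d = query()
d.addCallback(_check)
```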
diff --git a/src/allmydata/mutable/servermap.py b/src/allmydata/mutable/servermap.py
index 740eb378a..716b1b477 100644
--- a/src/allmydata/mutable/servermap.py
+++ b/src/allmydata/mutable/servermap.py
@@ -536,32 +536,19 @@ class ServermapUpdater:
         return d
 
     def _do_read(self, ss, peerid, storage_index, shnums, readv):
-        d = ss.callRemote("slot_readv", storage_index, shnums, readv)
         if self._add_lease:
+            # send an add-lease message in parallel. The results are handled
+            # separately. This is sent before the slot_readv() so that we can
+            # be sure the add_lease is retired by the time slot_readv comes
+            # back (this relies upon our knowledge that the server code for
+            # add_lease is synchronous).
             renew_secret = self._node.get_renewal_secret(peerid)
             cancel_secret = self._node.get_cancel_secret(peerid)
             d2 = ss.callRemote("add_lease", storage_index,
                                renew_secret, cancel_secret)
-            dl = defer.DeferredList([d, d2], consumeErrors=True)
-            def _done(res):
-                [(readv_success, readv_result),
-                 (addlease_success, addlease_result)] = res
-                # ignore remote IndexError on the add_lease call. Propagate
-                # local errors and remote non-IndexErrors
-                if addlease_success:
-                    return readv_result
-                if not addlease_result.check(RemoteException):
-                    # Propagate local errors
-                    return addlease_result
-                if addlease_result.value.failure.check(IndexError):
-                    # tahoe=1.3.0 raised IndexError on non-existant
-                    # buckets, which we ignore
-                    return readv_result
-                # propagate remote errors that aren't IndexError, including
-                # the unfortunate internal KeyError bug that <1.3.0 had.
-                return addlease_result
-            dl.addCallback(_done)
-            return dl
+            # we ignore success
+            d2.addErrback(self._add_lease_failed, peerid, storage_index)
+        d = ss.callRemote("slot_readv", storage_index, shnums, readv)
         return d
 
     def _got_results(self, datavs, peerid, readsize, stuff, started):
@@ -756,6 +743,37 @@ class ServermapUpdater:
 
         self._status.set_privkey_from(peerid)
 
+    def _add_lease_failed(self, f, peerid, storage_index):
+        # Older versions of Tahoe didn't handle the add-lease message very
+        # well: <=1.1.0 throws a NameError because it doesn't implement
+        # remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
+        # (which is most of them, since we send add-lease to everybody,
+        # before we know whether or not they have any shares for us), and
+        # 1.2.0 throws KeyError even on known buckets due to an internal bug
+        # in the latency-measuring code.
+
+        # we want to ignore the known-harmless errors and log the others. In
+        # particular we want to log any local errors caused by coding
+        # problems.
+
+        if f.check(RemoteException):
+            if f.value.failure.check(KeyError, IndexError, NameError):
+                # this may ignore a bit too much, but that only hurts us
+                # during debugging
+                return
+            self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
+                     peerid=idlib.shortnodeid_b2a(peerid),
+                     f_value=str(f.value),
+                     failure=f,
+                     level=log.WEIRD, umid="iqg3mw")
+            return
+        # local errors are cause for alarm
+        log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
+                peerid=idlib.shortnodeid_b2a(peerid),
+                f_value=str(f.value),
+                failure=f,
+                level=log.WEIRD, umid="ZWh6HA")
+
     def _query_failed(self, f, peerid):
         if not self._running:
             return
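This _add_lease_failed and the checker's copy apply the same triage rule and differ only in log prefix and umid: unwrap a foolscap RemoteException and swallow the remote KeyError/IndexError/NameError that old servers are known to raise; log any other remote error; treat every local error as an alarm. Here is the rule distilled into a standalone predicate (a hypothetical helper, not part of the patch; it assumes RemoteException is importable from foolscap.api as in the modules above):

```python
from foolscap.api import RemoteException  # assumed import path

def is_harmless_add_lease_failure(f):
    """Return True if the Failure `f` is a known-harmless remote error
    from an old (pre-1.3.1) server, False for anything worth logging."""
    if f.check(RemoteException):
        # a RemoteException carries the server-side Failure in .value.failure
        return bool(f.value.failure.check(KeyError, IndexError, NameError))
    # local errors always indicate a coding problem on our side
    return False
```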
diff --git a/src/allmydata/test/test_checker.py b/src/allmydata/test/test_checker.py
index e52e59b20..21c70ba1a 100644
--- a/src/allmydata/test/test_checker.py
+++ b/src/allmydata/test/test_checker.py
@@ -4,6 +4,9 @@ from twisted.trial import unittest
 from allmydata import check_results, uri
 from allmydata.web import check_results as web_check_results
 from allmydata.storage_client import StorageFarmBroker, NativeStorageClientDescriptor
+from allmydata.monitor import Monitor
+from allmydata.test.no_network import GridTestMixin
+from allmydata.immutable.upload import Data
 from common_web import WebRenderingMixin
 
 class FakeClient:
@@ -269,3 +272,49 @@ class WebResultsRendering(unittest.TestCase, WebRenderingMixin):
         d.addCallback(_got_lit_results)
         return d
 
+class AddLease(GridTestMixin, unittest.TestCase):
+    # test for #875, in which failures in the add-lease call cause
+    # false-negatives in the checker
+
+    def test_875(self):
+        self.basedir = "checker/AddLease/875"
+        self.set_up_grid(num_servers=1)
+        c0 = self.g.clients[0]
+        self.uris = {}
+        DATA = "data" * 100
+        d = c0.upload(Data(DATA, convergence=""))
+        def _stash_immutable(ur):
+            self.imm = c0.create_node_from_uri(ur.uri)
+        d.addCallback(_stash_immutable)
+        d.addCallback(lambda ign: c0.create_mutable_file("contents"))
+        def _stash_mutable(node):
+            self.mut = node
+        d.addCallback(_stash_mutable)
+
+        def _check_cr(cr, which):
+            self.failUnless(cr.is_healthy(), which)
+
+        # these two should work normally
+        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
+        d.addCallback(_check_cr, "immutable-normal")
+        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
+        d.addCallback(_check_cr, "mutable-normal")
+
+        really_did_break = []
+        # now break the server's remote_add_lease call
+        def _break_add_lease(ign):
+            def broken_add_lease(*args, **kwargs):
+                really_did_break.append(1)
+                raise KeyError("intentional failure, should be ignored")
+            assert self.g.servers_by_number[0].remote_add_lease
+            self.g.servers_by_number[0].remote_add_lease = broken_add_lease
+        d.addCallback(_break_add_lease)
+
+        # and confirm that the files still look healthy
+        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
+        d.addCallback(_check_cr, "mutable-broken")
+        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
+        d.addCallback(_check_cr, "immutable-broken")
+
+        d.addCallback(lambda ign: self.failUnless(really_did_break))
+        return d
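One detail of the test worth noting: the really_did_break sentinel keeps it honest. If a later refactoring stopped sending add-lease messages entirely, the monkey-patched method would never run, the list would stay empty, and the final failUnless would flag it; without the sentinel the test would pass vacuously. The trick in isolation (hypothetical names, independent of the no_network grid):

```python
# sentinel list, mutated by the patched method and checked at the end
calls = []

def broken_add_lease(*args, **kwargs):
    calls.append(1)  # prove the failure path actually ran
    raise KeyError("intentional failure, should be ignored")

class FakeServer(object):
    def remote_add_lease(self, storage_index, renew_secret, cancel_secret):
        pass  # normally grants a lease; irrelevant once patched

server = FakeServer()
server.remote_add_lease = broken_add_lease  # patch the instance, not the class

try:
    server.remote_add_lease("si", "renew", "cancel")
except KeyError:
    pass  # the code under test is expected to swallow this

assert calls, "broken_add_lease was never invoked"
```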