checker: don't let failures in add-lease affect checker results. Closes #875.

Mutable servermap updates and the immutable checker, when run with
add_lease=True, send both the do-you-have-block and add-lease commands in
parallel, to avoid an extra round trip time. Many older servers have problems
with add-lease and raise various exceptions, which don't generally matter.
The client-side code was catching+ignoring some of them, but unrecognized
exceptions were passed through to the DYHB code, concealing the DYHB results
from the checker, making it think the server had no shares.

The fix is to separate the code paths. Both commands are sent at the same
time, but the errback path from add-lease is handled separately. Known
exceptions are ignored, the others (both unknown-remote and all-local) are
logged (log.WEIRD, which will trigger an Incident), but neither will affect
the DYHB results.

The add-lease message is sent first, and we know that the server handles them
synchronously. So when the checker is done, we can be sure that all the
add-lease messages have been retired. This makes life easier for unit tests.
This commit is contained in:
Brian Warner 2009-12-29 15:01:08 -08:00
parent a50fdca6ab
commit 794e32738f
3 changed files with 122 additions and 42 deletions

View File

@ -71,33 +71,14 @@ class Checker(log.PrefixingLogMixin):
that we want to track and report whether or not each server
responded.)"""
d = server.callRemote("get_buckets", storageindex)
if self._add_lease:
renew_secret = self._get_renewal_secret(serverid)
cancel_secret = self._get_cancel_secret(serverid)
d2 = server.callRemote("add_lease", storageindex,
renew_secret, cancel_secret)
dl = defer.DeferredList([d, d2], consumeErrors=True)
def _done(res):
[(get_success, get_result),
(addlease_success, addlease_result)] = res
# ignore remote IndexError on the add_lease call. Propagate
# local errors and remote non-IndexErrors
if addlease_success:
return get_result
if not addlease_result.check(RemoteException):
# Propagate local errors
return addlease_result
if addlease_result.value.failure.check(IndexError):
# tahoe=1.3.0 raised IndexError on non-existant
# buckets, which we ignore
return get_result
# propagate remote errors that aren't IndexError, including
# the unfortunate internal KeyError bug that <1.3.0 had.
return addlease_result
dl.addCallback(_done)
d = dl
d2.addErrback(self._add_lease_failed, serverid, storageindex)
d = server.callRemote("get_buckets", storageindex)
def _wrap_results(res):
return (res, serverid, True)
@ -111,6 +92,38 @@ class Checker(log.PrefixingLogMixin):
d.addCallbacks(_wrap_results, _trap_errs)
return d
def _add_lease_failed(self, f, peerid, storage_index):
    """Errback for the parallel add-lease message.

    Swallow known-harmless server-side failures and log everything else;
    never re-raise, so an add-lease problem can never contaminate the
    DYHB/checker results (ticket #875).

    Older versions of Tahoe didn't handle the add-lease message very
    well: <=1.1.0 throws a NameError because it doesn't implement
    remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
    (which is most of them, since we send add-lease to everybody,
    before we know whether or not they have any shares for us), and
    1.2.0 throws KeyError even on known buckets due to an internal bug
    in the latency-measuring code. We want to ignore the known-harmless
    errors and log the others; in particular we want to log any local
    errors caused by coding problems.
    """
    if not f.check(RemoteException):
        # local errors are cause for alarm
        log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
                peerid=idlib.shortnodeid_b2a(peerid),
                f_value=str(f.value),
                failure=f,
                level=log.WEIRD, umid="hEGuQg")
        return
    if f.value.failure.check(KeyError, IndexError, NameError):
        # this may ignore a bit too much, but that only hurts us
        # during debugging
        return
    # an unrecognized remote exception: log it (WEIRD triggers an Incident)
    self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
             peerid=idlib.shortnodeid_b2a(peerid),
             f_value=str(f.value),
             failure=f,
             level=log.WEIRD, umid="atbAxw")
def _download_and_verify(self, serverid, sharenum, bucket):
"""Start an attempt to download and verify every block in this bucket
and return a deferred that will eventually fire once the attempt

View File

@ -536,32 +536,19 @@ class ServermapUpdater:
return d
def _do_read(self, ss, peerid, storage_index, shnums, readv):
    """Send slot_readv (and, when add-lease is enabled, a parallel
    add-lease message) to a single server.

    Returns only the slot_readv Deferred: the add-lease result is routed
    to _add_lease_failed and deliberately kept out of the readv path, so
    a buggy server's add-lease exception can never hide real DYHB/readv
    results from the servermap updater (ticket #875).
    """
    # NOTE(review): the rendered hunk had the removed and added lines
    # interleaved (two slot_readv calls, dead code after an early return);
    # this is the coherent post-change implementation.
    if self._add_lease:
        # send an add-lease message in parallel. The results are handled
        # separately. This is sent before the slot_readv() so that we can
        # be sure the add_lease is retired by the time slot_readv comes
        # back (this relies upon our knowledge that the server code for
        # add_lease is synchronous).
        renew_secret = self._node.get_renewal_secret(peerid)
        cancel_secret = self._node.get_cancel_secret(peerid)
        d2 = ss.callRemote("add_lease", storage_index,
                           renew_secret, cancel_secret)
        # we ignore success
        d2.addErrback(self._add_lease_failed, peerid, storage_index)
    d = ss.callRemote("slot_readv", storage_index, shnums, readv)
    return d
def _got_results(self, datavs, peerid, readsize, stuff, started):
@ -756,6 +743,37 @@ class ServermapUpdater:
self._status.set_privkey_from(peerid)
def _add_lease_failed(self, f, peerid, storage_index):
    """Errback for the parallel add-lease message.

    Swallow known-harmless server-side failures and log everything else;
    never re-raise, so an add-lease problem can never disturb the
    servermap-update results (ticket #875).

    Older versions of Tahoe didn't handle the add-lease message very
    well: <=1.1.0 throws a NameError because it doesn't implement
    remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
    (which is most of them, since we send add-lease to everybody,
    before we know whether or not they have any shares for us), and
    1.2.0 throws KeyError even on known buckets due to an internal bug
    in the latency-measuring code. We want to ignore the known-harmless
    errors and log the others; in particular we want to log any local
    errors caused by coding problems.
    """
    if not f.check(RemoteException):
        # local errors are cause for alarm
        log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
                peerid=idlib.shortnodeid_b2a(peerid),
                f_value=str(f.value),
                failure=f,
                level=log.WEIRD, umid="ZWh6HA")
        return
    if f.value.failure.check(KeyError, IndexError, NameError):
        # this may ignore a bit too much, but that only hurts us
        # during debugging
        return
    # an unrecognized remote exception: log it (WEIRD triggers an Incident)
    self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
             peerid=idlib.shortnodeid_b2a(peerid),
             f_value=str(f.value),
             failure=f,
             level=log.WEIRD, umid="iqg3mw")
def _query_failed(self, f, peerid):
if not self._running:
return

View File

@ -4,6 +4,9 @@ from twisted.trial import unittest
from allmydata import check_results, uri
from allmydata.web import check_results as web_check_results
from allmydata.storage_client import StorageFarmBroker, NativeStorageClientDescriptor
from allmydata.monitor import Monitor
from allmydata.test.no_network import GridTestMixin
from allmydata.immutable.upload import Data
from common_web import WebRenderingMixin
class FakeClient:
@ -269,3 +272,49 @@ class WebResultsRendering(unittest.TestCase, WebRenderingMixin):
d.addCallback(_got_lit_results)
return d
class AddLease(GridTestMixin, unittest.TestCase):
    # test for #875, in which failures in the add-lease call cause
    # false-negatives in the checker
    def test_875(self):
        """Checker results must stay healthy even when the server's
        add_lease raises; the add-lease errback path is separate from
        the DYHB path, so its failures are logged but ignored."""
        self.basedir = "checker/AddLease/875"
        self.set_up_grid(num_servers=1)
        c0 = self.g.clients[0]
        self.uris = {}
        DATA = "data" * 100
        # stash one immutable and one mutable file to run the checker on
        d = c0.upload(Data(DATA, convergence=""))
        def _stash_immutable(ur):
            self.imm = c0.create_node_from_uri(ur.uri)
        d.addCallback(_stash_immutable)
        d.addCallback(lambda ign: c0.create_mutable_file("contents"))
        def _stash_mutable(node):
            self.mut = node
        d.addCallback(_stash_mutable)

        def _check_cr(cr, which):
            # 'which' labels the failing sub-case in the assertion message
            self.failUnless(cr.is_healthy(), which)

        # these two should work normally
        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "immutable-normal")
        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "mutable-normal")

        really_did_break = []
        # now break the server's remote_add_lease call
        def _break_add_lease(ign):
            def broken_add_lease(*args, **kwargs):
                # record the call so we can prove the breakage was exercised
                really_did_break.append(1)
                raise KeyError("intentional failure, should be ignored")
            assert self.g.servers_by_number[0].remote_add_lease
            self.g.servers_by_number[0].remote_add_lease = broken_add_lease
        d.addCallback(_break_add_lease)
        # and confirm that the files still look healthy
        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "mutable-broken")
        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "immutable-broken")
        # sanity-check that the broken add_lease was actually invoked
        d.addCallback(lambda ign: self.failUnless(really_did_break))
        return d