immutable repairer: populate servers-responding properly

If a server did not respond to the pre-repair filecheck, but did respond
to the repair, that server was not correctly added to the
RepairResults.data["servers-responding"] list. (This resulted from a
buggy usage of DictOfSets.union() in filenode.py).

In addition, servers to which filecheck queries were sent, but did not
respond, were incorrectly added to the servers-responding list
anyways. (This resulted from code in checker.py not paying attention
to the 'responded' flag).

The first bug was neatly masked by the second: it's pretty rare to have
a server suddenly start responding in the one-second window between a
filecheck and a subsequent repair, and if the server was around for the
filecheck, you'd never notice the problem. I only spotted the smelly
code while I was changing it for IServer cleanup purposes.

I added coverage to test_repairer.py for this. Trying to get that test
to fail before fixing the first bug is what led me to discover the
second bug. I also had to update test_corrupt_file_verno, since it was
incorrectly asserting that 10 servers responded, when in fact one of
them throws an error (but the second bug was causing it to be reported
anyways).
This commit is contained in:
Brian Warner 2012-05-16 16:50:43 -07:00
parent 5ec20761ed
commit 9acf5beebd
4 changed files with 36 additions and 7 deletions

View File

@ -747,6 +747,7 @@ class Checker(log.PrefixingLogMixin):
servers = {} # {serverid: set(sharenums)}
corruptshare_locators = [] # (serverid, storageindex, sharenum)
incompatibleshare_locators = [] # (serverid, storageindex, sharenum)
servers_responding = set() # serverid
for verified, server, corrupt, incompatible, responded in results:
server_id = server.get_serverid()
@ -757,6 +758,8 @@ class Checker(log.PrefixingLogMixin):
corruptshare_locators.append((server_id, SI, sharenum))
for sharenum in incompatible:
incompatibleshare_locators.append((server_id, SI, sharenum))
if responded:
servers_responding.add(server_id)
d['count-shares-good'] = len(verifiedshares)
d['count-good-share-hosts'] = len([s for s in servers.keys() if servers[s]])
@ -780,7 +783,7 @@ class Checker(log.PrefixingLogMixin):
d['count-recoverable-versions'] = 0
d['count-unrecoverable-versions'] = 1
d['servers-responding'] = list(servers)
d['servers-responding'] = list(servers_responding)
d['sharemap'] = verifiedshares
# no such thing as wrong shares of an immutable file
d['count-wrong-shares'] = 0

View File

@ -119,8 +119,10 @@ class CiphertextFileNode:
assert isinstance(sm, DictOfSets), sm
sm.update(ur.sharemap)
servers_responding = set(prr.data['servers-responding'])
servers_responding.union(ur.sharemap.iterkeys())
prr.data['servers-responding'] = list(servers_responding)
for shnum, serverids in ur.sharemap.items():
servers_responding.update(serverids)
servers_responding = sorted(servers_responding)
prr.data['servers-responding'] = servers_responding
prr.data['count-shares-good'] = len(sm)
good_hosts = len(reduce(set.union, sm.itervalues(), set()))
prr.data['count-good-share-hosts'] = good_hosts

View File

@ -67,6 +67,8 @@ class LocalWrapper:
def _call():
if self.broken:
if self.broken is not True: # a counter, not boolean
self.broken -= 1
raise IntentionalError("I was asked to break")
if self.hung_until:
d2 = defer.Deferred()
@ -299,10 +301,11 @@ class NoNetworkGrid(service.MultiService):
self.rebuild_serverlist()
return ss
def break_server(self, serverid):
def break_server(self, serverid, count=True):
# mark the given server as broken, so it will throw exceptions when
# asked to hold a share or serve a share
self.wrappers_by_id[serverid].broken = True
# asked to hold a share or serve a share. If count= is a number,
# throw that many exceptions before starting to work again.
self.wrappers_by_id[serverid].broken = count
def hang_server(self, serverid):
# hang the given server

View File

@ -170,7 +170,7 @@ class Verifier(GridTestMixin, unittest.TestCase, RepairTestMixin):
self.failUnless(data['count-shares-needed'] == 3, data)
self.failUnless(data['count-shares-expected'] == 10, data)
self.failUnless(data['count-good-share-hosts'] == 9, data)
self.failUnless(len(data['servers-responding']) == 10, data)
self.failUnless(len(data['servers-responding']) == 9, data)
self.failUnless(len(data['list-corrupt-shares']) == 0, data)
def test_corrupt_file_verno(self):
@ -706,6 +706,27 @@ class Repairer(GridTestMixin, unittest.TestCase, RepairTestMixin,
d.addCallback(_check)
return d
def test_servers_responding(self):
    """Regression test: a server that misses the pre-repair filecheck but
    answers the subsequent repair must still show up in the post-repair
    servers-responding list.
    """
    self.basedir = "repairer/Repairer/servers_responding"
    self.set_up_grid(num_clients=2)
    d = self.upload_and_stash()

    def _break_one_server(ignored):
        # Make server #0 throw exactly one error: it will refuse the
        # pre-repair filecheck, then recover in time to serve the repair.
        first_server = self.g.servers_by_number[0]
        self.g.break_server(first_server.my_nodeid, count=1)
        self.delete_shares_numbered(self.uri, [9])
        return self.c0_filenode.check_and_repair(Monitor())
    d.addCallback(_break_one_server)

    def _verify(crr):
        # Every server answered either the filecheck or the repair, so
        # all of them belong in the servers-responding list.
        post = crr.get_post_repair_results()
        responding = set(post.data["servers-responding"])
        self.failUnlessEqual(set(self.g.get_all_serverids()), responding)
    d.addCallback(_verify)
    return d
# XXX extend these tests to show that the checker detects which specific
# share on which specific server is broken -- this is necessary so that the