mirror of https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2024-12-19 13:07:56 +00:00
immutable repairer: populate servers-responding properly
If a server did not respond to the pre-repair filecheck, but did respond to the repair, that server was not correctly added to the RepairResults.data["servers-responding"] list. (This resulted from a buggy usage of DictOfSets.union() in filenode.py.) In addition, servers to which filecheck queries were sent, but which did not respond, were incorrectly added to the servers-responding list anyways. (This resulted from code in checker.py not paying attention to the 'responded' flag.)

The first bug was neatly masked by the second: it's pretty rare for a server to suddenly start responding in the one-second window between a filecheck and a subsequent repair, and if the server was around for the filecheck, you'd never notice the problem. I only spotted the smelly code while changing it for IServer cleanup purposes.

I added coverage to test_repairer.py for this. Trying to get that test to fail before fixing the first bug is what led me to discover the second bug. I also had to update test_corrupt_file_verno, since it was incorrectly asserting that 10 servers responded, when in fact one of them throws an error (but the second bug was causing it to be reported as responding anyways).
This commit is contained in:
parent 5ec20761ed
commit 9acf5beebd
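The first bug comes down to a standard Python set pitfall: set.union() returns a new set and leaves its receiver untouched, so calling it purely for its side effect does nothing. A minimal standalone sketch (illustrative names, not Tahoe code):

servers_responding = set(["srv-1", "srv-2"])
servers_responding.union(["srv-3"])       # result silently discarded -- no effect
assert "srv-3" not in servers_responding

servers_responding.update(["srv-3"])      # update() mutates in place: the fix
assert "srv-3" in servers_responding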
src/allmydata/immutable/checker.py

@@ -747,6 +747,7 @@ class Checker(log.PrefixingLogMixin):
         servers = {} # {serverid: set(sharenums)}
         corruptshare_locators = [] # (serverid, storageindex, sharenum)
         incompatibleshare_locators = [] # (serverid, storageindex, sharenum)
+        servers_responding = set() # serverid

         for verified, server, corrupt, incompatible, responded in results:
             server_id = server.get_serverid()
@@ -757,6 +758,8 @@ class Checker(log.PrefixingLogMixin):
                 corruptshare_locators.append((server_id, SI, sharenum))
             for sharenum in incompatible:
                 incompatibleshare_locators.append((server_id, SI, sharenum))
+            if responded:
+                servers_responding.add(server_id)

         d['count-shares-good'] = len(verifiedshares)
         d['count-good-share-hosts'] = len([s for s in servers.keys() if servers[s]])
@@ -780,7 +783,7 @@ class Checker(log.PrefixingLogMixin):
             d['count-recoverable-versions'] = 0
             d['count-unrecoverable-versions'] = 1

-        d['servers-responding'] = list(servers)
+        d['servers-responding'] = list(servers_responding)
         d['sharemap'] = verifiedshares
         # no such thing as wrong shares of an immutable file
         d['count-wrong-shares'] = 0
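The checker.py hunks above add a servers_responding set that is populated only when the per-server 'responded' flag is true, rather than listing every server that was merely sent a query. A hedged sketch of the same tally with made-up result tuples (the real code carries IServer objects; plain ids stand in here):

results = [
    # (verified shares, server id, corrupt, incompatible, responded)
    ({0, 1}, "srv-a", [], [], True),
    (set(),  "srv-b", [], [], False),   # query sent, but the server never answered
]
servers_responding = set()
for verified, server_id, corrupt, incompatible, responded in results:
    if responded:                        # only count servers that actually replied
        servers_responding.add(server_id)
assert servers_responding == {"srv-a"}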
src/allmydata/immutable/filenode.py

@@ -119,8 +119,10 @@ class CiphertextFileNode:
                 assert isinstance(sm, DictOfSets), sm
                 sm.update(ur.sharemap)
                 servers_responding = set(prr.data['servers-responding'])
-                servers_responding.union(ur.sharemap.iterkeys())
-                prr.data['servers-responding'] = list(servers_responding)
+                for shnum, serverids in ur.sharemap.items():
+                    servers_responding.update(serverids)
+                servers_responding = sorted(servers_responding)
+                prr.data['servers-responding'] = servers_responding
                 prr.data['count-shares-good'] = len(sm)
                 good_hosts = len(reduce(set.union, sm.itervalues(), set()))
                 prr.data['count-good-share-hosts'] = good_hosts
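The filenode.py hunk replaces the discarded union() with an in-place merge: every server holding a share in the repair's sharemap must have responded to the repair, so it is folded into the pre-repair responders. A small sketch under assumed data shapes (the sharemap maps share numbers to sets of server ids; the ids are illustrative):

pre_repair_responding = ["srv-a", "srv-b"]       # srv-c missed the filecheck
repair_sharemap = {0: {"srv-b"}, 1: {"srv-c"}}   # but the repair placed a share on it

servers_responding = set(pre_repair_responding)
for shnum, serverids in repair_sharemap.items():
    servers_responding.update(serverids)          # update(), not union()
assert sorted(servers_responding) == ["srv-a", "srv-b", "srv-c"]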
src/allmydata/test/no_network.py

@@ -67,6 +67,8 @@ class LocalWrapper:

         def _call():
             if self.broken:
+                if self.broken is not True: # a counter, not boolean
+                    self.broken -= 1
                 raise IntentionalError("I was asked to break")
             if self.hung_until:
                 d2 = defer.Deferred()
@@ -299,10 +301,11 @@ class NoNetworkGrid(service.MultiService):
         self.rebuild_serverlist()
         return ss

-    def break_server(self, serverid):
+    def break_server(self, serverid, count=True):
         # mark the given server as broken, so it will throw exceptions when
-        # asked to hold a share or serve a share
-        self.wrappers_by_id[serverid].broken = True
+        # asked to hold a share or serve a share. If count= is a number,
+        # throw that many exceptions before starting to work again.
+        self.wrappers_by_id[serverid].broken = count

     def hang_server(self, serverid):
         # hang the given server
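break_server(serverid, count=N) turns the broken flag into a countdown, so a test can make a server fail a fixed number of calls and then recover -- exactly what the new test needs (fail the filecheck, answer the repair). A self-contained sketch of the same pattern (FlakyServer is a made-up stand-in, not Tahoe's LocalWrapper):

class FlakyServer:
    # 'broken' is either a boolean (True = broken forever) or an int
    # countdown: fail that many calls, then start working again.
    def __init__(self, broken=False):
        self.broken = broken
    def call(self):
        if self.broken:
            if self.broken is not True:   # a counter, not a boolean
                self.broken -= 1
            raise RuntimeError("I was asked to break")
        return "ok"

s = FlakyServer(broken=1)                 # fail exactly one call
try:
    s.call()
except RuntimeError:
    pass
assert s.call() == "ok"                   # recovered after one failure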
src/allmydata/test/test_repairer.py

@@ -170,7 +170,7 @@ class Verifier(GridTestMixin, unittest.TestCase, RepairTestMixin):
         self.failUnless(data['count-shares-needed'] == 3, data)
         self.failUnless(data['count-shares-expected'] == 10, data)
         self.failUnless(data['count-good-share-hosts'] == 9, data)
-        self.failUnless(len(data['servers-responding']) == 10, data)
+        self.failUnless(len(data['servers-responding']) == 9, data)
         self.failUnless(len(data['list-corrupt-shares']) == 0, data)

     def test_corrupt_file_verno(self):
@@ -706,6 +706,27 @@ class Repairer(GridTestMixin, unittest.TestCase, RepairTestMixin,
         d.addCallback(_check)
         return d

+    def test_servers_responding(self):
+        self.basedir = "repairer/Repairer/servers_responding"
+        self.set_up_grid(num_clients=2)
+        d = self.upload_and_stash()
+        # now cause one of the servers to not respond during the pre-repair
+        # filecheck, but then *do* respond to the post-repair filecheck
+        def _then(ign):
+            ss = self.g.servers_by_number[0]
+            self.g.break_server(ss.my_nodeid, count=1)
+            self.delete_shares_numbered(self.uri, [9])
+            return self.c0_filenode.check_and_repair(Monitor())
+        d.addCallback(_then)
+        def _check(rr):
+            # this exercises a bug in which the servers-responding list did
+            # not include servers that responded to the Repair, but which did
+            # not respond to the pre-repair filecheck
+            prr = rr.get_post_repair_results()
+            expected = set(self.g.get_all_serverids())
+            self.failUnlessEqual(expected, set(prr.data["servers-responding"]))
+        d.addCallback(_check)
+        return d

     # XXX extend these tests to show that the checker detects which specific
     # share on which specific server is broken -- this is necessary so that the