mirror of
https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2024-12-20 05:28:04 +00:00
hash trees: further cleanup, to make sure we're validating the right thing
hashtree.py: improve the methods available for finding out which hash nodes are needed. Change set_hashes() to require that every hash provided can be validated up to the root. download.py: validate from the top down, including the URI-derived roothash in the share hash tree, and stashing the thus-validated share hash for use in the block hash tree.
This commit is contained in:
parent
d351cd7674
commit
30133a7cdf
@ -48,48 +48,56 @@ class Output:
|
||||
return self.downloadable.finish()
|
||||
|
||||
class ValidatedBucket:
|
||||
def __init__(self, sharenum, bucket, share_hash_tree, num_blocks):
|
||||
def __init__(self, sharenum, bucket,
|
||||
share_hash_tree, roothash,
|
||||
num_blocks):
|
||||
self.sharenum = sharenum
|
||||
self.bucket = bucket
|
||||
self._share_hash = None # None means not validated yet
|
||||
self.share_hash_tree = share_hash_tree
|
||||
self._roothash = roothash
|
||||
self.block_hash_tree = hashtree.IncompleteHashTree(num_blocks)
|
||||
|
||||
def get_block(self, blocknum):
|
||||
d1 = self.bucket.callRemote('get_block', blocknum)
|
||||
# we might also need to grab some elements of our block hash tree, to
|
||||
# the first time we use this bucket, we need to fetch enough elements
|
||||
# of the share hash tree to validate it from our share hash up to the
|
||||
# hashroot.
|
||||
if not self._share_hash:
|
||||
d1 = self.bucket.callRemote('get_share_hashes')
|
||||
else:
|
||||
d1 = defer.succeed(None)
|
||||
|
||||
# we might need to grab some elements of our block hash tree, to
|
||||
# validate the requested block up to the share hash
|
||||
if self.block_hash_tree.needed_hashes(leaves=[blocknum]):
|
||||
needed = self.block_hash_tree.needed_hashes(blocknum)
|
||||
if needed:
|
||||
# TODO: get fewer hashes, callRemote('get_block_hashes', needed)
|
||||
d2 = self.bucket.callRemote('get_block_hashes')
|
||||
else:
|
||||
d2 = defer.succeed(None)
|
||||
# we might need to grab some elements of the share hash tree to
|
||||
# validate from our share hash up to the hashroot
|
||||
if self.share_hash_tree.needed_hashes(leaves=[self.sharenum]):
|
||||
d3 = self.bucket.callRemote('get_share_hashes')
|
||||
need_to_validate_sharehash = True
|
||||
else:
|
||||
d3 = defer.succeed(None)
|
||||
need_to_validate_sharehash = False
|
||||
d2 = defer.succeed([])
|
||||
|
||||
d3 = self.bucket.callRemote('get_block', blocknum)
|
||||
|
||||
d = defer.gatherResults([d1, d2, d3])
|
||||
d.addCallback(self._got_data, blocknum, need_to_validate_sharehash)
|
||||
d.addCallback(self._got_data, blocknum)
|
||||
return d
|
||||
|
||||
def _got_data(self, res, blocknum, need_to_validate_sharehash):
|
||||
blockdata, blockhashes, sharehashes = res
|
||||
def _got_data(self, res, blocknum):
|
||||
sharehashes, blockhashes, blockdata = res
|
||||
|
||||
if not self._share_hash:
|
||||
sh = dict(sharehashes)
|
||||
sh[0] = self._roothash # always use our own root, from the URI
|
||||
if self.share_hash_tree.get_leaf_index(self.sharenum) not in sh:
|
||||
raise hashutil.NotEnoughHashesError
|
||||
self.share_hash_tree.set_hashes(sh)
|
||||
self._share_hash = self.share_hash_tree.get_leaf(self.sharenum)
|
||||
|
||||
blockhash = hashutil.tagged_hash("encoded subshare", blockdata)
|
||||
# we always validate the blockhash
|
||||
if blockhashes is None:
|
||||
blockhashes = []
|
||||
bh = dict(enumerate(blockhashes))
|
||||
self.block_hash_tree.set_hashes(bh, {blocknum: blockhash},
|
||||
must_validate=True)
|
||||
if need_to_validate_sharehash:
|
||||
# we only need to validate the sharehash once, the first time we
|
||||
# fetch a block
|
||||
sh = dict(sharehashes)
|
||||
sharehash = self.block_hash_tree[0]
|
||||
self.share_hash_tree.set_hashes(sh, {self.sharenum: sharehash},
|
||||
must_validate=True)
|
||||
bh[0] = self._share_hash # replace blockhash root with validated value
|
||||
self.block_hash_tree.set_hashes(bh, {blocknum: blockhash})
|
||||
# If we made it here, the block is good. If the hash trees didn't
|
||||
# like what they saw, they would have raised a BadHashError, causing
|
||||
# our caller to see a Failure and thus ignore this block (as well as
|
||||
@ -237,6 +245,7 @@ class FileDownloader:
|
||||
def add_share_bucket(self, sharenum, bucket):
|
||||
vbucket = ValidatedBucket(sharenum, bucket,
|
||||
self._share_hashtree,
|
||||
self._roothash,
|
||||
self._total_segments)
|
||||
self._share_buckets.setdefault(sharenum, set()).add(vbucket)
|
||||
|
||||
|
@ -252,6 +252,10 @@ class Encoder(object):
|
||||
return sh.callRemote("put_block_hashes", all_hashes)
|
||||
|
||||
def send_all_share_hash_trees(self):
|
||||
# each bucket gets a set of share hash tree nodes that are needed to
|
||||
# validate their share. This includes the share hash itself, but does
|
||||
# not include the top-level hash root (which is stored securely in
|
||||
# the URI instead).
|
||||
log.msg("%s sending all share hash trees" % self)
|
||||
dl = []
|
||||
for h in self.share_root_hashes:
|
||||
@ -264,9 +268,7 @@ class Encoder(object):
|
||||
for i in range(self.num_shares):
|
||||
# the HashTree is given a list of leaves: 0,1,2,3..n .
|
||||
# These become nodes A+0,A+1,A+2.. of the tree, where A=n-1
|
||||
tree_width = roundup_pow2(self.num_shares)
|
||||
base_index = i + tree_width - 1
|
||||
needed_hash_indices = t.needed_for(base_index)
|
||||
needed_hash_indices = t.needed_hashes(i, include_leaf=True)
|
||||
hashes = [(hi, t[hi]) for hi in needed_hash_indices]
|
||||
dl.append(self.send_one_share_hash_tree(i, hashes))
|
||||
return defer.DeferredList(dl)
|
||||
|
@ -156,6 +156,12 @@ class CompleteBinaryTreeMixin:
|
||||
idlib.b2a_or_none(self[i])))
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
def get_leaf_index(self, leafnum):
|
||||
return self.first_leaf_num + leafnum
|
||||
|
||||
def get_leaf(self, leafnum):
|
||||
return self[self.first_leaf_num + leafnum]
|
||||
|
||||
def empty_leaf_hash(i):
|
||||
return tagged_hash('Merkle tree empty leaf', "%d" % i)
|
||||
def pair_hash(a, b):
|
||||
@ -193,6 +199,7 @@ class HashTree(CompleteBinaryTreeMixin, list):
|
||||
# Augment the list.
|
||||
start = len(L)
|
||||
end = roundup_pow2(len(L))
|
||||
self.first_leaf_num = end - 1
|
||||
L = L + [None] * (end - start)
|
||||
for i in range(start, end):
|
||||
L[i] = empty_leaf_hash(i)
|
||||
@ -206,6 +213,36 @@ class HashTree(CompleteBinaryTreeMixin, list):
|
||||
rows.reverse()
|
||||
self[:] = sum(rows, [])
|
||||
|
||||
def needed_hashes(self, leafnum, include_leaf=False):
|
||||
"""Which hashes will someone need to validate a given data block?
|
||||
|
||||
I am used to answer a question: supposing you have the data block
|
||||
that is used to form leaf hash N, and you want to validate that it,
|
||||
which hashes would you need?
|
||||
|
||||
I accept a leaf number and return a set of 'hash index' values, which
|
||||
are integers from 0 to len(self). In the 'hash index' number space,
|
||||
hash[0] is the root hash, while hash[len(self)-1] is the last leaf
|
||||
hash.
|
||||
|
||||
This method can be used to find out which hashes you should request
|
||||
from some untrusted source (usually the same source that provides the
|
||||
data block), so you can minimize storage or transmission overhead. It
|
||||
can also be used to determine which hashes you should send to a
|
||||
remote data store so that it will be able to provide validatable data
|
||||
in the future.
|
||||
|
||||
I will not include '0' (the root hash) in the result, since the root
|
||||
is generally stored somewhere that is more trusted than the source of
|
||||
the remaining hashes. I will include the leaf hash itself only if you
|
||||
ask me to, by passing include_leaf=True.
|
||||
"""
|
||||
|
||||
needed = set(self.needed_for(self.first_leaf_num + leafnum))
|
||||
if include_leaf:
|
||||
needed.add(self.first_leaf_num + leafnum)
|
||||
return needed
|
||||
|
||||
|
||||
class NotEnoughHashesError(Exception):
|
||||
pass
|
||||
@ -250,18 +287,24 @@ class IncompleteHashTree(CompleteBinaryTreeMixin, list):
|
||||
rows.reverse()
|
||||
self[:] = sum(rows, [])
|
||||
|
||||
def needed_hashes(self, hashes=[], leaves=[]):
|
||||
hashnums = set(list(hashes))
|
||||
for leafnum in leaves:
|
||||
hashnums.add(self.first_leaf_num + leafnum)
|
||||
maybe_needed = set()
|
||||
for hashnum in hashnums:
|
||||
maybe_needed.update(self.needed_for(hashnum))
|
||||
maybe_needed.add(0) # need the root too
|
||||
|
||||
def needed_hashes(self, leafnum, include_leaf=False):
|
||||
"""Which new hashes do I need to validate a given data block?
|
||||
|
||||
I am much like HashTree.needed_hashes(), except that I don't include
|
||||
hashes that I already know about. When needed_hashes() is called on
|
||||
an empty IncompleteHashTree, it will return the same set as a
|
||||
HashTree of the same size. But later, once hashes have been added
|
||||
with set_hashes(), I will ask for fewer hashes, since some of the
|
||||
necessary ones have already been set.
|
||||
"""
|
||||
|
||||
maybe_needed = set(self.needed_for(self.first_leaf_num + leafnum))
|
||||
if include_leaf:
|
||||
maybe_needed.add(self.first_leaf_num + leafnum)
|
||||
return set([i for i in maybe_needed if self[i] is None])
|
||||
|
||||
|
||||
def set_hashes(self, hashes={}, leaves={}, must_validate=False):
|
||||
def set_hashes(self, hashes={}, leaves={}):
|
||||
"""Add a bunch of hashes to the tree.
|
||||
|
||||
I will validate these to the best of my ability. If I already have a
|
||||
@ -273,15 +316,12 @@ class IncompleteHashTree(CompleteBinaryTreeMixin, list):
|
||||
before I was called. If I return successfully, I will remember all
|
||||
those hashes.
|
||||
|
||||
If every hash that was added was validated, I will return True. If
|
||||
some could not be validated because I did not have enough parent
|
||||
hashes, I will return False. As a result, if I am called with both a
|
||||
leaf hash and the root hash was already set, I will return True if
|
||||
and only if the leaf hash could be validated against the root.
|
||||
|
||||
If must_validate is True, I will raise NotEnoughHashesError instead
|
||||
of returning False. If I raise NotEnoughHashesError, I will forget
|
||||
about all the hashes that you tried to add. TODO: really?
|
||||
I insist upon being able to validate all of the hashes that were
|
||||
given to me. If I cannot do this because I'm missing some hashes, I
|
||||
will raise NotEnoughHashesError (and forget about all the hashes that
|
||||
you tried to add). Note that this means that the root hash must
|
||||
either be included in 'hashes', or it must have been provided at some
|
||||
point in the past.
|
||||
|
||||
'leaves' is a dictionary uses 'leaf index' values, which range from 0
|
||||
(the left-most leaf) to num_leaves-1 (the right-most leaf), and form
|
||||
@ -290,28 +330,42 @@ class IncompleteHashTree(CompleteBinaryTreeMixin, list):
|
||||
leaf). leaf[i] is the same as hash[num_leaves-1+i].
|
||||
|
||||
The best way to use me is to obtain the root hash from some 'good'
|
||||
channel, then call set_hash(0, root). Then use the 'bad' channel to
|
||||
obtain data block 0 and the corresponding hash chain (a dict with the
|
||||
same hashes that needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4,
|
||||
8:h8} when len(L)=8). Hash the data block to create leaf0. Then
|
||||
call::
|
||||
channel, and use the 'bad' channel to obtain data block 0 and the
|
||||
corresponding hash chain (a dict with the same hashes that
|
||||
needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4, 8:h8} when
|
||||
len(L)=8). Hash the data block to create leaf0, then feed everything
|
||||
into set_hashes() and see if it raises an exception or not::
|
||||
|
||||
good = iht.set_hashes(hashes=hashchain, leaves={0: leaf0})
|
||||
iht = IncompleteHashTree(numleaves)
|
||||
roothash = trusted_channel.get_roothash()
|
||||
otherhashes = untrusted_channel.get_hashes()
|
||||
# otherhashes.keys() should == iht.needed_hashes(leaves=[0])
|
||||
datablock0 = untrusted_channel.get_data(0)
|
||||
leaf0 = HASH(datablock0)
|
||||
# HASH() is probably hashutil.tagged_hash(tag, datablock0)
|
||||
hashes = otherhashes.copy()
|
||||
hashes[0] = roothash # from 'good' channel
|
||||
iht.set_hashes(hashes, leaves={0: leaf0})
|
||||
|
||||
If 'good' is True, the data block was valid. If 'good' is False, the
|
||||
hashchain did not have the right blocks and we don't know whether the
|
||||
data block was good or bad. If set_hashes() raises an exception,
|
||||
either the data was corrupted or one of the received hashes was
|
||||
corrupted.
|
||||
If the set_hashes() call doesn't raise an exception, the data block
|
||||
was valid. If it raises BadHashError, then either the data block was
|
||||
corrupted or one of the received hashes was corrupted.
|
||||
"""
|
||||
|
||||
assert isinstance(hashes, dict)
|
||||
for h in hashes.values():
|
||||
assert isinstance(h, str)
|
||||
assert isinstance(leaves, dict)
|
||||
for h in leaves.values():
|
||||
assert isinstance(h, str)
|
||||
new_hashes = hashes.copy()
|
||||
for leafnum,leafhash in leaves.iteritems():
|
||||
hashnum = self.first_leaf_num + leafnum
|
||||
if hashnum in new_hashes:
|
||||
assert new_hashes[hashnum] == leafhash
|
||||
if new_hashes[hashnum] != leafhash:
|
||||
raise BadHashError("got conflicting hashes in my "
|
||||
"arguments: leaves[%d] != hashes[%d]"
|
||||
% (leafnum, hashnum))
|
||||
new_hashes[hashnum] = leafhash
|
||||
|
||||
added = set() # we'll remove these if the check fails
|
||||
@ -374,15 +428,11 @@ class IncompleteHashTree(CompleteBinaryTreeMixin, list):
|
||||
# were we unable to validate any of the new hashes?
|
||||
unvalidated = set(new_hashes.keys()) - reachable
|
||||
if unvalidated:
|
||||
if must_validate:
|
||||
those = ",".join([str(i) for i in sorted(unvalidated)])
|
||||
raise NotEnoughHashesError("unable to validate hashes %s" % those)
|
||||
raise NotEnoughHashesError("unable to validate hashes %s"
|
||||
% those)
|
||||
|
||||
except (BadHashError, NotEnoughHashesError):
|
||||
for i in added:
|
||||
self[i] = None
|
||||
raise
|
||||
|
||||
# if there were hashes that could not be validated, we return False
|
||||
return not unvalidated
|
||||
|
||||
|
@ -13,15 +13,27 @@ def make_tree(numleaves):
|
||||
return ht
|
||||
|
||||
class Complete(unittest.TestCase):
|
||||
def testCreate(self):
|
||||
# try out various sizes
|
||||
def test_create(self):
|
||||
# try out various sizes, since we pad to a power of two
|
||||
ht = make_tree(6)
|
||||
ht = make_tree(8)
|
||||
ht = make_tree(9)
|
||||
ht = make_tree(8)
|
||||
root = ht[0]
|
||||
self.failUnlessEqual(len(root), 32)
|
||||
self.failUnlessEqual(ht.get_leaf(0), tagged_hash("tag", "0"))
|
||||
self.failUnlessRaises(IndexError, ht.get_leaf, 8)
|
||||
self.failUnlessEqual(ht.get_leaf_index(0), 7)
|
||||
|
||||
def testDump(self):
|
||||
def test_needed_hashes(self):
|
||||
ht = make_tree(8)
|
||||
self.failUnlessEqual(ht.needed_hashes(0), set([8, 4, 2]))
|
||||
self.failUnlessEqual(ht.needed_hashes(0, True), set([7, 8, 4, 2]))
|
||||
self.failUnlessEqual(ht.needed_hashes(1), set([7, 4, 2]))
|
||||
self.failUnlessEqual(ht.needed_hashes(7), set([13, 5, 1]))
|
||||
self.failUnlessEqual(ht.needed_hashes(7, False), set([13, 5, 1]))
|
||||
self.failUnlessEqual(ht.needed_hashes(7, True), set([14, 13, 5, 1]))
|
||||
|
||||
def test_dump(self):
|
||||
ht = make_tree(6)
|
||||
expected = [(0,0),
|
||||
(1,1), (3,2), (7,3), (8,3), (4,2), (9,3), (10,3),
|
||||
@ -39,7 +51,16 @@ class Complete(unittest.TestCase):
|
||||
|
||||
class Incomplete(unittest.TestCase):
|
||||
|
||||
def testCheck(self):
|
||||
def test_create(self):
|
||||
ht = hashtree.IncompleteHashTree(6)
|
||||
ht = hashtree.IncompleteHashTree(9)
|
||||
ht = hashtree.IncompleteHashTree(8)
|
||||
self.failUnlessEqual(ht[0], None)
|
||||
self.failUnlessEqual(ht.get_leaf(0), None)
|
||||
self.failUnlessRaises(IndexError, ht.get_leaf, 8)
|
||||
self.failUnlessEqual(ht.get_leaf_index(0), 7)
|
||||
|
||||
def test_check(self):
|
||||
# first create a complete hash tree
|
||||
ht = make_tree(6)
|
||||
# then create a corresponding incomplete tree
|
||||
@ -47,20 +68,23 @@ class Incomplete(unittest.TestCase):
|
||||
|
||||
# suppose we wanted to validate leaf[0]
|
||||
# leaf[0] is the same as node[7]
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[0]), set([8, 4, 2, 0]))
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[1]), set([7, 4, 2, 0]))
|
||||
iht.set_hashes({0: ht[0]}) # set the root
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[0]), set([8, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[1]), set([7, 4, 2]))
|
||||
iht.set_hashes({5: ht[5]})
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[0]), set([8, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[1]), set([7, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(0, True), set([7, 8, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(1), set([7, 4, 2]))
|
||||
iht[0] = ht[0] # set the root
|
||||
self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(1), set([7, 4, 2]))
|
||||
iht[5] = ht[5]
|
||||
self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2]))
|
||||
self.failUnlessEqual(iht.needed_hashes(1), set([7, 4, 2]))
|
||||
|
||||
# reset
|
||||
iht = hashtree.IncompleteHashTree(6)
|
||||
|
||||
current_hashes = list(iht)
|
||||
try:
|
||||
# this should fail because there aren't enough hashes known
|
||||
iht.set_hashes(leaves={0: tagged_hash("tag", "0")},
|
||||
must_validate=True)
|
||||
iht.set_hashes(leaves={0: tagged_hash("tag", "0")})
|
||||
except hashtree.NotEnoughHashesError:
|
||||
pass
|
||||
else:
|
||||
@ -68,48 +92,52 @@ class Incomplete(unittest.TestCase):
|
||||
|
||||
# and the set of hashes stored in the tree should still be the same
|
||||
self.failUnlessEqual(list(iht), current_hashes)
|
||||
# and we should still need the same
|
||||
self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2]))
|
||||
|
||||
# provide the missing hashes
|
||||
iht.set_hashes({2: ht[2], 4: ht[4], 8: ht[8]})
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[0]), set())
|
||||
|
||||
chain = {0: ht[0], 2: ht[2], 4: ht[4], 8: ht[8]}
|
||||
try:
|
||||
# this should fail because the hash is just plain wrong
|
||||
iht.set_hashes(leaves={0: tagged_hash("bad tag", "0")})
|
||||
# this should fail because the leaf hash is just plain wrong
|
||||
iht.set_hashes(chain, leaves={0: tagged_hash("bad tag", "0")})
|
||||
except hashtree.BadHashError:
|
||||
pass
|
||||
else:
|
||||
self.fail("didn't catch bad hash")
|
||||
|
||||
bad_chain = chain.copy()
|
||||
bad_chain[2] = ht[2] + "BOGUS"
|
||||
|
||||
# this should fail because the internal hash is wrong
|
||||
try:
|
||||
iht.set_hashes(bad_chain, leaves={0: tagged_hash("tag", "0")})
|
||||
except hashtree.BadHashError:
|
||||
pass
|
||||
else:
|
||||
self.fail("didn't catch bad hash")
|
||||
|
||||
# this should succeed
|
||||
iht.set_hashes(leaves={0: tagged_hash("tag", "0")})
|
||||
try:
|
||||
iht.set_hashes(chain, leaves={0: tagged_hash("tag", "0")})
|
||||
except hashtree.BadHashError, e:
|
||||
self.fail("bad hash: %s" % e)
|
||||
|
||||
try:
|
||||
self.failUnlessEqual(ht.get_leaf(0), tagged_hash("tag", "0"))
|
||||
self.failUnlessRaises(IndexError, ht.get_leaf, 8)
|
||||
|
||||
# this should succeed too
|
||||
try:
|
||||
iht.set_hashes(leaves={1: tagged_hash("tag", "1")})
|
||||
except hashtree.BadHashError:
|
||||
self.fail("bad hash")
|
||||
|
||||
# giving it a bad internal hash should also cause problems
|
||||
iht.set_hashes({13: tagged_hash("bad tag", "x")})
|
||||
try:
|
||||
iht.set_hashes({14: tagged_hash("tag", "14")})
|
||||
except hashtree.BadHashError:
|
||||
pass
|
||||
else:
|
||||
self.fail("didn't catch bad hash")
|
||||
# undo our damage
|
||||
iht[13] = None
|
||||
# now that leaves 0 and 1 are known, some of the internal nodes are
|
||||
# known
|
||||
self.failUnlessEqual(iht.needed_hashes(4), set([12, 6]))
|
||||
chain = {6: ht[6], 12: ht[12]}
|
||||
|
||||
self.failUnlessEqual(iht.needed_hashes(leaves=[4]), set([12, 6]))
|
||||
|
||||
iht.set_hashes({6: ht[6], 12: ht[12]})
|
||||
try:
|
||||
# this should succeed
|
||||
iht.set_hashes(leaves={4: tagged_hash("tag", "4")})
|
||||
try:
|
||||
iht.set_hashes(chain, leaves={4: tagged_hash("tag", "4")})
|
||||
except hashtree.BadHashError, e:
|
||||
self.fail("bad hash: %s" % e)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user